From ceb051af4ed29b70d58c3343211abeeaacef5f62 Mon Sep 17 00:00:00 2001 From: Anthony Minessale Date: Wed, 11 Sep 2019 15:51:47 +0000 Subject: [PATCH] [libvpx] Update to v1.8.1 from https://chromium.googlesource.com/webm/libvpx --- libs/libvpx/AUTHORS | 29 +- libs/libvpx/CHANGELOG | 61 +- libs/libvpx/README | 51 +- libs/libvpx/args.h | 6 +- libs/libvpx/build/.gitattributes | 2 - libs/libvpx/build/.gitignore | 1 - libs/libvpx/build/make/Android.mk | 37 +- libs/libvpx/build/make/Makefile | 1 + libs/libvpx/build/make/ads2gas.pl | 28 +- libs/libvpx/build/make/ads2gas_apple.pl | 4 +- libs/libvpx/build/make/configure.sh | 223 +- libs/libvpx/build/make/gen_msvs_vcxproj.sh | 15 + libs/libvpx/build/make/iosbuild.sh | 7 +- libs/libvpx/build/make/msvs_common.sh | 9 + libs/libvpx/build/make/rtcd.pl | 4 +- libs/libvpx/build/make/thumb.pm | 7 - libs/libvpx/codereview.settings | 5 +- libs/libvpx/configure | 94 +- libs/libvpx/examples.mk | 46 +- libs/libvpx/{vpx => examples}/svc_context.h | 24 +- .../{vpx/src => examples}/svc_encodeframe.c | 118 +- .../examples/vp8_multi_resolution_encoder.c | 8 +- .../libvpx/examples/vp9_spatial_svc_encoder.c | 739 +- libs/libvpx/examples/vp9cx_set_ref.c | 122 - libs/libvpx/examples/vpx_dec_fuzzer.cc | 118 + .../examples/vpx_temporal_svc_encoder.c | 165 +- libs/libvpx/ivfdec.c | 8 +- libs/libvpx/ivfdec.h | 6 +- libs/libvpx/ivfenc.h | 6 +- libs/libvpx/libs.doxy_template | 12 - libs/libvpx/libs.mk | 51 +- libs/libvpx/mainpage.dox | 2 + libs/libvpx/md5_utils.c | 2 +- libs/libvpx/md5_utils.h | 6 +- libs/libvpx/rate_hist.h | 6 +- libs/libvpx/test/acm_random.h | 24 +- libs/libvpx/test/active_map_refresh_test.cc | 2 +- libs/libvpx/test/active_map_test.cc | 2 +- libs/libvpx/test/add_noise_test.cc | 38 +- libs/libvpx/test/alt_ref_aq_segment_test.cc | 2 +- libs/libvpx/test/altref_test.cc | 2 +- libs/libvpx/test/android/README | 4 +- libs/libvpx/test/aq_segment_test.cc | 2 +- libs/libvpx/test/avg_test.cc | 254 +- libs/libvpx/test/bench.cc | 38 + libs/libvpx/test/bench.h | 30 + libs/libvpx/test/blockiness_test.cc | 16 +- libs/libvpx/test/borders_test.cc | 2 +- libs/libvpx/test/buffer.h | 6 +- libs/libvpx/test/byte_alignment_test.cc | 3 +- libs/libvpx/test/clear_system_state.h | 16 +- libs/libvpx/test/codec_factory.h | 17 +- libs/libvpx/test/comp_avg_pred_test.cc | 4 + libs/libvpx/test/consistency_test.cc | 11 +- libs/libvpx/test/convolve_test.cc | 106 +- libs/libvpx/test/cpu_speed_test.cc | 4 +- libs/libvpx/test/cq_test.cc | 2 +- libs/libvpx/test/datarate_test.cc | 1876 ---- libs/libvpx/test/dct16x16_test.cc | 10 +- libs/libvpx/test/dct32x32_test.cc | 30 +- libs/libvpx/test/dct_partial_test.cc | 48 +- libs/libvpx/test/dct_test.cc | 917 +- libs/libvpx/test/decode_api_test.cc | 52 +- libs/libvpx/test/decode_corrupted.cc | 103 + libs/libvpx/test/decode_perf_test.cc | 8 +- libs/libvpx/test/decode_svc_test.cc | 9 +- libs/libvpx/test/decode_test_driver.cc | 5 +- libs/libvpx/test/decode_test_driver.h | 6 +- libs/libvpx/test/encode_perf_test.cc | 2 +- libs/libvpx/test/encode_test_driver.cc | 8 +- libs/libvpx/test/encode_test_driver.h | 28 +- .../libvpx/test/external_frame_buffer_test.cc | 12 +- libs/libvpx/test/fdct8x8_test.cc | 10 +- libs/libvpx/test/frame_size_tests.cc | 2 +- libs/libvpx/test/hadamard_test.cc | 369 +- libs/libvpx/test/i420_video_source.h | 6 +- libs/libvpx/test/idct_test.cc | 4 +- libs/libvpx/test/invalid_file_test.cc | 7 +- libs/libvpx/test/ivf_video_source.h | 8 +- libs/libvpx/test/keyframe_test.cc | 9 +- libs/libvpx/test/lpf_test.cc | 7 +- libs/libvpx/test/md5_helper.h 
| 6 +- libs/libvpx/test/partial_idct_test.cc | 8 +- libs/libvpx/test/pp_filter_test.cc | 352 +- libs/libvpx/test/predict_test.cc | 43 +- libs/libvpx/test/quantize_test.cc | 22 +- libs/libvpx/test/register_state_check.h | 10 +- libs/libvpx/test/resize_test.cc | 40 + libs/libvpx/test/sad_test.cc | 61 +- libs/libvpx/test/stress.sh | 36 +- libs/libvpx/test/sum_squares_test.cc | 19 +- libs/libvpx/test/superframe_test.cc | 8 +- libs/libvpx/test/svc_datarate_test.cc | 1428 +++ libs/libvpx/test/svc_end_to_end_test.cc | 481 + libs/libvpx/test/svc_test.cc | 871 +- libs/libvpx/test/svc_test.h | 67 + libs/libvpx/test/temporal_filter_test.cc | 277 - libs/libvpx/test/test-data.mk | 23 +- libs/libvpx/test/test-data.sha1 | 23 +- libs/libvpx/test/test.mk | 15 +- libs/libvpx/test/test_intra_pred_speed.cc | 3 + libs/libvpx/test/test_libvpx.cc | 1 - libs/libvpx/test/test_vector_test.cc | 52 +- libs/libvpx/test/test_vectors.h | 6 +- libs/libvpx/test/tile_independence_test.cc | 2 +- libs/libvpx/test/timestamp_test.cc | 109 + libs/libvpx/test/tools_common.sh | 6 +- libs/libvpx/test/user_priv_test.cc | 4 +- libs/libvpx/test/util.h | 10 +- libs/libvpx/test/variance_test.cc | 80 +- libs/libvpx/test/video_source.h | 6 +- libs/libvpx/test/vp8_datarate_test.cc | 416 + .../test/vp8_multi_resolution_encoder.sh | 22 +- libs/libvpx/test/vp9_arf_freq_test.cc | 4 +- libs/libvpx/test/vp9_block_error_test.cc | 5 +- libs/libvpx/test/vp9_datarate_test.cc | 901 ++ libs/libvpx/test/vp9_denoiser_test.cc | 5 +- .../test/vp9_encoder_parms_get_to_decoder.cc | 6 +- libs/libvpx/test/vp9_end_to_end_test.cc | 175 +- libs/libvpx/test/vp9_ethread_test.cc | 4 +- libs/libvpx/test/vp9_intrapred_test.cc | 89 +- libs/libvpx/test/vp9_lossless_test.cc | 2 +- libs/libvpx/test/vp9_motion_vector_test.cc | 8 +- libs/libvpx/test/vp9_quantize_test.cc | 319 +- libs/libvpx/test/vp9_scale_test.cc | 91 +- libs/libvpx/test/vp9_spatial_svc_encoder.sh | 72 - libs/libvpx/test/vp9_subtract_test.cc | 110 +- libs/libvpx/test/vp9_thread_test.cc | 3 +- libs/libvpx/test/vpx_scale_test.cc | 24 +- libs/libvpx/test/vpx_scale_test.h | 9 +- libs/libvpx/test/vpx_temporal_svc_encoder.sh | 55 +- libs/libvpx/test/vpxdec.sh | 37 +- libs/libvpx/test/vpxenc.sh | 143 +- libs/libvpx/test/webm_video_source.h | 6 +- libs/libvpx/test/y4m_test.cc | 24 +- libs/libvpx/test/y4m_video_source.h | 9 +- libs/libvpx/test/yuv_temporal_filter_test.cc | 708 ++ libs/libvpx/test/yuv_video_source.h | 6 +- .../third_party/googletest/README.libvpx | 14 +- .../third_party/googletest/src/README.md | 401 +- .../src/include/gtest/gtest-death-test.h | 66 +- .../src/include/gtest/gtest-message.h | 13 +- .../src/include/gtest/gtest-param-test.h | 34 +- .../src/include/gtest/gtest-param-test.h.pump | 28 +- .../src/include/gtest/gtest-printers.h | 230 +- .../googletest/src/include/gtest/gtest-spi.h | 15 +- .../src/include/gtest/gtest-test-part.h | 10 +- .../src/include/gtest/gtest-typed-test.h | 115 +- .../googletest/src/include/gtest/gtest.h | 189 +- .../src/include/gtest/gtest_pred_impl.h | 15 +- .../googletest/src/include/gtest/gtest_prod.h | 17 +- .../include/gtest/internal/custom/README.md | 56 + .../gtest/internal/custom/gtest-port.h | 34 +- .../gtest/internal/custom/gtest-printers.h | 4 +- .../src/include/gtest/internal/custom/gtest.h | 6 +- .../internal/gtest-death-test-internal.h | 77 +- .../include/gtest/internal/gtest-filepath.h | 11 +- .../include/gtest/internal/gtest-internal.h | 252 +- .../include/gtest/internal/gtest-linked_ptr.h | 6 +- .../internal/gtest-param-util-generated.h | 492 +- 
.../gtest-param-util-generated.h.pump | 20 +- .../include/gtest/internal/gtest-param-util.h | 31 +- .../include/gtest/internal/gtest-port-arch.h | 9 +- .../src/include/gtest/internal/gtest-port.h | 395 +- .../src/include/gtest/internal/gtest-string.h | 8 +- .../src/include/gtest/internal/gtest-tuple.h | 7 +- .../include/gtest/internal/gtest-tuple.h.pump | 7 +- .../include/gtest/internal/gtest-type-util.h | 23 +- .../gtest/internal/gtest-type-util.h.pump | 23 +- .../googletest/src/src/gtest-all.cc | 5 +- .../googletest/src/src/gtest-death-test.cc | 309 +- .../googletest/src/src/gtest-filepath.cc | 16 +- .../googletest/src/src/gtest-internal-inl.h | 72 +- .../googletest/src/src/gtest-port.cc | 213 +- .../googletest/src/src/gtest-printers.cc | 108 +- .../googletest/src/src/gtest-test-part.cc | 13 +- .../googletest/src/src/gtest-typed-test.cc | 4 +- .../third_party/googletest/src/src/gtest.cc | 1084 +- .../googletest/src/src/gtest_main.cc | 3 +- libs/libvpx/third_party/libwebm/Android.mk | 2 +- libs/libvpx/third_party/libwebm/README.libvpx | 14 +- .../third_party/libwebm/common/file_util.cc | 19 +- .../third_party/libwebm/common/file_util.h | 5 +- .../third_party/libwebm/common/hdr_util.cc | 8 +- .../third_party/libwebm/common/hdr_util.h | 10 +- .../third_party/libwebm/common/webmids.h | 1 + .../third_party/libwebm/mkvmuxer/mkvmuxer.cc | 77 +- .../third_party/libwebm/mkvmuxer/mkvmuxer.h | 5 +- .../libwebm/mkvmuxer/mkvmuxerutil.cc | 13 +- .../libwebm/mkvmuxer/mkvmuxerutil.h | 3 + .../third_party/libwebm/mkvmuxer/mkvwriter.cc | 2 + .../libwebm/mkvparser/mkvparser.cc | 73 +- .../third_party/libwebm/mkvparser/mkvparser.h | 6 +- .../libwebm/mkvparser/mkvreader.cc | 2 + libs/libvpx/third_party/libyuv/LICENSE | 29 + libs/libvpx/third_party/libyuv/README.libvpx | 23 +- .../libyuv/include/libyuv/basic_types.h | 109 +- .../libyuv/include/libyuv/compare.h | 93 +- .../libyuv/include/libyuv/convert.h | 421 +- .../libyuv/include/libyuv/convert_argb.h | 676 +- .../libyuv/include/libyuv/convert_from.h | 377 +- .../libyuv/include/libyuv/convert_from_argb.h | 283 +- .../libyuv/include/libyuv/cpu_id.h | 75 +- .../libyuv/include/libyuv/macros_msa.h | 233 + .../libyuv/include/libyuv/mjpeg_decoder.h | 33 +- .../libyuv/include/libyuv/planar_functions.h | 1248 +- .../libyuv/include/libyuv/rotate.h | 143 +- .../libyuv/include/libyuv/rotate_argb.h | 14 +- .../libyuv/include/libyuv/rotate_row.h | 203 +- .../third_party/libyuv/include/libyuv/row.h | 4065 ++++--- .../third_party/libyuv/include/libyuv/scale.h | 110 +- .../libyuv/include/libyuv/scale_argb.h | 60 +- .../libyuv/include/libyuv/scale_row.h | 1083 +- .../libyuv/include/libyuv/version.h | 6 +- .../libyuv/include/libyuv/video_common.h | 52 +- .../third_party/libyuv/source/compare.cc | 267 +- .../libyuv/source/compare_common.cc | 70 +- .../third_party/libyuv/source/compare_gcc.cc | 427 +- .../third_party/libyuv/source/compare_msa.cc | 97 + .../third_party/libyuv/source/compare_neon.cc | 94 +- .../libyuv/source/compare_neon64.cc | 88 +- .../third_party/libyuv/source/compare_win.cc | 119 +- .../third_party/libyuv/source/convert.cc | 963 +- .../third_party/libyuv/source/convert_argb.cc | 1777 ++- .../third_party/libyuv/source/convert_from.cc | 1165 +- .../libyuv/source/convert_from_argb.cc | 839 +- .../third_party/libyuv/source/convert_jpeg.cc | 243 +- .../libyuv/source/convert_to_argb.cc | 246 +- .../libyuv/source/convert_to_i420.cc | 302 +- .../third_party/libyuv/source/cpu_id.cc | 208 +- .../libyuv/source/mjpeg_decoder.cc | 126 +- 
.../libyuv/source/mjpeg_validate.cc | 11 +- .../libyuv/source/planar_functions.cc | 1876 +++- .../third_party/libyuv/source/rotate.cc | 377 +- .../third_party/libyuv/source/rotate_any.cc | 57 +- .../third_party/libyuv/source/rotate_argb.cc | 163 +- .../libyuv/source/rotate_common.cc | 40 +- .../third_party/libyuv/source/rotate_gcc.cc | 660 +- .../third_party/libyuv/source/rotate_mips.cc | 484 - .../third_party/libyuv/source/rotate_msa.cc | 250 + .../third_party/libyuv/source/rotate_neon.cc | 567 +- .../libyuv/source/rotate_neon64.cc | 685 +- .../third_party/libyuv/source/rotate_win.cc | 51 +- .../third_party/libyuv/source/row_any.cc | 937 +- .../third_party/libyuv/source/row_common.cc | 2514 +++-- .../third_party/libyuv/source/row_gcc.cc | 9987 +++++++++-------- .../third_party/libyuv/source/row_mips.cc | 782 -- .../third_party/libyuv/source/row_msa.cc | 3512 ++++++ .../third_party/libyuv/source/row_neon.cc | 4374 ++++---- .../third_party/libyuv/source/row_neon64.cc | 4147 +++---- .../third_party/libyuv/source/row_win.cc | 3943 ++++--- .../libvpx/third_party/libyuv/source/scale.cc | 987 +- .../third_party/libyuv/source/scale_any.cc | 489 +- .../third_party/libyuv/source/scale_argb.cc | 573 +- .../third_party/libyuv/source/scale_common.cc | 808 +- .../third_party/libyuv/source/scale_gcc.cc | 2280 ++-- .../third_party/libyuv/source/scale_mips.cc | 644 -- .../third_party/libyuv/source/scale_msa.cc | 949 ++ .../third_party/libyuv/source/scale_neon.cc | 1453 ++- .../third_party/libyuv/source/scale_neon64.cc | 1582 +-- .../third_party/libyuv/source/scale_win.cc | 861 +- .../third_party/libyuv/source/video_common.cc | 51 +- .../tools/3D-Reconstruction/genY4M/genY4M.py | 76 + .../sketch_3D_reconstruction/BVH.pde | 163 + .../sketch_3D_reconstruction/Camera.pde | 138 + .../sketch_3D_reconstruction/MotionField.pde | 94 + .../sketch_3D_reconstruction/PointCloud.pde | 138 + .../sketch_3D_reconstruction/Ray_Tracing.pde | 61 + .../sketch_3D_reconstruction/Scene.pde | 59 + .../sketch_3D_reconstruction/Transform.pde | 82 + .../sketch_3D_reconstruction/Util.pde | 28 + .../sketch_3D_reconstruction.pde | 74 + .../tools/non_greedy_mv/non_greedy_mv.py | 186 + libs/libvpx/tools/set_analyzer_env.sh | 142 + libs/libvpx/tools/tiny_ssim.c | 452 +- libs/libvpx/tools_common.c | 314 +- libs/libvpx/tools_common.h | 26 +- libs/libvpx/usage_cx.dox | 2 + libs/libvpx/usage_dx.dox | 2 + libs/libvpx/video_common.h | 6 +- libs/libvpx/video_reader.c | 32 +- libs/libvpx/video_reader.h | 6 +- libs/libvpx/video_writer.c | 14 +- libs/libvpx/video_writer.h | 6 +- libs/libvpx/vp8/common/alloccommon.h | 8 +- libs/libvpx/vp8/common/arm/loopfilter_arm.c | 22 +- libs/libvpx/vp8/common/arm/loopfilter_arm.h | 31 + .../common/arm/neon/bilinearpredict_neon.c | 2 + .../libvpx/vp8/common/arm/neon/copymem_neon.c | 2 + .../vp8/common/arm/neon/dequantizeb_neon.c | 1 + .../vp8/common/arm/neon/idct_blk_neon.c | 251 +- .../common/arm/neon/idct_dequant_0_2x_neon.c | 59 - .../arm/neon/idct_dequant_full_2x_neon.c | 182 - libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c | 2 + .../loopfiltersimplehorizontaledge_neon.c | 2 + .../neon/loopfiltersimpleverticaledge_neon.c | 2 + .../vp8/common/arm/neon/mbloopfilter_neon.c | 2 + .../vp8/common/arm/neon/sixtappredict_neon.c | 1 + .../vp8/common/arm/neon/vp8_loopfilter_neon.c | 2 + libs/libvpx/vp8/common/blockd.c | 12 +- libs/libvpx/vp8/common/blockd.h | 21 +- libs/libvpx/vp8/common/coefupdateprobs.h | 6 +- libs/libvpx/vp8/common/common.h | 18 +- libs/libvpx/vp8/common/default_coef_probs.h | 8 +- 
libs/libvpx/vp8/common/entropy.c | 18 +- libs/libvpx/vp8/common/entropy.h | 6 +- libs/libvpx/vp8/common/entropymode.c | 10 +- libs/libvpx/vp8/common/entropymode.h | 6 +- libs/libvpx/vp8/common/entropymv.h | 6 +- libs/libvpx/vp8/common/extend.c | 3 +- libs/libvpx/vp8/common/extend.h | 6 +- libs/libvpx/vp8/common/filter.h | 6 +- libs/libvpx/vp8/common/findnearmv.c | 28 +- libs/libvpx/vp8/common/findnearmv.h | 8 +- libs/libvpx/vp8/common/header.h | 6 +- libs/libvpx/vp8/common/idct_blk.c | 26 +- libs/libvpx/vp8/common/invtrans.h | 6 +- libs/libvpx/vp8/common/loopfilter.h | 6 +- libs/libvpx/vp8/common/loopfilter_filters.c | 22 +- libs/libvpx/vp8/common/mfqe.c | 4 +- .../vp8/common/mips/dspr2/idct_blk_dspr2.c | 20 +- .../mips/dspr2/vp8_loopfilter_filters_dspr2.c | 12 +- .../libvpx/vp8/common/mips/mmi/idct_blk_mmi.c | 23 +- libs/libvpx/vp8/common/mips/msa/idct_msa.c | 58 +- .../vp8/common/mips/msa/vp8_macros_msa.h | 6 +- libs/libvpx/vp8/common/modecont.c | 36 +- libs/libvpx/vp8/common/modecont.h | 6 +- libs/libvpx/vp8/common/mv.h | 6 +- libs/libvpx/vp8/common/onyx.h | 34 +- libs/libvpx/vp8/common/onyxc_int.h | 6 +- libs/libvpx/vp8/common/onyxd.h | 18 +- libs/libvpx/vp8/common/postproc.c | 122 +- libs/libvpx/vp8/common/postproc.h | 12 +- libs/libvpx/vp8/common/ppflags.h | 6 +- libs/libvpx/vp8/common/quant_common.h | 6 +- libs/libvpx/vp8/common/reconinter.c | 7 + libs/libvpx/vp8/common/reconinter.h | 29 +- libs/libvpx/vp8/common/reconintra.h | 6 +- libs/libvpx/vp8/common/reconintra4x4.h | 8 +- libs/libvpx/vp8/common/rtcd_defs.pl | 68 +- libs/libvpx/vp8/common/setupintrarecon.h | 6 +- libs/libvpx/vp8/common/swapyv12buffer.h | 6 +- libs/libvpx/vp8/common/systemdependent.h | 6 +- libs/libvpx/vp8/common/threading.h | 16 +- libs/libvpx/vp8/common/treecoder.c | 9 +- libs/libvpx/vp8/common/treecoder.h | 8 +- libs/libvpx/vp8/common/vp8_entropymodedata.h | 8 +- libs/libvpx/vp8/common/vp8_skin_detection.h | 6 +- .../vp8/common/x86/bilinear_filter_sse2.c | 336 + libs/libvpx/vp8/common/x86/filter_x86.c | 29 - libs/libvpx/vp8/common/x86/filter_x86.h | 33 - libs/libvpx/vp8/common/x86/idct_blk_sse2.c | 24 +- libs/libvpx/vp8/common/x86/iwalsh_sse2.asm | 2 +- libs/libvpx/vp8/common/x86/subpixel_mmx.asm | 276 - libs/libvpx/vp8/common/x86/subpixel_sse2.asm | 414 - libs/libvpx/vp8/common/x86/vp8_asm_stubs.c | 13 +- libs/libvpx/vp8/decoder/dboolhuff.h | 8 +- libs/libvpx/vp8/decoder/decodeframe.c | 20 +- libs/libvpx/vp8/decoder/decodemv.h | 6 +- libs/libvpx/vp8/decoder/decoderthreading.h | 8 +- libs/libvpx/vp8/decoder/detokenize.h | 6 +- libs/libvpx/vp8/decoder/ec_types.h | 10 +- libs/libvpx/vp8/decoder/error_concealment.c | 10 +- libs/libvpx/vp8/decoder/error_concealment.h | 6 +- libs/libvpx/vp8/decoder/onyxd_if.c | 29 +- libs/libvpx/vp8/decoder/onyxd_int.h | 26 +- libs/libvpx/vp8/decoder/threading.c | 97 +- libs/libvpx/vp8/decoder/treereader.h | 8 +- .../vp8/encoder/arm/neon/fastquantizeb_neon.c | 12 +- .../vp8/encoder/arm/neon/shortfdct_neon.c | 2 + .../encoder/arm/neon/vp8_shortwalsh4x4_neon.c | 2 + libs/libvpx/vp8/encoder/bitstream.c | 157 +- libs/libvpx/vp8/encoder/bitstream.h | 6 +- libs/libvpx/vp8/encoder/block.h | 6 +- libs/libvpx/vp8/encoder/boolhuff.c | 26 +- libs/libvpx/vp8/encoder/boolhuff.h | 67 +- libs/libvpx/vp8/{common => encoder}/copy_c.c | 0 libs/libvpx/vp8/encoder/dct_value_cost.h | 6 +- libs/libvpx/vp8/encoder/dct_value_tokens.h | 6 +- libs/libvpx/vp8/encoder/defaultcoefcounts.h | 6 +- libs/libvpx/vp8/encoder/denoising.c | 47 +- libs/libvpx/vp8/encoder/denoising.h | 6 +- 
libs/libvpx/vp8/encoder/encodeframe.c | 6 +- libs/libvpx/vp8/encoder/encodeframe.h | 6 +- libs/libvpx/vp8/encoder/encodeintra.h | 6 +- libs/libvpx/vp8/encoder/encodemb.h | 6 +- libs/libvpx/vp8/encoder/encodemv.c | 11 - libs/libvpx/vp8/encoder/encodemv.h | 6 +- libs/libvpx/vp8/encoder/ethreading.h | 6 +- libs/libvpx/vp8/encoder/firstpass.c | 34 +- libs/libvpx/vp8/encoder/firstpass.h | 6 +- libs/libvpx/vp8/encoder/lookahead.h | 8 +- libs/libvpx/vp8/encoder/mcomp.c | 123 +- libs/libvpx/vp8/encoder/mcomp.h | 34 +- libs/libvpx/vp8/encoder/modecosts.h | 8 +- libs/libvpx/vp8/encoder/mr_dissim.h | 6 +- libs/libvpx/vp8/encoder/onyx_if.c | 133 +- libs/libvpx/vp8/encoder/onyx_int.h | 21 +- libs/libvpx/vp8/encoder/pickinter.c | 38 +- libs/libvpx/vp8/encoder/pickinter.h | 6 +- libs/libvpx/vp8/encoder/picklpf.h | 6 +- libs/libvpx/vp8/encoder/quantize.h | 6 +- libs/libvpx/vp8/encoder/ratectrl.c | 26 +- libs/libvpx/vp8/encoder/ratectrl.h | 6 +- libs/libvpx/vp8/encoder/rdopt.c | 33 +- libs/libvpx/vp8/encoder/rdopt.h | 24 +- libs/libvpx/vp8/encoder/segmentation.h | 6 +- libs/libvpx/vp8/encoder/temporal_filter.c | 1 + libs/libvpx/vp8/encoder/temporal_filter.h | 6 +- libs/libvpx/vp8/encoder/tokenize.c | 70 - libs/libvpx/vp8/encoder/tokenize.h | 14 +- libs/libvpx/vp8/encoder/treewriter.h | 16 +- .../{encodeopt.asm => block_error_sse2.asm} | 0 .../vp8/{common => encoder}/x86/copy_sse2.asm | 0 .../vp8/{common => encoder}/x86/copy_sse3.asm | 0 libs/libvpx/vp8/encoder/x86/quantize_sse4.c | 49 +- .../vp8/encoder/x86/vp8_quantize_ssse3.c | 6 +- libs/libvpx/vp8/vp8_common.mk | 9 +- libs/libvpx/vp8/vp8_cx_iface.c | 130 +- libs/libvpx/vp8/vp8_dx_iface.c | 67 +- libs/libvpx/vp8/vp8cx.mk | 5 +- .../arm/neon/vp9_highbd_iht16x16_add_neon.c | 446 + .../arm/neon/vp9_highbd_iht4x4_add_neon.c | 181 + .../arm/neon/vp9_highbd_iht8x8_add_neon.c | 345 + .../common/arm/neon/vp9_iht16x16_add_neon.c | 279 + .../vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 229 +- .../vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 542 +- .../libvpx/vp9/common/arm/neon/vp9_iht_neon.h | 272 + .../vp9/common/mips/msa/vp9_idct16x16_msa.c | 1 + .../vp9/common/mips/msa/vp9_idct4x4_msa.c | 1 + .../vp9/common/mips/msa/vp9_idct8x8_msa.c | 1 + libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c | 116 + libs/libvpx/vp9/common/vp9_alloccommon.h | 8 +- libs/libvpx/vp9/common/vp9_blockd.h | 37 +- libs/libvpx/vp9/common/vp9_common.h | 24 +- libs/libvpx/vp9/common/vp9_common_data.c | 2 +- libs/libvpx/vp9/common/vp9_common_data.h | 6 +- libs/libvpx/vp9/common/vp9_entropy.c | 2 + libs/libvpx/vp9/common/vp9_entropy.h | 7 +- libs/libvpx/vp9/common/vp9_entropymode.c | 65 +- libs/libvpx/vp9/common/vp9_entropymode.h | 6 +- libs/libvpx/vp9/common/vp9_entropymv.c | 4 +- libs/libvpx/vp9/common/vp9_entropymv.h | 10 +- libs/libvpx/vp9/common/vp9_enums.h | 8 +- libs/libvpx/vp9/common/vp9_filter.c | 18 +- libs/libvpx/vp9/common/vp9_filter.h | 9 +- libs/libvpx/vp9/common/vp9_frame_buffers.h | 6 +- libs/libvpx/vp9/common/vp9_idct.h | 6 +- libs/libvpx/vp9/common/vp9_loopfilter.c | 24 +- libs/libvpx/vp9/common/vp9_loopfilter.h | 10 +- libs/libvpx/vp9/common/vp9_mfqe.h | 6 +- libs/libvpx/vp9/common/vp9_mv.h | 6 +- libs/libvpx/vp9/common/vp9_mvref_common.h | 10 +- libs/libvpx/vp9/common/vp9_onyxc_int.h | 24 +- libs/libvpx/vp9/common/vp9_postproc.c | 4 +- libs/libvpx/vp9/common/vp9_postproc.h | 8 +- libs/libvpx/vp9/common/vp9_ppflags.h | 6 +- libs/libvpx/vp9/common/vp9_pred_common.c | 31 +- libs/libvpx/vp9/common/vp9_pred_common.h | 16 +- libs/libvpx/vp9/common/vp9_quant_common.h | 6 +- 
libs/libvpx/vp9/common/vp9_reconinter.c | 20 +- libs/libvpx/vp9/common/vp9_reconinter.h | 19 +- libs/libvpx/vp9/common/vp9_reconintra.h | 6 +- libs/libvpx/vp9/common/vp9_rtcd_defs.pl | 45 +- libs/libvpx/vp9/common/vp9_scale.h | 10 +- libs/libvpx/vp9/common/vp9_scan.h | 6 +- libs/libvpx/vp9/common/vp9_seg_common.h | 6 +- libs/libvpx/vp9/common/vp9_thread_common.c | 225 +- libs/libvpx/vp9/common/vp9_thread_common.h | 32 +- libs/libvpx/vp9/common/vp9_tile_common.h | 6 +- .../common/x86/vp9_highbd_iht16x16_add_sse4.c | 419 + .../common/x86/vp9_highbd_iht4x4_add_sse4.c | 131 + .../common/x86/vp9_highbd_iht8x8_add_sse4.c | 255 + .../vp9/common/x86/vp9_idct_intrin_sse2.c | 40 +- libs/libvpx/vp9/decoder/vp9_decodeframe.c | 1089 +- libs/libvpx/vp9/decoder/vp9_decodeframe.h | 6 +- libs/libvpx/vp9/decoder/vp9_decodemv.c | 2 +- libs/libvpx/vp9/decoder/vp9_decodemv.h | 6 +- libs/libvpx/vp9/decoder/vp9_decoder.c | 181 +- libs/libvpx/vp9/decoder/vp9_decoder.h | 64 +- libs/libvpx/vp9/decoder/vp9_detokenize.c | 41 + libs/libvpx/vp9/decoder/vp9_detokenize.h | 6 +- libs/libvpx/vp9/decoder/vp9_dsubexp.h | 6 +- libs/libvpx/vp9/decoder/vp9_job_queue.c | 124 + libs/libvpx/vp9/decoder/vp9_job_queue.h | 45 + .../vp9/encoder/arm/neon/vp9_dct_neon.c | 35 - .../vp9/encoder/arm/neon/vp9_quantize_neon.c | 26 +- .../vp9/encoder/mips/msa/vp9_error_msa.c | 3 + .../vp9/encoder/mips/msa/vp9_fdct16x16_msa.c | 1 + .../vp9/encoder/mips/msa/vp9_fdct4x4_msa.c | 1 + .../vp9/encoder/mips/msa/vp9_fdct8x8_msa.c | 1 + .../vp9/encoder/mips/msa/vp9_fdct_msa.h | 6 +- .../libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c | 292 + libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h | 6 +- libs/libvpx/vp9/encoder/vp9_aq_360.h | 6 +- libs/libvpx/vp9/encoder/vp9_aq_complexity.h | 6 +- .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 145 +- .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 14 +- libs/libvpx/vp9/encoder/vp9_aq_variance.c | 56 +- libs/libvpx/vp9/encoder/vp9_aq_variance.h | 10 +- libs/libvpx/vp9/encoder/vp9_bitstream.c | 83 +- libs/libvpx/vp9/encoder/vp9_bitstream.h | 14 +- libs/libvpx/vp9/encoder/vp9_block.h | 21 +- libs/libvpx/vp9/encoder/vp9_blockiness.c | 1 + libs/libvpx/vp9/encoder/vp9_blockiness.h | 26 + libs/libvpx/vp9/encoder/vp9_context_tree.c | 28 +- libs/libvpx/vp9/encoder/vp9_context_tree.h | 12 +- libs/libvpx/vp9/encoder/vp9_cost.h | 6 +- libs/libvpx/vp9/encoder/vp9_dct.c | 103 - libs/libvpx/vp9/encoder/vp9_denoiser.c | 231 +- libs/libvpx/vp9/encoder/vp9_denoiser.h | 30 +- libs/libvpx/vp9/encoder/vp9_encodeframe.c | 1971 +++- libs/libvpx/vp9/encoder/vp9_encodeframe.h | 11 +- libs/libvpx/vp9/encoder/vp9_encodemb.c | 174 +- libs/libvpx/vp9/encoder/vp9_encodemb.h | 14 +- libs/libvpx/vp9/encoder/vp9_encodemv.h | 8 +- libs/libvpx/vp9/encoder/vp9_encoder.c | 2815 ++++- libs/libvpx/vp9/encoder/vp9_encoder.h | 235 +- libs/libvpx/vp9/encoder/vp9_ethread.c | 69 +- libs/libvpx/vp9/encoder/vp9_ethread.h | 10 +- libs/libvpx/vp9/encoder/vp9_extend.h | 6 +- libs/libvpx/vp9/encoder/vp9_firstpass.c | 1147 +- libs/libvpx/vp9/encoder/vp9_firstpass.h | 45 +- libs/libvpx/vp9/encoder/vp9_job_queue.h | 6 +- libs/libvpx/vp9/encoder/vp9_lookahead.h | 10 +- libs/libvpx/vp9/encoder/vp9_mbgraph.c | 5 +- libs/libvpx/vp9/encoder/vp9_mbgraph.h | 10 +- libs/libvpx/vp9/encoder/vp9_mcomp.c | 1021 +- libs/libvpx/vp9/encoder/vp9_mcomp.h | 76 +- libs/libvpx/vp9/encoder/vp9_multi_thread.c | 50 +- libs/libvpx/vp9/encoder/vp9_multi_thread.h | 9 +- libs/libvpx/vp9/encoder/vp9_noise_estimate.c | 133 +- libs/libvpx/vp9/encoder/vp9_noise_estimate.h | 9 +- 
.../libvpx/vp9/encoder/vp9_partition_models.h | 975 ++ libs/libvpx/vp9/encoder/vp9_picklpf.c | 30 +- libs/libvpx/vp9/encoder/vp9_picklpf.h | 6 +- libs/libvpx/vp9/encoder/vp9_pickmode.c | 1103 +- libs/libvpx/vp9/encoder/vp9_pickmode.h | 6 +- libs/libvpx/vp9/encoder/vp9_quantize.c | 26 +- libs/libvpx/vp9/encoder/vp9_quantize.h | 6 +- libs/libvpx/vp9/encoder/vp9_ratectrl.c | 1275 ++- libs/libvpx/vp9/encoder/vp9_ratectrl.h | 48 +- libs/libvpx/vp9/encoder/vp9_rd.c | 169 +- libs/libvpx/vp9/encoder/vp9_rd.h | 42 +- libs/libvpx/vp9/encoder/vp9_rdopt.c | 657 +- libs/libvpx/vp9/encoder/vp9_rdopt.h | 10 +- libs/libvpx/vp9/encoder/vp9_resize.c | 16 +- libs/libvpx/vp9/encoder/vp9_resize.h | 6 +- libs/libvpx/vp9/encoder/vp9_segmentation.c | 54 + libs/libvpx/vp9/encoder/vp9_segmentation.h | 11 +- libs/libvpx/vp9/encoder/vp9_skin_detection.h | 6 +- libs/libvpx/vp9/encoder/vp9_speed_features.c | 330 +- libs/libvpx/vp9/encoder/vp9_speed_features.h | 148 +- libs/libvpx/vp9/encoder/vp9_subexp.c | 1 + libs/libvpx/vp9/encoder/vp9_subexp.h | 6 +- .../libvpx/vp9/encoder/vp9_svc_layercontext.c | 788 +- .../libvpx/vp9/encoder/vp9_svc_layercontext.h | 132 +- libs/libvpx/vp9/encoder/vp9_temporal_filter.c | 837 +- libs/libvpx/vp9/encoder/vp9_temporal_filter.h | 21 +- libs/libvpx/vp9/encoder/vp9_tokenize.h | 6 +- libs/libvpx/vp9/encoder/vp9_treewriter.h | 6 +- .../encoder/x86/highbd_temporal_filter_sse4.c | 943 ++ .../encoder/x86/temporal_filter_constants.h | 410 + .../vp9/encoder/x86/temporal_filter_sse4.c | 1046 +- .../vp9/encoder/x86/vp9_dct_intrin_sse2.c | 452 +- libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c | 465 - .../encoder/x86/vp9_diamond_search_sad_avx.c | 2 +- .../x86/vp9_highbd_block_error_intrin_sse2.c | 19 +- .../vp9/encoder/x86/vp9_quantize_avx2.c | 139 + .../vp9/encoder/x86/vp9_quantize_sse2.c | 16 +- libs/libvpx/vp9/vp9_common.mk | 44 +- libs/libvpx/vp9/vp9_cx_iface.c | 361 +- libs/libvpx/vp9/vp9_dx_iface.c | 44 +- libs/libvpx/vp9/vp9_dx_iface.h | 8 +- libs/libvpx/vp9/vp9_iface_common.h | 12 +- libs/libvpx/vp9/vp9cx.mk | 22 +- libs/libvpx/vp9/vp9dx.mk | 2 + libs/libvpx/vpx/exports_spatial_svc | 6 - libs/libvpx/vpx/internal/vpx_codec_internal.h | 6 +- libs/libvpx/vpx/src/vpx_encoder.c | 34 +- libs/libvpx/vpx/src/vpx_image.c | 21 +- libs/libvpx/vpx/vp8.h | 27 +- libs/libvpx/vpx/vp8cx.h | 238 +- libs/libvpx/vpx/vp8dx.h | 32 +- libs/libvpx/vpx/vpx_codec.h | 12 +- libs/libvpx/vpx/vpx_codec.mk | 4 - libs/libvpx/vpx/vpx_decoder.h | 6 +- libs/libvpx/vpx/vpx_encoder.h | 79 +- libs/libvpx/vpx/vpx_frame_buffer.h | 14 +- libs/libvpx/vpx/vpx_image.h | 43 +- libs/libvpx/vpx/vpx_integer.h | 35 +- libs/libvpx/vpx_dsp/add_noise.c | 2 + libs/libvpx/vpx_dsp/arm/avg_pred_neon.c | 46 +- libs/libvpx/vpx_dsp/arm/deblock_neon.c | 5 - libs/libvpx/vpx_dsp/arm/fdct_neon.c | 1 + libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c | 1 + .../vpx_dsp/arm/highbd_idct16x16_add_neon.c | 178 +- .../arm/highbd_idct32x32_1024_add_neon.c | 82 +- .../arm/highbd_idct32x32_135_add_neon.c | 1 + .../arm/highbd_idct32x32_34_add_neon.c | 1 + .../vpx_dsp/arm/highbd_idct4x4_add_neon.c | 130 +- .../vpx_dsp/arm/highbd_idct8x8_add_neon.c | 504 +- libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h | 474 + libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 59 - .../vpx_dsp/arm/idct32x32_135_add_neon.c | 12 +- .../vpx_dsp/arm/idct32x32_34_add_neon.c | 12 +- libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 45 +- libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 112 +- libs/libvpx/vpx_dsp/arm/idct_neon.h | 769 +- libs/libvpx/vpx_dsp/arm/intrapred_neon.c | 2 - 
libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm | 2 +- libs/libvpx/vpx_dsp/arm/mem_neon.h | 33 +- libs/libvpx/vpx_dsp/arm/quantize_neon.c | 135 +- libs/libvpx/vpx_dsp/arm/sad4d_neon.c | 478 +- libs/libvpx/vpx_dsp/arm/sad_neon.c | 273 +- .../libvpx/vpx_dsp/arm/subpel_variance_neon.c | 104 +- libs/libvpx/vpx_dsp/arm/subtract_neon.c | 84 +- libs/libvpx/vpx_dsp/arm/sum_neon.h | 15 +- libs/libvpx/vpx_dsp/arm/sum_squares_neon.c | 85 + libs/libvpx/vpx_dsp/arm/transpose_neon.h | 6 +- libs/libvpx/vpx_dsp/arm/variance_neon.c | 170 +- ..._convolve8_avg_horiz_filter_type1_neon.asm | 438 + ..._convolve8_avg_horiz_filter_type2_neon.asm | 439 + .../arm/vpx_convolve8_avg_neon_asm.asm | 295 - ...x_convolve8_avg_vert_filter_type1_neon.asm | 486 + ...x_convolve8_avg_vert_filter_type2_neon.asm | 487 + .../vpx_convolve8_horiz_filter_type1_neon.asm | 415 + .../vpx_convolve8_horiz_filter_type2_neon.asm | 415 + libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 5 + .../vpx_dsp/arm/vpx_convolve8_neon_asm.asm | 273 - .../vpx_dsp/arm/vpx_convolve8_neon_asm.c | 41 + .../vpx_dsp/arm/vpx_convolve8_neon_asm.h | 29 + .../vpx_convolve8_vert_filter_type1_neon.asm | 457 + .../vpx_convolve8_vert_filter_type2_neon.asm | 455 + libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 5 +- libs/libvpx/vpx_dsp/avg.c | 204 + libs/libvpx/vpx_dsp/bitreader.h | 37 +- libs/libvpx/vpx_dsp/bitreader_buffer.c | 2 +- libs/libvpx/vpx_dsp/bitreader_buffer.h | 6 +- libs/libvpx/vpx_dsp/bitwriter.c | 11 + libs/libvpx/vpx_dsp/bitwriter.h | 32 +- libs/libvpx/vpx_dsp/bitwriter_buffer.h | 6 +- libs/libvpx/vpx_dsp/deblock.c | 43 +- libs/libvpx/vpx_dsp/fastssim.c | 50 +- libs/libvpx/vpx_dsp/fwd_txfm.c | 67 +- libs/libvpx/vpx_dsp/fwd_txfm.h | 6 +- libs/libvpx/vpx_dsp/inv_txfm.c | 8 +- libs/libvpx/vpx_dsp/inv_txfm.h | 7 +- libs/libvpx/vpx_dsp/loopfilter.c | 188 +- libs/libvpx/vpx_dsp/mips/add_noise_msa.c | 4 +- libs/libvpx/vpx_dsp/mips/avg_msa.c | 3 + libs/libvpx/vpx_dsp/mips/common_dspr2.h | 6 +- .../libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c | 3 +- .../vpx_dsp/mips/convolve8_avg_horiz_dspr2.c | 3 +- libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c | 4 +- .../vpx_dsp/mips/convolve8_horiz_dspr2.c | 2 +- .../vpx_dsp/mips/convolve8_vert_dspr2.c | 2 +- .../vpx_dsp/mips/convolve_common_dspr2.h | 6 +- libs/libvpx/vpx_dsp/mips/deblock_msa.c | 88 +- libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c | 1 + libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h | 6 +- libs/libvpx/vpx_dsp/mips/idct16x16_msa.c | 1 + libs/libvpx/vpx_dsp/mips/idct32x32_msa.c | 1 + libs/libvpx/vpx_dsp/mips/idct4x4_msa.c | 1 + libs/libvpx/vpx_dsp/mips/idct8x8_msa.c | 1 + libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h | 7 +- libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h | 6 +- .../vpx_dsp/mips/loopfilter_filters_dspr2.h | 6 +- .../vpx_dsp/mips/loopfilter_macros_dspr2.h | 6 +- .../vpx_dsp/mips/loopfilter_masks_dspr2.h | 6 +- libs/libvpx/vpx_dsp/mips/loopfilter_msa.h | 6 +- libs/libvpx/vpx_dsp/mips/macros_msa.h | 6 +- libs/libvpx/vpx_dsp/mips/sad_mmi.c | 2 +- .../vpx_dsp/mips/sub_pixel_variance_msa.c | 61 +- libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h | 6 +- libs/libvpx/vpx_dsp/mips/variance_mmi.c | 639 +- libs/libvpx/vpx_dsp/mips/variance_msa.c | 5 +- .../mips/vpx_convolve8_avg_horiz_msa.c | 2 +- .../vpx_dsp/mips/vpx_convolve8_avg_msa.c | 8 +- .../vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c | 2 +- .../vpx_dsp/mips/vpx_convolve8_horiz_msa.c | 2 +- libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c | 716 ++ libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c | 8 +- .../vpx_dsp/mips/vpx_convolve8_vert_msa.c | 2 +- libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h | 
6 +- libs/libvpx/vpx_dsp/postproc.h | 6 +- .../vpx_dsp/ppc/bitdepth_conversion_vsx.h | 6 +- libs/libvpx/vpx_dsp/ppc/deblock_vsx.c | 374 + libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c | 553 + libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c | 18 + libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c | 1231 +- libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h | 48 + libs/libvpx/vpx_dsp/ppc/quantize_vsx.c | 305 + libs/libvpx/vpx_dsp/ppc/sad_vsx.c | 93 +- libs/libvpx/vpx_dsp/ppc/subtract_vsx.c | 117 + libs/libvpx/vpx_dsp/ppc/transpose_vsx.h | 38 +- libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h | 90 + libs/libvpx/vpx_dsp/ppc/types_vsx.h | 50 +- libs/libvpx/vpx_dsp/ppc/variance_vsx.c | 198 +- libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c | 96 +- libs/libvpx/vpx_dsp/prob.h | 8 +- libs/libvpx/vpx_dsp/psnr.c | 20 +- libs/libvpx/vpx_dsp/psnr.h | 36 +- libs/libvpx/vpx_dsp/psnrhvs.c | 18 +- libs/libvpx/vpx_dsp/quantize.c | 26 +- libs/libvpx/vpx_dsp/quantize.h | 23 +- libs/libvpx/vpx_dsp/sad.c | 127 +- libs/libvpx/vpx_dsp/skin_detection.h | 6 +- libs/libvpx/vpx_dsp/ssim.c | 16 +- libs/libvpx/vpx_dsp/ssim.h | 6 +- libs/libvpx/vpx_dsp/subtract.c | 28 +- libs/libvpx/vpx_dsp/sum_squares.c | 5 +- libs/libvpx/vpx_dsp/txfm_common.h | 6 +- libs/libvpx/vpx_dsp/variance.c | 563 +- libs/libvpx/vpx_dsp/variance.h | 45 +- libs/libvpx/vpx_dsp/vpx_convolve.h | 6 +- libs/libvpx/vpx_dsp/vpx_dsp.mk | 46 +- libs/libvpx/vpx_dsp/vpx_dsp_common.h | 14 +- libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 792 +- libs/libvpx/vpx_dsp/vpx_filter.h | 15 +- libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 303 +- libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 215 +- libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c | 28 +- .../vpx_dsp/x86/bitdepth_conversion_avx2.h | 6 +- .../vpx_dsp/x86/bitdepth_conversion_sse2.h | 6 +- libs/libvpx/vpx_dsp/x86/convolve.h | 142 +- libs/libvpx/vpx_dsp/x86/convolve_avx2.h | 63 +- libs/libvpx/vpx_dsp/x86/convolve_sse2.h | 88 + libs/libvpx/vpx_dsp/x86/convolve_ssse3.h | 6 +- libs/libvpx/vpx_dsp/x86/deblock_sse2.asm | 231 - .../vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 252 +- .../vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 258 +- libs/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c | 3 + libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h | 6 +- libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h | 6 +- .../libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 483 +- .../vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 6 +- .../vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 26 +- .../vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 4 +- .../vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 14 +- .../x86/highbd_intrapred_intrin_sse2.c | 3 +- .../x86/highbd_intrapred_intrin_ssse3.c | 6 +- .../vpx_dsp/x86/highbd_intrapred_sse2.asm | 16 +- .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 10 +- .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h | 31 +- .../vpx_dsp/x86/highbd_loopfilter_sse2.c | 366 +- .../vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 1 + .../x86/highbd_subpel_variance_impl_sse2.asm | 374 +- .../vpx_dsp/x86/highbd_variance_impl_sse2.asm | 16 +- .../libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 93 +- libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 553 +- libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h | 9 +- libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h | 6 +- libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c | 198 +- libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c | 569 +- libs/libvpx/vpx_dsp/x86/mem_sse2.h | 36 +- libs/libvpx/vpx_dsp/x86/post_proc_sse2.c | 141 + libs/libvpx/vpx_dsp/x86/quantize_avx.c | 93 +- libs/libvpx/vpx_dsp/x86/quantize_sse2.c | 29 +- .../x86/{quantize_x86.h => quantize_sse2.h} | 34 +- libs/libvpx/vpx_dsp/x86/quantize_ssse3.c | 90 +- 
libs/libvpx/vpx_dsp/x86/quantize_ssse3.h | 51 + libs/libvpx/vpx_dsp/x86/sad4d_avx2.c | 240 +- libs/libvpx/vpx_dsp/x86/sad4d_avx512.c | 26 +- .../vpx_dsp/x86/subpel_variance_sse2.asm | 337 +- libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c | 190 +- libs/libvpx/vpx_dsp/x86/transpose_sse2.h | 6 +- libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h | 6 +- libs/libvpx/vpx_dsp/x86/variance_avx2.c | 588 +- libs/libvpx/vpx_dsp/x86/variance_sse2.c | 646 +- libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c | 162 - .../vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 16 +- .../x86/vpx_high_subpixel_bilinear_sse2.asm | 4 +- .../vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 1161 ++ .../vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 603 +- .../x86/vpx_subpixel_8t_intrin_ssse3.c | 532 +- libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h | 6 +- libs/libvpx/vpx_mem/vpx_mem.c | 2 + libs/libvpx/vpx_mem/vpx_mem.h | 6 +- libs/libvpx/vpx_ports/arm.h | 6 +- libs/libvpx/vpx_ports/asmdefs_mmi.h | 6 +- libs/libvpx/vpx_ports/bitops.h | 6 +- libs/libvpx/vpx_ports/emmintrin_compat.h | 6 +- libs/libvpx/vpx_ports/emms_mmx.asm | 18 + .../libvpx/vpx_ports/{config.h => emms_mmx.c} | 9 +- .../{emms.asm => float_control_word.asm} | 5 - libs/libvpx/vpx_ports/mem.h | 6 +- libs/libvpx/vpx_ports/mem_ops.h | 7 +- libs/libvpx/vpx_ports/mem_ops_aligned.h | 6 +- libs/libvpx/vpx_ports/msvc.h | 6 +- libs/libvpx/vpx_ports/ppc.h | 6 +- libs/libvpx/vpx_ports/system_state.h | 22 +- libs/libvpx/vpx_ports/vpx_once.h | 6 +- libs/libvpx/vpx_ports/vpx_ports.mk | 13 +- libs/libvpx/vpx_ports/vpx_timer.h | 6 +- libs/libvpx/vpx_ports/x86.h | 81 +- libs/libvpx/vpx_scale/generic/gen_scalers.c | 4 +- libs/libvpx/vpx_scale/generic/vpx_scale.c | 4 +- libs/libvpx/vpx_scale/generic/yv12config.c | 57 +- libs/libvpx/vpx_scale/vpx_scale.h | 6 +- libs/libvpx/vpx_scale/yv12config.h | 10 +- libs/libvpx/vpx_util/endian_inl.h | 6 +- libs/libvpx/vpx_util/vpx_atomics.h | 10 +- libs/libvpx/vpx_util/vpx_debug_util.c | 282 + libs/libvpx/vpx_util/vpx_debug_util.h | 70 + libs/libvpx/vpx_util/vpx_thread.h | 29 +- libs/libvpx/vpx_util/vpx_timestamp.h | 45 + libs/libvpx/vpx_util/vpx_util.mk | 3 + libs/libvpx/vpx_util/vpx_write_yuv_frame.c | 2 +- libs/libvpx/vpx_util/vpx_write_yuv_frame.h | 6 +- libs/libvpx/vpxdec.c | 74 +- libs/libvpx/vpxenc.c | 370 +- libs/libvpx/vpxenc.h | 6 +- libs/libvpx/vpxstats.h | 6 +- libs/libvpx/warnings.h | 6 +- libs/libvpx/webmdec.h | 6 +- libs/libvpx/webmenc.h | 6 +- libs/libvpx/y4menc.c | 8 +- libs/libvpx/y4menc.h | 6 +- libs/libvpx/y4minput.c | 31 +- libs/libvpx/y4minput.h | 6 +- 821 files changed, 89961 insertions(+), 48650 deletions(-) delete mode 100644 libs/libvpx/build/.gitattributes delete mode 100644 libs/libvpx/build/.gitignore rename libs/libvpx/{vpx => examples}/svc_context.h (83%) rename libs/libvpx/{vpx/src => examples}/svc_encodeframe.c (85%) create mode 100644 libs/libvpx/examples/vpx_dec_fuzzer.cc create mode 100644 libs/libvpx/test/bench.cc create mode 100644 libs/libvpx/test/bench.h delete mode 100644 libs/libvpx/test/datarate_test.cc create mode 100644 libs/libvpx/test/decode_corrupted.cc create mode 100644 libs/libvpx/test/svc_datarate_test.cc create mode 100644 libs/libvpx/test/svc_end_to_end_test.cc create mode 100644 libs/libvpx/test/svc_test.h delete mode 100644 libs/libvpx/test/temporal_filter_test.cc create mode 100644 libs/libvpx/test/timestamp_test.cc create mode 100644 libs/libvpx/test/vp8_datarate_test.cc create mode 100644 libs/libvpx/test/vp9_datarate_test.cc delete mode 100755 libs/libvpx/test/vp9_spatial_svc_encoder.sh create mode 100644 
libs/libvpx/test/yuv_temporal_filter_test.cc create mode 100644 libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md create mode 100644 libs/libvpx/third_party/libyuv/LICENSE create mode 100644 libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h create mode 100644 libs/libvpx/third_party/libyuv/source/compare_msa.cc delete mode 100644 libs/libvpx/third_party/libyuv/source/rotate_mips.cc create mode 100644 libs/libvpx/third_party/libyuv/source/rotate_msa.cc delete mode 100644 libs/libvpx/third_party/libyuv/source/row_mips.cc create mode 100644 libs/libvpx/third_party/libyuv/source/row_msa.cc delete mode 100644 libs/libvpx/third_party/libyuv/source/scale_mips.cc create mode 100644 libs/libvpx/third_party/libyuv/source/scale_msa.cc create mode 100644 libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde create mode 100644 libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde create mode 100644 libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py create mode 100644 libs/libvpx/tools/set_analyzer_env.sh create mode 100644 libs/libvpx/vp8/common/arm/loopfilter_arm.h delete mode 100644 libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c delete mode 100644 libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c create mode 100644 libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c delete mode 100644 libs/libvpx/vp8/common/x86/filter_x86.c delete mode 100644 libs/libvpx/vp8/common/x86/filter_x86.h rename libs/libvpx/vp8/{common => encoder}/copy_c.c (100%) rename libs/libvpx/vp8/encoder/x86/{encodeopt.asm => block_error_sse2.asm} (100%) rename libs/libvpx/vp8/{common => encoder}/x86/copy_sse2.asm (100%) rename libs/libvpx/vp8/{common => encoder}/x86/copy_sse3.asm (100%) create mode 100644 libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c create mode 100644 libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c create mode 100644 libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c create mode 100644 libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c create mode 100644 libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h create mode 100644 libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c create mode 100644 libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c create mode 100644 libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c create mode 100644 libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c create mode 100644 libs/libvpx/vp9/decoder/vp9_job_queue.c create mode 100644 libs/libvpx/vp9/decoder/vp9_job_queue.h delete mode 100644 libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c create mode 100644 libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c create mode 100644 libs/libvpx/vp9/encoder/vp9_blockiness.h create mode 100644 libs/libvpx/vp9/encoder/vp9_partition_models.h create 
mode 100644 libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c create mode 100644 libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h delete mode 100644 libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c create mode 100644 libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c delete mode 100644 libs/libvpx/vpx/exports_spatial_svc create mode 100644 libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h create mode 100644 libs/libvpx/vpx_dsp/arm/sum_squares_neon.c create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm delete mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm delete mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm create mode 100644 libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm create mode 100644 libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c create mode 100644 libs/libvpx/vpx_dsp/ppc/deblock_vsx.c create mode 100644 libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c create mode 100644 libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h create mode 100644 libs/libvpx/vpx_dsp/ppc/quantize_vsx.c create mode 100644 libs/libvpx/vpx_dsp/ppc/subtract_vsx.c create mode 100644 libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h create mode 100644 libs/libvpx/vpx_dsp/x86/convolve_sse2.h create mode 100644 libs/libvpx/vpx_dsp/x86/post_proc_sse2.c rename libs/libvpx/vpx_dsp/x86/{quantize_x86.h => quantize_sse2.h} (70%) create mode 100644 libs/libvpx/vpx_dsp/x86/quantize_ssse3.h delete mode 100644 libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c create mode 100644 libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c create mode 100644 libs/libvpx/vpx_ports/emms_mmx.asm rename libs/libvpx/vpx_ports/{config.h => emms_mmx.c} (66%) rename libs/libvpx/vpx_ports/{emms.asm => float_control_word.asm} (90%) create mode 100644 libs/libvpx/vpx_util/vpx_debug_util.c create mode 100644 libs/libvpx/vpx_util/vpx_debug_util.h create mode 100644 libs/libvpx/vpx_util/vpx_timestamp.h diff --git a/libs/libvpx/AUTHORS b/libs/libvpx/AUTHORS index 04c2872432..2f1f8a6946 100644 --- a/libs/libvpx/AUTHORS +++ b/libs/libvpx/AUTHORS @@ -4,12 +4,13 @@ Aaron Watry Abo Talib Mahfoodh Adrian Grange -Aℓex Converse Ahmad Sharif +Aidan Welch Aleksey Vasenev Alexander Potapenko Alexander Voronov Alexandra Hájková +Aℓex Converse Alexis Ballier Alok Ahuja Alpha Lam @@ -26,11 +27,13 @@ Brion Vibber changjun.yang Charles 'Buck' Krasic Cheng Chen +Chi Yo Tsai chm Chris Cunningham Christian Duvivier Daniele Castagna Daniel Kang +Dan Zhu Deb Mukherjee Deepa K G Dim Temp @@ -38,11 +41,13 @@ Dmitry Kovalev Dragan Mrdjan Ed Baker Ehsan Akhgari +Elliott Karpilovsky Erik Niemeyer Fabio Pedretti Frank Galligan Fredrik Söderquist Fritz Koenig +Fyodor Kyslov Gabriel Marin Gaute Strokkenes Geza Lore @@ -55,7 +60,9 @@ Guillermo Ballester Valor Hangyu Kuang Hanno Böck Han Shen +Harish Mahendrakar Henrik Lundin +Hien Ho Hui Su Ivan Krasin Ivan Maltz @@ -81,6 +88,7 @@ Johann 
Koenig John Koleszar Johnny Klonaris John Stark +Jon Kunkee Joshua Bleecher Snyder Joshua Litt Julia Robson @@ -91,15 +99,19 @@ KO Myung-Hun Kyle Siefring Lawrence Velázquez Linfeng Zhang +Liu Peng Lou Quillio Luca Barbato +Luc Trudeau Makoto Kato Mans Rullgard Marco Paniconi Mark Mentovai Martin Ettl -Martin Storsjo +Martin Storsjö Matthew Heaney +Matthias Räncker +Michael Horowitz Michael Kohler Mike Frysinger Mike Hommey @@ -107,10 +119,12 @@ Mikhal Shemer Min Chen Minghai Shang Min Ye +Mirko Bonadei Moriyoshi Koizumi Morton Jonuschat Nathan E. Egge Nico Weber +Niveditha Rau Parag Salasakar Pascal Massimino Patrik Westin @@ -129,9 +143,13 @@ Rafael de Lucena Valle Rahul Chaudhry Ralph Giles Ranjit Kumar Tulabandu +Raphael Kubo da Costa +Ravi Chaudhary +Ritu Baldwa Rob Bradford Ronald S. Bultje Rui Ueyama +Sai Deng Sami Pietilä Sarah Parker Sasi Inguva @@ -139,12 +157,15 @@ Scott Graham Scott LaVarnway Sean McGovern Sergey Kolomenkin +Sergey Silkin Sergey Ulanov Shimon Doodkin Shiyou Yin +Shubham Tandle Shunyao Li Stefan Holmer Suman Sunkara +Supradeep T R Sylvestre Ledru Taekhyun Kim Takanori MATSUURA @@ -157,11 +178,15 @@ Timothy B. Terriberry Tom Finegan Tristan Matthews Urvang Joshi +Venkatarama NG. Avadhani Vignesh Venkatasubramanian Vlad Tsyrklevich +Wan-Teh Chang +xiwei gu Yaowu Xu Yi Luo Yongzhe Wang +Yue Chen Yunqing Wang Yury Gitman Zoe Liu diff --git a/libs/libvpx/CHANGELOG b/libs/libvpx/CHANGELOG index 2281394c8e..a7d8311c5f 100644 --- a/libs/libvpx/CHANGELOG +++ b/libs/libvpx/CHANGELOG @@ -1,4 +1,63 @@ -2017-01-04 v1.7.0 "Mandarin Duck" +2019-07-15 v1.8.1 "Orpington Duck" + This release collects incremental improvements to many aspects of the library. + + - Upgrading: + VP8E_SET_CPUUSED now accepts values up to 9 for vp9. + VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). + The --sdk-path option has been removed. If you were using it to build for + Android please read build/make/Android.mk for alternatives. + All PPC optimizations have been disabled: + https://bugs.chromium.org/p/webm/issues/detail?id=1522. + + - Enhancements: + Various changes to improve encoder rate control, quality and speed + for practically every use case. + + - Bug fixes: + vp9-rtc: Fix color artifacts for speed >= 8. + +2019-01-31 v1.8.0 "Northern Shoveler Duck" + This release focused on encoding performance for realtime and VOD use cases. + + - Upgrading: + This adds and improves several vp9 controls. Most are related to SVC: + VP9E_SET_SVC_FRAME_DROP_LAYER: + - Frame dropping in SVC. + VP9E_SET_SVC_INTER_LAYER_PRED: + - Inter-layer prediction in SVC. + VP9E_SET_SVC_GF_TEMPORAL_REF: + - Enable long term temporal reference in SVC. + VP9E_SET_SVC_REF_FRAME_CONFIG/VP9E_GET_SVC_REF_FRAME_CONFIG: + - Extend and improve this control for better flexibility in setting SVC + pattern dynamically. + VP9E_SET_POSTENCODE_DROP: + - Allow for post-encode frame dropping (applies to non-SVC too). + VP9E_SET_SVC_SPATIAL_LAYER_SYNC: + - Enable spatial layer sync frames. + VP9E_SET_SVC_LAYER_ID: + - Extend api to specify temporal id for each spatial layer. + VP9E_SET_ROI_MAP: + - Extend Region of Interest functionality to VP9. + + - Enhancements: + 2 pass vp9 encoding has improved substantially. When using --auto-alt-ref=6, + we see approximately 8% for VBR and 10% for CQ. When using --auto-alt-ref=1, + the gains are approximately 4% for VBR and 5% for CQ. + + For real-time encoding, speed 7 has improved by ~5-10%.
Encodes targeted at + screen sharing have improved when the content changes significantly (slide + sharing) or scrolls. There is a new speed 9 setting for mobile devices which + is about 10-20% faster than speed 8. + + - Bug fixes: + VP9 denoiser issue. + VP9 partition issue for 1080p. + VP9 rate control improvements. + Postprocessing Multi Frame Quality Enhancement (MFQE) issue. + VP8 multithread decoder issues. + A variety of fuzzing issues. + +2018-01-04 v1.7.0 "Mandarin Duck" This release focused on high bit depth performance (10/12 bit) and vp9 encoding improvements. diff --git a/libs/libvpx/README b/libs/libvpx/README index 73304dd62f..a1000e0850 100644 --- a/libs/libvpx/README +++ b/libs/libvpx/README @@ -1,4 +1,4 @@ -README - 24 January 2018 +README - 15 July 2019 Welcome to the WebM VP8/VP9 Codec SDK! @@ -9,22 +9,26 @@ COMPILING THE APPLICATIONS/LIBRARIES: 1. Prerequisites - * All x86 targets require the Yasm[1] assembler be installed. - * All Windows builds require that Cygwin[2] be installed. - * Building the documentation requires Doxygen[3]. If you do not + * All x86 targets require the Yasm[1] assembler be installed[2]. + * All Windows builds require that Cygwin[3] be installed. + * Building the documentation requires Doxygen[4]. If you do not have this package, the install-docs option will be disabled. - * Downloading the data for the unit tests requires curl[4] and sha1sum. + * Downloading the data for the unit tests requires curl[5] and sha1sum. sha1sum is provided via the GNU coreutils, installed by default on many *nix platforms, as well as MinGW and Cygwin. If coreutils is not available, a compatible version of sha1sum can be built from - source[5]. These requirements are optional if not running the unit + source[6]. These requirements are optional if not running the unit tests. [1]: http://www.tortall.net/projects/yasm - [2]: http://www.cygwin.com - [3]: http://www.doxygen.org - [4]: http://curl.haxx.se - [5]: http://www.microbrew.org/tools/md5sha1sum/ + [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the + PATH for Visual Studio. For VS2017 it is sufficient to rename + yasm-<version>-<arch>.exe to yasm.exe and place it in: + Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/ + [3]: http://www.cygwin.com + [4]: http://www.doxygen.org + [5]: http://curl.haxx.se + [6]: http://www.microbrew.org/tools/md5sha1sum/ 2. Out-of-tree builds Out of tree builds are a supported method of building the application. For @@ -41,7 +45,16 @@ COMPILING THE APPLICATIONS/LIBRARIES: used to get a list of supported options: $ ../libvpx/configure --help - 4. Cross development + 4. Compiler analyzers + Compilers have added sanitizers which instrument binaries with information + about address calculation, memory usage, threading, undefined behavior, and + other common errors. To simplify building libvpx with some of these features + use tools/set_analyzer_env.sh before running configure. It will set the + compiler and necessary flags for building as well as environment variables + read by the analyzer when testing the binaries. + $ source ../libvpx/tools/set_analyzer_env.sh address + + 5. Cross development For cross development, the most notable option is the --target option. The most up-to-date list of supported targets can be found at the bottom of the --help output of the configure script.
As of this writing, the list of @@ -50,20 +63,20 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc arm64-linux-gcc + arm64-win64-gcc + arm64-win64-vs15 armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct armv7-linux-gcc armv7-none-rvct - armv7-win32-vs11 - armv7-win32-vs12 + armv7-win32-gcc armv7-win32-vs14 armv7-win32-vs15 armv7s-darwin-gcc armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc - ppc64-linux-gcc ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc @@ -78,15 +91,13 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin14-gcc x86-darwin15-gcc x86-darwin16-gcc + x86-darwin17-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc x86-os2-gcc x86-solaris-gcc x86-win32-gcc - x86-win32-vs10 - x86-win32-vs11 - x86-win32-vs12 x86-win32-vs14 x86-win32-vs15 x86_64-android-gcc @@ -98,14 +109,12 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin14-gcc x86_64-darwin15-gcc x86_64-darwin16-gcc + x86_64-darwin17-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc x86_64-solaris-gcc x86_64-win64-gcc - x86_64-win64-vs10 - x86_64-win64-vs11 - x86_64-win64-vs12 x86_64-win64-vs14 x86_64-win64-vs15 generic-gnu @@ -123,7 +132,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS. - 5. Configuration errors + 6. Configuration errors If the configuration step fails, the first step is to look in the error log. This defaults to config.log. This should give a good indication of what went wrong. If not, contact us for support. diff --git a/libs/libvpx/args.h b/libs/libvpx/args.h index 54abe04607..aae8ec06a5 100644 --- a/libs/libvpx/args.h +++ b/libs/libvpx/args.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef ARGS_H_ -#define ARGS_H_ +#ifndef VPX_ARGS_H_ +#define VPX_ARGS_H_ #include <stdio.h> #ifdef __cplusplus @@ -60,4 +60,4 @@ int arg_parse_enum_or_int(const struct arg *arg); } // extern "C" #endif -#endif // ARGS_H_ +#endif // VPX_ARGS_H_ diff --git a/libs/libvpx/build/.gitattributes b/libs/libvpx/build/.gitattributes deleted file mode 100644 index 03db79bc08..0000000000 --- a/libs/libvpx/build/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -*-vs8/*.rules -crlf -*-msvs/*.rules -crlf diff --git a/libs/libvpx/build/.gitignore b/libs/libvpx/build/.gitignore deleted file mode 100644 index 1350fcb5eb..0000000000 --- a/libs/libvpx/build/.gitignore +++ /dev/null @@ -1 +0,0 @@ -x86*-win32-vs* diff --git a/libs/libvpx/build/make/Android.mk b/libs/libvpx/build/make/Android.mk index a88f90056e..6cb3af027b 100644 --- a/libs/libvpx/build/make/Android.mk +++ b/libs/libvpx/build/make/Android.mk @@ -14,7 +14,7 @@ # Run the configure script from the jni directory. Base libvpx # encoder/decoder configuration will look similar to: # ./libvpx/configure --target=armv7-android-gcc --disable-examples \ -# --sdk-path=/opt/android-ndk-r6b/ +# --enable-external-build # # When targeting Android, realtime-only is enabled by default. This can # be overridden by adding the command line flag: @@ -29,37 +29,20 @@ # include $(CLEAR_VARS) # include jni/libvpx/build/make/Android.mk # -# By default libvpx will detect at runtime the existance of NEON extension. -# For this we import the 'cpufeatures' module from the NDK sources. -# libvpx can also be configured without this runtime detection method. -# Configuring with --disable-runtime-cpu-detect will assume presence of NEON.
-# Configuring with --disable-runtime-cpu-detect --disable-neon \ -# --disable-neon-asm -# will remove any NEON dependency. +# By default libvpx will use the 'cpufeatures' module from the NDK. This allows +# the library to be built with all available optimizations (SSE2->AVX512 for +# x86, NEON for arm, DSPr2 for mips). This can be disabled with +# --disable-runtime-cpu-detect +# but the resulting library *must* be run on devices supporting all of the +# enabled extensions. They can be disabled individually with +# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512} +# --disable-neon[-asm] +# --disable-{dspr2, msa} # # Running ndk-build will build libvpx and include it in your project. # -# Alternatively, building the examples and unit tests can be accomplished in the -# following way: -# -# Create a standalone toolchain from the NDK: -# https://developer.android.com/ndk/guides/standalone_toolchain.html -# -# For example - to test on arm64 devices with clang: -# $NDK/build/tools/make_standalone_toolchain.py \ -# --arch arm64 --install-dir=/tmp/my-android-toolchain -# export PATH=/tmp/my-android-toolchain/bin:$PATH -# CROSS=aarch64-linux-android- CC=clang CXX=clang++ /path/to/libvpx/configure \ -# --target=arm64-android-gcc -# -# Push the resulting binaries to a device and run them: -# adb push test_libvpx /data/tmp/test_libvpx -# adb shell /data/tmp/test_libvpx --gtest_filter=\*Sixtap\* -# -# Make sure to push the test data as well and set LIBVPX_TEST_DATA - CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas diff --git a/libs/libvpx/build/make/Makefile b/libs/libvpx/build/make/Makefile index f6b3f0630f..c070cd0e0c 100644 --- a/libs/libvpx/build/make/Makefile +++ b/libs/libvpx/build/make/Makefile @@ -99,6 +99,7 @@ distclean: clean rm -f Makefile; \ rm -f config.log config.mk; \ rm -f vpx_config.[hc] vpx_config.asm; \ + rm -f arm_neon.h; \ else \ rm -f $(target)-$(TOOLCHAIN).mk; \ fi diff --git a/libs/libvpx/build/make/ads2gas.pl b/libs/libvpx/build/make/ads2gas.pl index 029cc4a56f..b6a8f53eae 100755 --- a/libs/libvpx/build/make/ads2gas.pl +++ b/libs/libvpx/build/make/ads2gas.pl @@ -23,16 +23,17 @@ use lib $FindBin::Bin; use thumb; my $thumb = 0; +my $elf = 1; foreach my $arg (@ARGV) { $thumb = 1 if ($arg eq "-thumb"); + $elf = 0 if ($arg eq "-noelf"); } print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; -print "\t.equ DO1STROUNDING, 0\n"; +print "\t.syntax unified\n"; if ($thumb) { - print "\t.syntax unified\n"; print "\t.thumb\n"; } @@ -140,7 +141,11 @@ while (<STDIN>) # Make function visible to linker, and make additional symbol with # prepended underscore - s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; + if ($elf) { + s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; + } else { + s/EXPORT\s+\|([\$\w]*)\|/.global $1/; + } s/IMPORT\s+\|([\$\w]*)\|/.global $1/; s/EXPORT\s+([\$\w]*)/.global $1/; @@ -181,11 +186,16 @@ while (<STDIN>) # eabi_attributes numerical equivalents can be found in the # "ARM IHI 0045C" document.
- # REQUIRE8 Stack is required to be 8-byte aligned - s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; + if ($elf) { + # REQUIRE8 Stack is required to be 8-byte aligned + s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; - # PRESERVE8 Stack 8-byte align is preserved - s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + # PRESERVE8 Stack 8-byte align is preserved + s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + } else { + s/\sREQUIRE8//; + s/\sPRESERVE8//; + } # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. @@ -202,7 +212,7 @@ while () my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); - $_ = "\t.size $proc, .-$proc".$_ if ($proc); + $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf); } # EQU directive @@ -225,4 +235,4 @@ while () } # Mark that this object doesn't need an executable stack. -printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n"); +printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf; diff --git a/libs/libvpx/build/make/ads2gas_apple.pl b/libs/libvpx/build/make/ads2gas_apple.pl index e1ae7b4f87..848872fa7d 100755 --- a/libs/libvpx/build/make/ads2gas_apple.pl +++ b/libs/libvpx/build/make/ads2gas_apple.pl @@ -20,9 +20,7 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas_apple.pl script.\n\n"; -print "\t.set WIDE_REFERENCE, 0\n"; -print "\t.set ARCHITECTURE, 5\n"; -print "\t.set DO1STROUNDING, 0\n"; +print "\t.syntax unified\n"; my %register_aliases; my %macro_aliases; diff --git a/libs/libvpx/build/make/configure.sh b/libs/libvpx/build/make/configure.sh index 4bf61eb5eb..4c82b83e48 100644 --- a/libs/libvpx/build/make/configure.sh +++ b/libs/libvpx/build/make/configure.sh @@ -319,6 +319,12 @@ check_ld() { && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs} } +check_lib() { + log check_lib "$@" + check_cc $@ \ + && check_cmd ${LD} ${LDFLAGS} -o ${TMP_X} ${TMP_O} "$@" ${extralibs} +} + check_header(){ log check_header "$@" header=$1 @@ -420,6 +426,26 @@ check_gcc_machine_options() { fi } +check_gcc_avx512_compiles() { + if disabled gcc; then + return + fi + + check_cc -mavx512f < +void f(void) { + __m512i x = _mm512_set1_epi16(0); + (void)x; +} +EOF + compile_result=$? + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling avx512: not supported by compiler" + disable_feature avx512 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + fi +} + write_common_config_banner() { print_webm_license config.mk "##" "" echo '# This file automatically generated by configure. Do not edit!' >> config.mk @@ -481,6 +507,7 @@ AS_SFX = ${AS_SFX:-.asm} EXE_SFX = ${EXE_SFX} VCPROJ_SFX = ${VCPROJ_SFX} RTCD_OPTIONS = ${RTCD_OPTIONS} +LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS} EOF if enabled rvct; then cat >> $1 << EOF @@ -520,6 +547,24 @@ EOF cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" } +write_win_arm64_neon_h_workaround() { + print_webm_license ${TMP_H} "/*" " */" + cat >> ${TMP_H} << EOF +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_WIN_ARM_NEON_H_WORKAROUND +#define VPX_WIN_ARM_NEON_H_WORKAROUND +/* The Windows SDK has arm_neon.h, but unlike on other platforms it is + * ARM32-only. ARM64 NEON support is provided by arm64_neon.h, a proper + * superset of arm_neon.h. Work around this by providing a more local + * arm_neon.h that simply #includes arm64_neon.h. 
+ */ +#include +#endif /* VPX_WIN_ARM_NEON_H_WORKAROUND */ +EOF + mkdir -p `dirname "$1"` + cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" +} + process_common_cmdline() { for opt in "$@"; do optval="${opt#*=}" @@ -602,11 +647,7 @@ process_common_cmdline() { --libdir=*) libdir="${optval}" ;; - --sdk-path=*) - [ -d "${optval}" ] || die "Not a directory: ${optval}" - sdk_path="${optval}" - ;; - --libc|--as|--prefix|--libdir|--sdk-path) + --libc|--as|--prefix|--libdir) die "Option ${opt} requires argument" ;; --help|-h) @@ -713,11 +754,8 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; - power*64*-*) - tgt_isa=ppc64 - ;; - power*) - tgt_isa=ppc + power*64le*-*) + tgt_isa=ppc64le ;; *mips64el*) tgt_isa=mips64 @@ -837,7 +875,7 @@ process_common_toolchain() { IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi # Handle darwin variants. Newer SDKs allow targeting older @@ -957,7 +995,6 @@ process_common_toolchain() { setup_gnu_toolchain arch_int=${tgt_isa##armv} arch_int=${arch_int%%te} - check_add_asflags --defsym ARCHITECTURE=${arch_int} tune_cflags="-mtune=" if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then if [ -z "${float_abi}" ]; then @@ -984,6 +1021,16 @@ EOF enabled debug && add_asflags -g asm_conversion_cmd="${source_path}/build/make/ads2gas.pl" + + case ${tgt_os} in + win*) + asm_conversion_cmd="$asm_conversion_cmd -noelf" + AS="$CC -c" + EXE_SFX=.exe + enable_feature thumb + ;; + esac + if enabled thumb; then asm_conversion_cmd="$asm_conversion_cmd -thumb" check_add_cflags -mthumb @@ -991,18 +1038,41 @@ EOF fi ;; vs*) - asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" - AS_SFX=.S - msvs_arch_dir=arm-msvs - disable_feature multithread - disable_feature unit_tests - vs_version=${tgt_cc##vs} - if [ $vs_version -ge 12 ]; then - # MSVC 2013 doesn't allow doing plain .exe projects for ARM, - # only "AppContainerApplication" which requires an AppxManifest. - # Therefore disable the examples, just build the library. - disable_feature examples - disable_feature tools + # A number of ARM-based Windows platforms are constrained by their + # respective SDKs' limitations. Fortunately, these are all 32-bit ABIs + # and so can be selected as 'win32'. + if [ ${tgt_os} = "win32" ]; then + asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" + AS_SFX=.S + msvs_arch_dir=arm-msvs + disable_feature multithread + disable_feature unit_tests + if [ ${tgt_cc##vs} -ge 12 ]; then + # MSVC 2013 doesn't allow doing plain .exe projects for ARM32, + # only "AppContainerApplication" which requires an AppxManifest. + # Therefore disable the examples, just build the library. + disable_feature examples + disable_feature tools + fi + else + # Windows 10 on ARM, on the other hand, has full Windows SDK support + # for building Win32 ARM64 applications in addition to ARM64 + # Windows Store apps. It is the only 64-bit ARM ABI that + # Windows supports, so it is the default definition of 'win64'. + # ARM64 build support officially shipped in Visual Studio 15.9.0. + + # Because the ARM64 Windows SDK's arm_neon.h is ARM32-specific + # while LLVM's is not, probe its validity. + if enabled neon; then + if [ -n "${CC}" ]; then + check_header arm_neon.h || check_header arm64_neon.h && \ + enable_feature win_arm64_neon_h_workaround + else + # If a probe is not possible, assume this is the pure Windows + # SDK and so the workaround is necessary. 
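For concreteness, the header emitted by write_win_arm64_neon_h_workaround above amounts to the following (a sketch reconstructed from the function's own comment, which names arm64_neon.h as the include target):

    /* Generated arm_neon.h shim for Windows ARM64 (sketch). The Windows
     * SDK's arm_neon.h is ARM32-only, while arm64_neon.h is a proper
     * superset, so forwarding to it lets sources that include arm_neon.h
     * build unchanged. */
    #ifndef VPX_WIN_ARM_NEON_H_WORKAROUND
    #define VPX_WIN_ARM_NEON_H_WORKAROUND
    #include <arm64_neon.h>
    #endif /* VPX_WIN_ARM_NEON_H_WORKAROUND */
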
+ enable_feature win_arm64_neon_h_workaround + fi + fi fi ;; rvct) @@ -1030,7 +1100,6 @@ EOF fi arch_int=${tgt_isa##armv} arch_int=${arch_int%%te} - check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\"" enabled debug && add_asflags -g add_cflags --gnu add_cflags --enum_is_int @@ -1045,51 +1114,10 @@ EOF ;; android*) - if [ -n "${sdk_path}" ]; then - SDK_PATH=${sdk_path} - COMPILER_LOCATION=`find "${SDK_PATH}" \ - -name "arm-linux-androideabi-gcc*" -print -quit` - TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi- - CC=${TOOLCHAIN_PATH}gcc - CXX=${TOOLCHAIN_PATH}g++ - AR=${TOOLCHAIN_PATH}ar - LD=${TOOLCHAIN_PATH}gcc - AS=${TOOLCHAIN_PATH}as - STRIP=${TOOLCHAIN_PATH}strip - NM=${TOOLCHAIN_PATH}nm - - if [ -z "${alt_libc}" ]; then - alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \ - awk '{n = split($0,a,"/"); \ - split(a[n-1],b,"-"); \ - print $0 " " b[2]}' | \ - sort -g -k 2 | \ - awk '{ print $1 }' | tail -1` - fi - - if [ -d "${alt_libc}" ]; then - add_cflags "--sysroot=${alt_libc}" - add_ldflags "--sysroot=${alt_libc}" - fi - - # linker flag that routes around a CPU bug in some - # Cortex-A8 implementations (NDK Dev Guide) - add_ldflags "-Wl,--fix-cortex-a8" - - enable_feature pic - soft_enable realtime_only - if [ ${tgt_isa} = "armv7" ]; then - soft_enable runtime_cpu_detect - fi - if enabled runtime_cpu_detect; then - add_cflags "-I${SDK_PATH}/sources/android/cpufeatures" - fi - else - echo "Assuming standalone build with NDK toolchain." - echo "See build/make/Android.mk for details." - check_add_ldflags -static - soft_enable unit_tests - fi + echo "Assuming standalone build with NDK toolchain." + echo "See build/make/Android.mk for details." + check_add_ldflags -static + soft_enable unit_tests ;; darwin*) @@ -1204,6 +1232,11 @@ EOF esac if enabled msa; then + # TODO(libyuv:793) + # The new mips functions in libyuv do not build + # with the toolchains we currently use for testing. + soft_disable libyuv + add_cflags -mmsa add_asflags -mmsa add_ldflags -mmsa @@ -1219,13 +1252,25 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) + ppc64le*) link_with_cc=gcc setup_gnu_toolchain - check_gcc_machine_option "vsx" + # Do not enable vsx by default. + # https://bugs.chromium.org/p/webm/issues/detail?id=1522 + enabled vsx || RTCD_OPTIONS="${RTCD_OPTIONS}--disable-vsx " + if [ -n "${tune_cpu}" ]; then + case ${tune_cpu} in + power?) + tune_cflags="-mcpu=" + ;; + esac + fi ;; x86*) case ${tgt_os} in + android) + soft_enable realtime_only + ;; win*) enabled gcc && add_cflags -fno-common ;; @@ -1277,28 +1322,13 @@ EOF # Skip the check by setting AS arbitrarily AS=msvs msvs_arch_dir=x86-msvs - vc_version=${tgt_cc##vs} - case $vc_version in - 7|8|9|10|11|12|13|14) + case ${tgt_cc##vs} in + 14) echo "${tgt_cc} does not support avx512, disabling....." RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " soft_disable avx512 ;; esac - case $vc_version in - 7|8|9|10) - echo "${tgt_cc} does not support avx/avx2, disabling....." - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 " - soft_disable avx - soft_disable avx2 - ;; - esac - case $vc_version in - 7|8|9) - echo "${tgt_cc} omits stdint.h, disabling webm-io..." 
- soft_disable webm_io - ;; - esac ;; esac @@ -1331,16 +1361,12 @@ EOF else if [ "$ext" = "avx512" ]; then check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + check_gcc_avx512_compiles else # use the shortened version for the flag: sse4_1 -> sse4 check_gcc_machine_option ${ext%_*} $ext fi fi - - # https://bugs.chromium.org/p/webm/issues/detail?id=1464 - # The assembly optimizations for vpx_sub_pixel_variance do not link with - # gcc 6. - enabled sse2 && soft_enable pic done if enabled external_build; then @@ -1400,7 +1426,8 @@ EOF add_cflags ${sim_arch} add_ldflags ${sim_arch} - if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then + if [ "$(disabled external_build)" ] && + [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it # on is pointless (unless building a C-only lib). Warn the user, but # do nothing here. @@ -1490,7 +1517,11 @@ EOF # bionic includes basic pthread functionality, obviating -lpthread. ;; *) - check_header pthread.h && add_extralibs -lpthread + check_header pthread.h && check_lib -lpthread < +#include +int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } +EOF ;; esac fi diff --git a/libs/libvpx/build/make/gen_msvs_vcxproj.sh b/libs/libvpx/build/make/gen_msvs_vcxproj.sh index 171d0b99b6..84515ecff4 100755 --- a/libs/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/libs/libvpx/build/make/gen_msvs_vcxproj.sh @@ -261,6 +261,11 @@ case "$target" in asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)"" asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)"" ;; + arm64*) + platforms[0]="ARM64" + asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)"" + asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)"" + ;; arm*) platforms[0]="ARM" asm_Debug_cmdline="armasm -nologo -oldit "%(FullPath)"" @@ -307,6 +312,16 @@ generate_vcxproj() { tag_content ApplicationType "Windows Store" tag_content ApplicationTypeRevision 8.1 fi + if [ "${platforms[0]}" = "ARM64" ]; then + # Require the first Visual Studio version to have ARM64 support. + tag_content MinimumVisualStudioVersion 15.9 + fi + if [ $vs_ver -eq 15 ] && [ "${platforms[0]}" = "ARM64" ]; then + # Since VS 15 does not have a 'use latest SDK version' facility, + # specifically require the contemporaneous SDK with official ARM64 + # support. + tag_content WindowsTargetPlatformVersion 10.0.17763.0 + fi close_tag PropertyGroup tag Import \ diff --git a/libs/libvpx/build/make/iosbuild.sh b/libs/libvpx/build/make/iosbuild.sh index 3211d4f5ef..e1633a89a8 100755 --- a/libs/libvpx/build/make/iosbuild.sh +++ b/libs/libvpx/build/make/iosbuild.sh @@ -132,7 +132,8 @@ create_vpx_framework_config_shim() { done # Consume the last line of output from the loop: We don't want it. - sed -i '' -e '$d' "${config_file}" + sed -i.bak -e '$d' "${config_file}" + rm "${config_file}.bak" printf "#endif\n\n" >> "${config_file}" printf "#endif // ${include_guard}" >> "${config_file}" @@ -244,7 +245,7 @@ build_framework() { # Trap function. Cleans up the subtree used to build all targets contained in # $TARGETS. cleanup() { - local readonly res=$? + local res=$? 
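For reference, the two toolchain probes added in the configure.sh hunks above feed short C programs to the compiler; their heredoc bodies lost the bracketed include targets in transit, so the reconstructions below are best-effort (immintrin.h is the conventional header for _mm512 intrinsics, and stddef.h is assumed for NULL). check_gcc_avx512_compiles compiles roughly:

    #include <immintrin.h>
    void f(void) {
      __m512i x = _mm512_set1_epi16(0); /* requires AVX-512F */
      (void)x;
    }

and the pthread probe links roughly:

    #include <pthread.h>
    #include <stddef.h>
    int main(void) { return pthread_create(NULL, NULL, NULL, NULL); }

Linking rather than merely compiling matters here: some mingw toolchains ship pthread.h without a usable libpthread, which the old header-only check missed.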
cd "${ORIG_PWD}" if [ $res -ne 0 ]; then @@ -350,7 +351,7 @@ if [ "$ENABLE_SHARED" = "yes" ]; then IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi if [ "${VERBOSE}" = "yes" ]; then diff --git a/libs/libvpx/build/make/msvs_common.sh b/libs/libvpx/build/make/msvs_common.sh index 88f1cf9b57..27ddf7fd91 100644 --- a/libs/libvpx/build/make/msvs_common.sh +++ b/libs/libvpx/build/make/msvs_common.sh @@ -41,6 +41,15 @@ fix_path() { # Corrects the paths in file_list in one pass for efficiency. # $1 is the name of the array to be modified. fix_file_list() { + if [ "${FIXPATH}" = "echo_path" ] ; then + # When used with echo_path, fix_file_list is a no-op. Avoid warning about + # unsupported 'declare -n' when it is not important. + return 0 + elif [ "${BASH_VERSINFO}" -lt 4 ] ; then + echo "Cygwin path conversion has failed. Please use a version of bash" + echo "which supports nameref (-n), introduced in bash 4.3" + return 1 + fi declare -n array_ref=$1 files=$(fix_path "${array_ref[@]}") local IFS=$'\n' diff --git a/libs/libvpx/build/make/rtcd.pl b/libs/libvpx/build/make/rtcd.pl index 68e92b52cc..7483200411 100755 --- a/libs/libvpx/build/make/rtcd.pl +++ b/libs/libvpx/build/make/rtcd.pl @@ -400,12 +400,13 @@ EOF # &require("c"); +&require(keys %required); if ($opts{arch} eq 'x86') { @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); x86; } elsif ($opts{arch} eq 'x86_64') { @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); - @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/); + @REQUIRES = filter(qw/mmx sse sse2/); &require(@REQUIRES); x86; } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') { @@ -433,6 +434,7 @@ if ($opts{arch} eq 'x86') { arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { @ALL_ARCHS = filter(qw/neon/); + &require("neon"); arm; } elsif ($opts{arch} =~ /^ppc/ ) { @ALL_ARCHS = filter(qw/vsx/); diff --git a/libs/libvpx/build/make/thumb.pm b/libs/libvpx/build/make/thumb.pm index 483c2539c6..9c49e2d8b7 100644 --- a/libs/libvpx/build/make/thumb.pm +++ b/libs/libvpx/build/make/thumb.pm @@ -54,13 +54,6 @@ sub FixThumbInstructions($$) # "addne r0, r0, r2". s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g; - # Convert a conditional addition to the pc register into a series of - # instructions. This converts "addlt pc, pc, r3, lsl #2" into - # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12", - # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12". - # This assumes that r12 is free at this point. - s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g; - # Convert "mov pc, lr" into "bx lr", since the former only works # for switching from arm to thumb (and only in armv7), but not # from thumb to arm. diff --git a/libs/libvpx/codereview.settings b/libs/libvpx/codereview.settings index 34c6f1d9de..ccba2eeed2 100644 --- a/libs/libvpx/codereview.settings +++ b/libs/libvpx/codereview.settings @@ -1,5 +1,4 @@ -# This file is used by gcl to get repository specific information. -GERRIT_HOST: chromium-review.googlesource.com -GERRIT_PORT: 29418 +# This file is used by git cl to get repository specific information. 
+GERRIT_HOST: True CODE_REVIEW_SERVER: chromium-review.googlesource.com GERRIT_SQUASH_UPLOADS: False diff --git a/libs/libvpx/configure b/libs/libvpx/configure index c84c891c0b..e2397ae49f 100755 --- a/libs/libvpx/configure +++ b/libs/libvpx/configure @@ -31,7 +31,6 @@ Advanced options: --libc=PATH path to alternate libc --size-limit=WxH max size to allow in the decoder --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] - --sdk-path=PATH path to root of sdk (android builds only) ${toggle_codec_srcs} in/exclude codec library source code ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_static_msvcrt} use static MSVCRT (VS builds only) @@ -101,20 +100,20 @@ EOF all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-linux-gcc" +all_platforms="${all_platforms} arm64-win64-gcc" +all_platforms="${all_platforms} arm64-win64-vs15" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 -all_platforms="${all_platforms} armv7-win32-vs11" -all_platforms="${all_platforms} armv7-win32-vs12" +all_platforms="${all_platforms} armv7-win32-gcc" all_platforms="${all_platforms} armv7-win32-vs14" all_platforms="${all_platforms} armv7-win32-vs15" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" @@ -137,9 +136,6 @@ all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-os2-gcc" all_platforms="${all_platforms} x86-solaris-gcc" all_platforms="${all_platforms} x86-win32-gcc" -all_platforms="${all_platforms} x86-win32-vs10" -all_platforms="${all_platforms} x86-win32-vs11" -all_platforms="${all_platforms} x86-win32-vs12" all_platforms="${all_platforms} x86-win32-vs14" all_platforms="${all_platforms} x86-win32-vs15" all_platforms="${all_platforms} x86_64-android-gcc" @@ -159,9 +155,6 @@ all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" all_platforms="${all_platforms} x86_64-win64-gcc" -all_platforms="${all_platforms} x86_64-win64-vs10" -all_platforms="${all_platforms} x86_64-win64-vs11" -all_platforms="${all_platforms} x86_64-win64-vs12" all_platforms="${all_platforms} x86_64-win64-vs14" all_platforms="${all_platforms} x86_64-win64-vs15" all_platforms="${all_platforms} generic-gnu" @@ -278,9 +271,9 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" - spatial_svc fp_mb_stats emulate_hardware + non_greedy_mv " CONFIG_LIST=" dependency_tracking @@ -330,12 +323,15 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility experimental size_limit always_adjust_bpm + bitstream_debug + mismatch_debug ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -391,11 +387,14 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode 
coefficient_range_checking better_hw_compatibility vp9_highbitdepth experimental always_adjust_bpm + bitstream_debug + mismatch_debug " process_cmdline() { @@ -426,6 +425,12 @@ process_cmdline() { } post_process_cmdline() { + if enabled coefficient_range_checking; then + echo "coefficient-range-checking is for decoders only, disabling encoders:" + soft_disable vp8_encoder + soft_disable vp9_encoder + fi + c="" # Enable all detected codecs, if they haven't been disabled @@ -447,6 +452,7 @@ process_targets() { enabled child || write_common_config_banner write_common_target_config_h ${BUILD_PFX}vpx_config.h write_common_config_targets + enabled win_arm64_neon_h_workaround && write_win_arm64_neon_h_workaround ${BUILD_PFX}arm_neon.h # Calculate the default distribution name, based on the enabled features cf="" @@ -523,7 +529,7 @@ process_detect() { # here rather than at option parse time because the target auto-detect # magic happens after the command line has been parsed. case "${tgt_os}" in - linux|os2|darwin*|iphonesimulator*) + linux|os2|solaris|darwin*|iphonesimulator*) # Supported platforms ;; *) @@ -575,16 +581,30 @@ process_detect() { check_ld() { true } + check_lib() { + true + } fi check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}" check_ld < +#include +int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } +EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -603,22 +623,39 @@ process_toolchain() { check_add_cflags -Wcast-qual check_add_cflags -Wvla check_add_cflags -Wimplicit-function-declaration + check_add_cflags -Wmissing-declarations + check_add_cflags -Wmissing-prototypes check_add_cflags -Wuninitialized check_add_cflags -Wunused - # -Wextra has some tricky cases. Rather than fix them all now, get the - # flag for as many files as possible and fix the remaining issues - # piecemeal. - # https://bugs.chromium.org/p/webm/issues/detail?id=1069 check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with - # -Wundef so add it explicitly to CFLAGS only. + # these flags so add them explicitly to CFLAGS only. check_cflags -Wundef && add_cflags_only -Wundef + check_cflags -Wframe-larger-than=52000 && \ + add_cflags_only -Wframe-larger-than=52000 if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi + # Enforce c89 for c files. Don't be too strict about it though. Allow + # gnu extensions like "//" for comments. + check_cflags -std=gnu89 && add_cflags_only -std=gnu89 # Avoid this warning for third_party C++ sources. Some reorganization # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if enabled arm; then + check_add_cxxflags -Wno-psabi + fi + + # disable some warnings specific to libyuv. 
+ check_cxxflags -Wno-missing-declarations \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" + check_cxxflags -Wno-missing-prototypes \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-unused-parameter \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi if enabled icc; then @@ -689,7 +726,7 @@ process_toolchain() { soft_enable libyuv ;; *-android-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; @@ -698,30 +735,23 @@ process_toolchain() { # x86 targets. ;; *-iphonesimulator-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv ;; *-win*) # Some mingw toolchains don't have pthread available by default. # Treat these more like visual studio where threading in gtest # would be disabled for the same reason. - check_cxx "$@" < $@ + @echo "ENABLED_SECTIONS += samples" >> $@ diff --git a/libs/libvpx/vpx/svc_context.h b/libs/libvpx/examples/svc_context.h similarity index 83% rename from libs/libvpx/vpx/svc_context.h rename to libs/libvpx/examples/svc_context.h index 462785075c..c5779ce8a9 100644 --- a/libs/libvpx/vpx/svc_context.h +++ b/libs/libvpx/examples/svc_context.h @@ -13,11 +13,11 @@ * spatial SVC frame */ -#ifndef VPX_SVC_CONTEXT_H_ -#define VPX_SVC_CONTEXT_H_ +#ifndef VPX_EXAMPLES_SVC_CONTEXT_H_ +#define VPX_EXAMPLES_SVC_CONTEXT_H_ -#include "./vp8cx.h" -#include "./vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" #ifdef __cplusplus extern "C" { @@ -35,10 +35,8 @@ typedef struct { int temporal_layers; // number of temporal layers int temporal_layering_mode; SVC_LOG_LEVEL log_level; // amount of information to display - int log_print; // when set, printf log messages instead of returning the - // message with svc_get_message - int output_rc_stat; // for outputting rc stats - int speed; // speed setting for codec + int output_rc_stat; // for outputting rc stats + int speed; // speed setting for codec int threads; int aqmode; // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on. 
// private storage for vpx_svc_encode @@ -71,7 +69,6 @@ typedef struct SvcInternal { int layer; int use_multiple_frame_contexts; - char message_buffer[2048]; vpx_codec_ctx_t *codec_ctx; } SvcInternal_t; @@ -106,15 +103,10 @@ void vpx_svc_release(SvcContext *svc_ctx); /** * dump accumulated statistics and reset accumulated values */ -const char *vpx_svc_dump_statistics(SvcContext *svc_ctx); - -/** - * get status message from previous encode - */ -const char *vpx_svc_get_message(const SvcContext *svc_ctx); +void vpx_svc_dump_statistics(SvcContext *svc_ctx); #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_SVC_CONTEXT_H_ +#endif // VPX_EXAMPLES_SVC_CONTEXT_H_ diff --git a/libs/libvpx/vpx/src/svc_encodeframe.c b/libs/libvpx/examples/svc_encodeframe.c similarity index 85% rename from libs/libvpx/vpx/src/svc_encodeframe.c rename to libs/libvpx/examples/svc_encodeframe.c index f633600c79..a73ee8ed66 100644 --- a/libs/libvpx/vpx/src/svc_encodeframe.c +++ b/libs/libvpx/examples/svc_encodeframe.c @@ -22,7 +22,7 @@ #include #define VPX_DISABLE_CTRL_TYPECHECKS 1 #include "./vpx_config.h" -#include "vpx/svc_context.h" +#include "./svc_context.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" @@ -95,17 +95,11 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) { return (const SvcInternal_t *)svc_ctx->internal; } -static void svc_log_reset(SvcContext *svc_ctx) { - SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal; - si->message_buffer[0] = '\0'; -} - static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, ...) { char buf[512]; int retval = 0; va_list ap; - SvcInternal_t *const si = get_svc_internal(svc_ctx); if (level > svc_ctx->log_level) { return retval; @@ -115,16 +109,8 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, retval = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); - if (svc_ctx->log_print) { - printf("%s", buf); - } else { - strncat(si->message_buffer, buf, - sizeof(si->message_buffer) - strlen(si->message_buffer) - 1); - } + printf("%s", buf); - if (level == SVC_LOG_ERROR) { - si->codec_ctx->err_detail = si->message_buffer; - } return retval; } @@ -169,6 +155,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, return VPX_CODEC_INVALID_PARAM; input_string = strdup(input); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; token = strtok_r(input_string, delim, &save_ptr); for (i = 0; i < num_layers; ++i) { if (token != NULL) { @@ -208,6 +195,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { if (options == NULL) return VPX_CODEC_OK; input_string = strdup(options); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; // parse option name option_name = strtok_r(input_string, "=", &input_ptr); @@ -294,8 +282,8 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +static vpx_codec_err_t assign_layer_bitrates( + const SvcContext *svc_ctx, vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; @@ -471,8 +459,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_log(svc_ctx, SVC_LOG_ERROR, "spatial layers * temporal layers exceeds the maximum number of " "allowed layers of %d\n", - svc_ctx->spatial_layers * 
svc_ctx->temporal_layers, - (int)VPX_MAX_LAYERS); + svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } res = assign_layer_bitrates(svc_ctx, enc_cfg); @@ -485,11 +472,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return VPX_CODEC_INVALID_PARAM; } -#if CONFIG_SPATIAL_SVC - for (i = 0; i < svc_ctx->spatial_layers; ++i) - enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; -#endif - if (svc_ctx->temporal_layers > 1) { int i; for (i = 0; i < svc_ctx->temporal_layers; ++i) { @@ -514,7 +496,17 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, enc_cfg->rc_buf_initial_sz = 500; enc_cfg->rc_buf_optimal_sz = 600; enc_cfg->rc_buf_sz = 1000; - enc_cfg->rc_dropframe_thresh = 0; + } + + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + i = sl * svc_ctx->temporal_layers + tl; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer; + si->svc_params.min_quantizers[i] = enc_cfg->rc_min_quantizer; + } + } } if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) @@ -548,8 +540,6 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return VPX_CODEC_INVALID_PARAM; } - svc_log_reset(svc_ctx); - res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0, deadline); if (res != VPX_CODEC_OK) { @@ -559,56 +549,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { - int i; - for (i = 0; i < svc_ctx->spatial_layers; ++i) { - int j; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].psnr[0], - cx_pkt->data.layer_psnr[i].psnr[1], - cx_pkt->data.layer_psnr[i].psnr[2], - cx_pkt->data.layer_psnr[i].psnr[3]); - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].sse[0], - cx_pkt->data.layer_psnr[i].sse[1], - cx_pkt->data.layer_psnr[i].sse[2], - cx_pkt->data.layer_psnr[i].sse[3]); - - for (j = 0; j < COMPONENTS; ++j) { - si->psnr_sum[i][j] += cx_pkt->data.layer_psnr[i].psnr[j]; - si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j]; - } - } - ++si->psnr_pkt_received; - break; - } - case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: { - int i; - for (i = 0; i < svc_ctx->spatial_layers; ++i) - si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; - break; - } -#endif case VPX_CODEC_PSNR_PKT: { -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - int j; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, 0, cx_pkt->data.layer_psnr[0].psnr[0], - cx_pkt->data.layer_psnr[0].psnr[1], - cx_pkt->data.layer_psnr[0].psnr[2], - cx_pkt->data.layer_psnr[0].psnr[3]); - for (j = 0; j < COMPONENTS; ++j) { - si->psnr_sum[0][j] += cx_pkt->data.layer_psnr[0].psnr[j]; - si->sse_sum[0][j] += cx_pkt->data.layer_psnr[0].sse[j]; - } -#endif } ++si->psnr_pkt_received; break; @@ -619,19 +560,13 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return 
VPX_CODEC_OK; } -const char *vpx_svc_get_message(const SvcContext *svc_ctx) { - const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - return si->message_buffer; -} - static double calc_psnr(double d) { if (d == 0) return 100; return -10.0 * log(d) / log(10.0); } // dump accumulated statistics and reset accumulated values -const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { +void vpx_svc_dump_statistics(SvcContext *svc_ctx) { int number_of_frames; int i, j; uint32_t bytes_total = 0; @@ -641,21 +576,19 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { double y_scale; SvcInternal_t *const si = get_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - - svc_log_reset(svc_ctx); + if (svc_ctx == NULL || si == NULL) return; number_of_frames = si->psnr_pkt_received; - if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx); + if (number_of_frames <= 0) return; svc_log(svc_ctx, SVC_LOG_INFO, "\n"); for (i = 0; i < svc_ctx->spatial_layers; ++i) { svc_log(svc_ctx, SVC_LOG_INFO, "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n", - i, (double)si->psnr_sum[i][0] / number_of_frames, - (double)si->psnr_sum[i][1] / number_of_frames, - (double)si->psnr_sum[i][2] / number_of_frames, - (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); + i, si->psnr_sum[i][0] / number_of_frames, + si->psnr_sum[i][1] / number_of_frames, + si->psnr_sum[i][2] / number_of_frames, + si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); // the following psnr calculation is deduced from ffmpeg.c#print_report y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames; scale[1] = y_scale; @@ -686,7 +619,6 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { si->psnr_pkt_received = 0; svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total); - return vpx_svc_get_message(svc_ctx); } void vpx_svc_release(SvcContext *svc_ctx) { diff --git a/libs/libvpx/examples/vp8_multi_resolution_encoder.c b/libs/libvpx/examples/vp8_multi_resolution_encoder.c index b14b1ff397..e72f8a0197 100644 --- a/libs/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/libs/libvpx/examples/vp8_multi_resolution_encoder.c @@ -61,7 +61,7 @@ void usage_exit(void) { exit(EXIT_FAILURE); } int (*read_frame_p)(FILE *f, vpx_image_t *img); -static int read_frame(FILE *f, vpx_image_t *img) { +static int mulres_read_frame(FILE *f, vpx_image_t *img) { size_t nbytes, to_read; int res = 1; @@ -75,7 +75,7 @@ static int read_frame(FILE *f, vpx_image_t *img) { return res; } -static int read_frame_by_row(FILE *f, vpx_image_t *img) { +static int mulres_read_frame_by_row(FILE *f, vpx_image_t *img) { size_t nbytes, to_read; int res = 1; int plane; @@ -471,9 +471,9 @@ int main(int argc, char **argv) { die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) - read_frame_p = read_frame; + read_frame_p = mulres_read_frame; else - read_frame_p = read_frame_by_row; + read_frame_p = mulres_read_frame_by_row; for (i = 0; i < NUM_ENCODERS; i++) if (outfile[i]) write_ivf_file_header(outfile[i], &cfg[i], 0); diff --git a/libs/libvpx/examples/vp9_spatial_svc_encoder.c b/libs/libvpx/examples/vp9_spatial_svc_encoder.c index 0987cbfb85..b987989a86 100644 --- a/libs/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libs/libvpx/examples/vp9_spatial_svc_encoder.c @@ -25,13 +25,19 @@ #include "../video_writer.h" #include "../vpx_ports/vpx_timer.h" -#include "vpx/svc_context.h" +#include 
"./svc_context.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "../vpxstats.h" #include "vp9/encoder/vp9_encoder.h" +#include "./y4minput.h" + #define OUTPUT_RC_STATS 1 +#define SIMULCAST_MODE 0 + +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); static const arg_def_t skip_frames_arg = ARG_DEF("s", "skip-frames", 1, "input frames to skip"); static const arg_def_t frames_arg = @@ -86,6 +92,19 @@ static const arg_def_t aqmode_arg = ARG_DEF("aq", "aqmode", 1, "aq-mode off/on"); static const arg_def_t bitrates_arg = ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]"); +static const arg_def_t dropframe_thresh_arg = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const struct arg_enum_list tune_content_enum[] = { + { "default", VP9E_CONTENT_DEFAULT }, + { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, + { NULL, 0 } +}; + +static const arg_def_t tune_content_arg = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); +static const arg_def_t inter_layer_pred_arg = ARG_DEF( + NULL, "inter-layer-pred", 1, "0 - 3: On, Off, Key-frames, Constrained"); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -97,6 +116,7 @@ static const arg_def_t bitdepth_arg = ARG_DEF_ENUM( #endif // CONFIG_VP9_HIGHBITDEPTH static const arg_def_t *svc_args[] = { &frames_arg, + &outputfile, &width_arg, &height_arg, &timebase_arg, @@ -127,6 +147,9 @@ static const arg_def_t *svc_args[] = { &frames_arg, &speed_arg, &rc_end_usage_arg, &bitrates_arg, + &dropframe_thresh_arg, + &tune_content_arg, + &inter_layer_pred_arg, NULL }; static const uint32_t default_frames_to_skip = 0; @@ -145,7 +168,6 @@ static const int32_t default_speed = -1; // -1 means use library default. static const uint32_t default_threads = 0; // zero means use library default. 
typedef struct { - const char *input_filename; const char *output_filename; uint32_t frames_to_code; uint32_t frames_to_skip; @@ -153,12 +175,14 @@ typedef struct { stats_io_t rc_stats; int passes; int pass; + int tune_content; + int inter_layer_pred; } AppInput; static const char *exec_name; void usage_exit(void) { - fprintf(stderr, "Usage: %s input_filename output_filename\n", + fprintf(stderr, "Usage: %s input_filename -o output_filename\n", exec_name); fprintf(stderr, "Options:\n"); arg_show_usage(stderr, svc_args); @@ -217,6 +241,8 @@ static void parse_command_line(int argc, const char **argv_, if (arg_match(&arg, &frames_arg, argi)) { app_input->frames_to_code = arg_parse_uint(&arg); + } else if (arg_match(&arg, &outputfile, argi)) { + app_input->output_filename = arg.val; } else if (arg_match(&arg, &width_arg, argi)) { enc_cfg->g_w = arg_parse_uint(&arg); } else if (arg_match(&arg, &height_arg, argi)) { @@ -237,6 +263,9 @@ static void parse_command_line(int argc, const char **argv_, #endif } else if (arg_match(&arg, &speed_arg, argi)) { svc_ctx->speed = arg_parse_uint(&arg); + if (svc_ctx->speed > 9) { + warn("Mapping speed %d to speed 9.\n", svc_ctx->speed); + } } else if (arg_match(&arg, &aqmode_arg, argi)) { svc_ctx->aqmode = arg_parse_uint(&arg); } else if (arg_match(&arg, &threads_arg, argi)) { @@ -251,11 +280,15 @@ static void parse_command_line(int argc, const char **argv_, enc_cfg->kf_min_dist = arg_parse_uint(&arg); enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; } else if (arg_match(&arg, &scale_factors_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s scale-factors=%s", - string_options, arg.val); + strncat(string_options, " scale-factors=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &bitrates_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s bitrates=%s", - string_options, arg.val); + strncat(string_options, " bitrates=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &passes_arg, argi)) { passes = arg_parse_uint(&arg); if (passes < 1 || passes > 2) { @@ -269,11 +302,15 @@ static void parse_command_line(int argc, const char **argv_, } else if (arg_match(&arg, &fpf_name_arg, argi)) { fpf_file_name = arg.val; } else if (arg_match(&arg, &min_q_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s min-quantizers=%s", - string_options, arg.val); + strncat(string_options, " min-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &max_q_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s max-quantizers=%s", - string_options, arg.val); + strncat(string_options, " max-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &min_bitrate_arg, argi)) { min_bitrate = arg_parse_uint(&arg); } else if (arg_match(&arg, &max_bitrate_arg, argi)) { @@ -303,6 +340,12 @@ static void parse_command_line(int argc, const char **argv_, break; } #endif // CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { + enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg); + } else if 
(arg_match(&arg, &tune_content_arg, argi)) { + app_input->tune_content = arg_parse_uint(&arg); + } else if (arg_match(&arg, &inter_layer_pred_arg, argi)) { + app_input->inter_layer_pred = arg_parse_uint(&arg); } else { ++argj; } @@ -358,13 +401,18 @@ static void parse_command_line(int argc, const char **argv_, if (argi[0][0] == '-' && strlen(argi[0]) > 1) die("Error: Unrecognized option %s\n", *argi); - if (argv[0] == NULL || argv[1] == 0) { + if (argv[0] == NULL) { usage_exit(); } - app_input->input_filename = argv[0]; - app_input->output_filename = argv[1]; + app_input->input_ctx.filename = argv[0]; free(argv); + open_input_file(&app_input->input_ctx); + if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { + enc_cfg->g_w = app_input->input_ctx.width; + enc_cfg->g_h = app_input->input_ctx.height; + } + if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || enc_cfg->g_h % 2) die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h); @@ -429,8 +477,9 @@ static void set_rate_control_stats(struct RateControlStats *rc, rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; if (tl > 0) { rc->layer_pfb[layer] = - 1000.0 * (cfg->layer_target_bitrate[layer] - - cfg->layer_target_bitrate[layer - 1]) / + 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / @@ -502,14 +551,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc, printf("Average, rms-variance, and percent-fluct: %f %f %f \n", rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), perc_fluctuation); - if (frame_cnt != tot_num_frames) - die("Error: Number of input frames not equal to output encoded frames != " - "%d tot_num_frames = %d\n", - frame_cnt, tot_num_frames); + printf("Num of input, num of encoded (super) frames: %d %d \n", frame_cnt, + tot_num_frames); } -vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint64_t sizes[8], int *count) { +static vpx_codec_err_t parse_superframe_index(const uint8_t *data, + size_t data_sz, uint64_t sizes[8], + int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. If we have the marker but @@ -561,106 +609,386 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, // bypass/flexible mode. The pattern corresponds to the pattern // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in // non-flexible mode. 
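An aside before the bypass-mode helpers that the comment above introduces: parse_superframe_index decodes the standard VP9 superframe trailer, in which the final byte is a marker (top three bits 0b110) whose low bits give the frame count and the width of each size field. A worked sketch of the arithmetic, mirroring the helper rather than replacing it:

    const uint8_t marker = 0xc2;  /* example: (marker & 0xe0) == 0xc0 */
    const uint32_t frames = (marker & 0x7) + 1;      /* -> 3 frames */
    const uint32_t mag = ((marker >> 3) & 0x3) + 1;  /* -> 1 byte per size */
    const size_t index_sz = 2 + mag * frames;        /* -> 5-byte index */
    /* The chunk is a valid superframe only if the byte index_sz from the
     * end also equals the marker; otherwise the trailing 0xc0-pattern byte
     * is ordinary compressed data (the encoder appends a 0 byte for that
     * case, as the comment above notes). */
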
-void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, - int is_key_frame, - vpx_svc_ref_frame_config_t *ref_frame_config) { +static void set_frame_flags_bypass_mode_ex0( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + for (sl = 0; sl < num_spatial_layers; ++sl) { - if (!tl) { - if (!sl) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | - VP8_EFLAG_NO_UPD_ARF; - } else { - if (is_key_frame) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; - } else { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; - } - } - } else if (tl == 1) { - if (!sl) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | - VP8_EFLAG_NO_UPD_GF; - } else { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; - } - } + // Set the buffer idx. if (tl == 0) { ref_frame_config->lst_fb_idx[sl] = sl; - if (sl) - ref_frame_config->gld_fb_idx[sl] = sl - 1; - else + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { ref_frame_config->gld_fb_idx[sl] = 0; + } ref_frame_config->alt_fb_idx[sl] = 0; } else if (tl == 1) { ref_frame_config->lst_fb_idx[sl] = sl; ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; } + // Set the reference and update flags. + if (!tl) { + if (!sl) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + // Non-zero spatial. 
+ if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } + } + } } } +// Example pattern for 2 spatial layers and 2 temporal layers used in the +// bypass/flexible mode, except only 1 spatial layer when temporal_layer_id = 1. +static void set_frame_flags_bypass_mode_ex1( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + if (tl == 0) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[1] = 0; + ref_frame_config->gld_fb_idx[1] = 1; + } else { + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 0; + } + ref_frame_config->alt_fb_idx[1] = 0; + + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 0; + ref_frame_config->alt_fb_idx[0] = 0; + } + if (tl == 1) { + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 1; + ref_frame_config->alt_fb_idx[0] = 2; + + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 2; + ref_frame_config->alt_fb_idx[1] = 3; + } + // Set the reference and update flags. + if (tl == 0) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[0] = 1; + ref_frame_config->reference_golden[0] = 0; + ref_frame_config->reference_alt_ref[0] = 0; + ref_frame_config->update_buffer_slot[0] |= + 1 << ref_frame_config->lst_fb_idx[0]; + + if (is_key_frame) { + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->gld_fb_idx[1]; + } else { + // Non-zero spatiall layer. 
+ ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 1; + ref_frame_config->reference_alt_ref[1] = 1; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->lst_fb_idx[1]; + } + } + if (tl == 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + } +} + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE +static void test_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + const int frames_out, int *mismatch_seen) { + vpx_image_t enc_img, dec_img; + struct vp9_ref_frame ref_enc, ref_dec; + if (*mismatch_seen) return; + /* Get the internal reference frame */ + ref_enc.idx = 0; + ref_dec.idx = 0; + vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc); + enc_img = ref_enc.img; + vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec); + dec_img = ref_dec.img; +#if CONFIG_VP9_HIGHBITDEPTH + if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img); + } + if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img); + } + } +#endif + + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_VP9_HIGHBITDEPTH + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + decoder->err = 1; + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}\n", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = frames_out; + } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); +} +#endif + +#if OUTPUT_RC_STATS +static void svc_output_rc_stats( + vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *enc_cfg, + vpx_svc_layer_id_t *layer_id, const vpx_codec_cx_pkt_t *cx_pkt, + struct RateControlStats *rc, VpxVideoWriter **outfile, + const uint32_t frame_cnt, const double framerate) { + int num_layers_encoded = 0; + unsigned int sl, tl; + uint64_t sizes[8]; + uint64_t sizes_parsed[8]; + int count = 0; + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + vp9_zero(sizes); + vp9_zero(sizes_parsed); + vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id); + parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, + sizes_parsed, &count); + if (enc_cfg->ss_number_layers == 1) sizes[0] = cx_pkt->data.frame.sz; + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sizes[sl] = 0; + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + unsigned int sl2; + uint64_t tot_size = 0; +#if SIMULCAST_MODE + for (sl2 = 0; sl2 < sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + vpx_video_writer_write_frame(outfile[sl], + (uint8_t *)(cx_pkt->data.frame.buf) + tot_size, + (size_t)(sizes[sl]), cx_pkt->data.frame.pts); +#else + for (sl2 = 0; sl2 <= sl; 
++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + if (tot_size > 0) + vpx_video_writer_write_frame(outfile[sl], cx_pkt->data.frame.buf, + (size_t)(tot_size), cx_pkt->data.frame.pts); +#endif // SIMULCAST_MODE + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + for (tl = layer_id->temporal_layer_id; tl < enc_cfg->ts_number_layers; + ++tl) { + const int layer = sl * enc_cfg->ts_number_layers + tl; + ++rc->layer_tot_enc_frames[layer]; + rc->layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; + // Keep count of rate control stats per layer, for non-key + // frames. + if (tl == (unsigned int)layer_id->temporal_layer_id && + !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { + rc->layer_avg_frame_size[layer] += 8.0 * sizes[sl]; + rc->layer_avg_rate_mismatch[layer] += + fabs(8.0 * sizes[sl] - rc->layer_pfb[layer]) / + rc->layer_pfb[layer]; + ++rc->layer_enc_frames[layer]; + } + } + } + } + + // Update for short-time encoding bitrate states, for moving + // window of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + if (frame_cnt > (unsigned int)rc->window_size) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) + sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; + } + if (frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate / rc->window_size) * (sum_bitrate / rc->window_size); + } + } + + // Second shifted window. + if (frame_cnt > (unsigned int)(rc->window_size + rc->window_size / 2)) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; + } + + if (frame_cnt > (unsigned int)(2 * rc->window_size) && + frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate2 / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate2 / rc->window_size) * (sum_bitrate2 / rc->window_size); + } + } +} +#endif + int main(int argc, const char **argv) { AppInput app_input; VpxVideoWriter *writer = NULL; VpxVideoInfo info; - vpx_codec_ctx_t codec; + vpx_codec_ctx_t encoder; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; + vpx_svc_frame_drop_t svc_drop_frame; uint32_t i; uint32_t frame_cnt = 0; vpx_image_t raw; vpx_codec_err_t res; int pts = 0; /* PTS starts at 0 */ int frame_duration = 1; /* 1 timebase tick per frame */ - FILE *infile = NULL; int end_of_stream = 0; int frames_received = 0; #if OUTPUT_RC_STATS - VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; + VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL }; struct RateControlStats rc; vpx_svc_layer_id_t layer_id; vpx_svc_ref_frame_config_t ref_frame_config; - unsigned int sl, tl; - double sum_bitrate = 0.0; - double sum_bitrate2 = 0.0; + unsigned int sl; double framerate = 30.0; #endif struct vpx_usec_timer timer; int64_t cx_time = 0; +#if CONFIG_INTERNAL_STATS + FILE *f = fopen("opsnr.stt", "a"); +#endif +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + int mismatch_seen = 0; + vpx_codec_ctx_t decoder; +#endif memset(&svc_ctx, 0, sizeof(svc_ctx)); - svc_ctx.log_print = 1; + memset(&app_input, 0, sizeof(AppInput)); + memset(&info, 0, sizeof(VpxVideoInfo)); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&rc, 0, sizeof(struct RateControlStats)); exec_name = argv[0]; + + /* Setup default input stream 
settings */ + app_input.input_ctx.framerate.numerator = 30; + app_input.input_ctx.framerate.denominator = 1; + app_input.input_ctx.only_i420 = 1; + app_input.input_ctx.bit_depth = 0; + parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg); + // Y4M reader handles its own allocation. + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, - enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, - enc_cfg.g_w, enc_cfg.g_h, 32)) { - die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); - } + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, + enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } #else - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { - die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); - } + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } #endif // CONFIG_VP9_HIGHBITDEPTH - - if (!(infile = fopen(app_input.input_filename, "rb"))) - die("Failed to open %s for reading\n", app_input.input_filename); + } // Initialize codec - if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) != + if (vpx_svc_init(&svc_ctx, &encoder, vpx_codec_vp9_cx(), &enc_cfg) != VPX_CODEC_OK) die("Failed to initialize encoder\n"); +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_dec_init( + &decoder, get_vpx_decoder_by_name("vp9")->codec_interface(), NULL, 0)) + die("Failed to initialize decoder\n"); +#endif #if OUTPUT_RC_STATS + rc.window_count = 1; + rc.window_size = 15; // Silence a static analysis warning. + rc.avg_st_encoding_bitrate = 0.0; + rc.variance_st_encoding_bitrate = 0.0; if (svc_ctx.output_rc_stat) { set_rate_control_stats(&rc, &enc_cfg); framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num; @@ -668,6 +996,8 @@ int main(int argc, const char **argv) { #endif info.codec_fourcc = VP9_FOURCC; + info.frame_width = enc_cfg.g_w; + info.frame_height = enc_cfg.g_h; info.time_base.numerator = enc_cfg.g_timebase.num; info.time_base.denominator = enc_cfg.g_timebase.den; @@ -679,43 +1009,65 @@ int main(int argc, const char **argv) { die("Failed to open %s for writing\n", app_input.output_filename); } #if OUTPUT_RC_STATS - // For now, just write temporal layer streams. - // TODO(marpan): do spatial by re-writing superframe. + // Write out spatial layer stream. + // TODO(marpan/jianj): allow for writing each spatial and temporal stream. 
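A note on the decoder cross-check wired up above (CONFIG_VP9_DECODER with SIMULCAST_MODE off): each compressed superframe is also fed to the local decoder, after which test_decode compares the two codecs' reference frames. The per-packet pattern looks roughly like this (frame_buf and frame_sz stand in for the packet's buf and sz fields):

    /* Feed the encoded superframe to the mirror decoder... */
    if (vpx_codec_decode(&decoder, frame_buf, (unsigned int)frame_sz,
                         NULL, 0) != VPX_CODEC_OK)
      die("Failed to decode frame\n");
    /* ...then compare VP9_GET_REFERENCE images from encoder and decoder;
     * any pixel mismatch is reported with its Y/U/V coordinates. */
    test_decode(&encoder, &decoder, frames_received, &mismatch_seen);
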
if (svc_ctx.output_rc_stat) { - for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { char file_name[PATH_MAX]; - snprintf(file_name, sizeof(file_name), "%s_t%d.ivf", - app_input.output_filename, tl); - outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info); - if (!outfile[tl]) die("Failed to open %s for writing", file_name); + snprintf(file_name, sizeof(file_name), "%s_s%d.ivf", + app_input.output_filename, sl); + outfile[sl] = vpx_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[sl]) die("Failed to open %s for writing", file_name); } } #endif // skip initial frames - for (i = 0; i < app_input.frames_to_skip; ++i) vpx_img_read(&raw, infile); + for (i = 0; i < app_input.frames_to_skip; ++i) + read_frame(&app_input.input_ctx, &raw); if (svc_ctx.speed != -1) - vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); + vpx_codec_control(&encoder, VP8E_SET_CPUUSED, svc_ctx.speed); if (svc_ctx.threads) { - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); + vpx_codec_control(&encoder, VP9E_SET_TILE_COLUMNS, + get_msb(svc_ctx.threads)); if (svc_ctx.threads > 1) - vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 1); else - vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 0); } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) - vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&encoder, VP9E_SET_AQ_MODE, 3); if (svc_ctx.speed >= 5) - vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); - vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + vpx_codec_control(&encoder, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&encoder, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + + vpx_codec_control(&encoder, VP9E_SET_SVC_INTER_LAYER_PRED, + app_input.inter_layer_pred); + + vpx_codec_control(&encoder, VP9E_SET_NOISE_SENSITIVITY, 0); + + vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content); + + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; + svc_drop_frame.max_consec_drop = INT_MAX; + vpx_codec_control(&encoder, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); // Encode frames while (!end_of_stream) { vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *cx_pkt; - if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) { + // Example patterns for bypass/flexible mode: + // example_pattern = 0: 2 temporal layers, and spatial_layers = 1,2,3. + // Exactly matches the fixed SVC patterns. example_pattern = 1: 2 spatial + // and 2 temporal layers, where SL0 has only TL0 and SL1 has both TL0 and + // TL1. This example uses the extended API. + int example_pattern = 0; + if (frame_cnt >= app_input.frames_to_code || + !read_frame(&app_input.input_ctx, &raw)) { // We need one extra vpx_svc_encode call at end of stream to flush the // encoder and get the remaining data end_of_stream = 1; @@ -723,140 +1075,97 @@ int main(int argc, const char **argv) { // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates) // and the buffer indices for each spatial layer of the current - // (super)frame to be encoded. The temporal layer_id for the current frame - // also needs to be set. + // (super)frame to be encoded. The spatial and temporal layer_id for the + // current frame also need to be set.
// TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS" // mode to "VP9E_LAYERING_MODE_BYPASS". if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { layer_id.spatial_layer_id = 0; // Example for 2 temporal layers. - if (frame_cnt % 2 == 0) + if (frame_cnt % 2 == 0) { layer_id.temporal_layer_id = 0; - else + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 0; + } else { layer_id.temporal_layer_id = 1; - // Note that we only set the temporal layer_id, since we are calling - // the encode for the whole superframe. The encoder will internally loop - // over all the spatial layers for the current superframe. - vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); - set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id, - svc_ctx.spatial_layers, frame_cnt == 0, - &ref_frame_config); - vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG, + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 1; + } + if (example_pattern == 1) { + // example_pattern 1 is hard-coded for 2 spatial and 2 temporal layers. + assert(svc_ctx.spatial_layers == 2); + assert(svc_ctx.temporal_layers == 2); + if (frame_cnt % 2 == 0) { + // Spatial layers 0 and 1 are encoded. + layer_id.temporal_layer_id_per_spatial[0] = 0; + layer_id.temporal_layer_id_per_spatial[1] = 0; + layer_id.spatial_layer_id = 0; + } else { + // Only spatial layer 1 is encoded here. + layer_id.temporal_layer_id_per_spatial[1] = 1; + layer_id.spatial_layer_id = 1; + } + } + vpx_codec_control(&encoder, VP9E_SET_SVC_LAYER_ID, &layer_id); + // TODO(jianj): Fix the parameter passing for "is_key_frame" in + // set_frame_flags_bypass_mode() for the case of periodic key frames. + if (example_pattern == 0) { + set_frame_flags_bypass_mode_ex0(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } else if (example_pattern == 1) { + set_frame_flags_bypass_mode_ex1(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } + ref_frame_config.duration[0] = frame_duration * 1; + ref_frame_config.duration[1] = frame_duration * 1; + + vpx_codec_control(&encoder, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); // Keep track of input frames, to account for frame drops in rate control // stats/metrics. - for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id]; } + } else { + // For the fixed pattern SVC, the temporal layer is given by the + // superframe count. unsigned int tl = 0; + if (enc_cfg.ts_number_layers == 2) + tl = (frame_cnt % 2 != 0); + else if (enc_cfg.ts_number_layers == 3) { + if (frame_cnt % 2 != 0) tl = 2; + if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) tl = 1; + } + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) + ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + tl]; } vpx_usec_timer_start(&timer); res = vpx_svc_encode( - &svc_ctx, &codec, (end_of_stream ? NULL : &raw), pts, frame_duration, + &svc_ctx, &encoder, (end_of_stream ? NULL : &raw), pts, frame_duration, svc_ctx.speed >= 5 ?
VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY); vpx_usec_timer_mark(&timer); cx_time += vpx_usec_timer_elapsed(&timer); - printf("%s", vpx_svc_get_message(&svc_ctx)); fflush(stdout); if (res != VPX_CODEC_OK) { - die_codec(&codec, "Failed to encode frame"); + die_codec(&encoder, "Failed to encode frame"); } - while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) { + while ((cx_pkt = vpx_codec_get_cx_data(&encoder, &iter)) != NULL) { switch (cx_pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { -#if OUTPUT_RC_STATS - uint64_t sizes[8]; - int count = 0; -#endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { - vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); - parse_superframe_index(cx_pkt->data.frame.buf, - cx_pkt->data.frame.sz, sizes, &count); - if (enc_cfg.ss_number_layers == 1) - sizes[0] = cx_pkt->data.frame.sz; - // Note computing input_layer_frames here won't account for frame - // drops in rate control stats. - // TODO(marpan): Fix this for non-bypass mode so we can get stats - // for dropped frames. - if (svc_ctx.temporal_layering_mode != - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + - layer_id.temporal_layer_id]; - } - } - for (tl = layer_id.temporal_layer_id; - tl < enc_cfg.ts_number_layers; ++tl) { - vpx_video_writer_write_frame( - outfile[tl], cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, - cx_pkt->data.frame.pts); - } - - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - for (tl = layer_id.temporal_layer_id; - tl < enc_cfg.ts_number_layers; ++tl) { - const int layer = sl * enc_cfg.ts_number_layers + tl; - ++rc.layer_tot_enc_frames[layer]; - rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; - // Keep count of rate control stats per layer, for non-key - // frames. - if (tl == (unsigned int)layer_id.temporal_layer_id && - !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { - rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl]; - rc.layer_avg_rate_mismatch[layer] += - fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) / - rc.layer_pfb[layer]; - ++rc.layer_enc_frames[layer]; - } - } - } - - // Update for short-time encoding bitrate states, for moving - // window of size rc->window, shifted by rc->window / 2. - // Ignore first window segment, due to key frame. - if (frame_cnt > (unsigned int)rc.window_size) { - tl = layer_id.temporal_layer_id; - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; - } - if (frame_cnt % rc.window_size == 0) { - rc.window_count += 1; - rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; - rc.variance_st_encoding_bitrate += - (sum_bitrate / rc.window_size) * - (sum_bitrate / rc.window_size); - sum_bitrate = 0.0; - } - } - - // Second shifted window. 
- if (frame_cnt > - (unsigned int)(rc.window_size + rc.window_size / 2)) { - tl = layer_id.temporal_layer_id; - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; - } - - if (frame_cnt > (unsigned int)(2 * rc.window_size) && - frame_cnt % rc.window_size == 0) { - rc.window_count += 1; - rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; - rc.variance_st_encoding_bitrate += - (sum_bitrate2 / rc.window_size) * - (sum_bitrate2 / rc.window_size); - sum_bitrate2 = 0.0; - } - } + svc_output_rc_stats(&encoder, &enc_cfg, &layer_id, cx_pkt, &rc, + outfile, frame_cnt, framerate); } #endif } @@ -868,6 +1177,11 @@ int main(int argc, const char **argv) { if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; ++frames_received; +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf, + (unsigned int)cx_pkt->data.frame.sz, NULL, 0)) + die_codec(&decoder, "Failed to decode frame."); +#endif break; } case VPX_CODEC_STATS_PKT: { @@ -877,6 +1191,19 @@ int main(int argc, const char **argv) { } default: { break; } } + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id); + // Don't look for mismatch on top spatial and top temporal layers as they + // are non reference frames. + if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 && + cx_pkt->data.frame + .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) { + test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen); + } +#endif } if (!end_of_stream) { @@ -885,41 +1212,45 @@ int main(int argc, const char **argv) { } } - // Compensate for the extra frame count for the bypass mode. 
- if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - const int layer = - sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id; - --rc.layer_input_frames[layer]; - } - } - printf("Processed %d frames\n", frame_cnt); - fclose(infile); + + close_input_file(&app_input.input_ctx); + #if OUTPUT_RC_STATS if (svc_ctx.output_rc_stat) { printout_rate_control_summary(&rc, &enc_cfg, frame_cnt); printf("\n"); } #endif - if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + if (vpx_codec_destroy(&encoder)) + die_codec(&encoder, "Failed to destroy codec"); if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1); if (writer) { vpx_video_writer_close(writer); } #if OUTPUT_RC_STATS if (svc_ctx.output_rc_stat) { - for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { - vpx_video_writer_close(outfile[tl]); + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + vpx_video_writer_close(outfile[sl]); } } +#endif +#if CONFIG_INTERNAL_STATS + if (mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); #endif printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), 1000000 * (double)frame_cnt / (double)cx_time); - vpx_img_free(&raw); + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } // display average size, psnr - printf("%s", vpx_svc_dump_statistics(&svc_ctx)); + vpx_svc_dump_statistics(&svc_ctx); vpx_svc_release(&svc_ctx); return EXIT_SUCCESS; } diff --git a/libs/libvpx/examples/vp9cx_set_ref.c b/libs/libvpx/examples/vp9cx_set_ref.c index 3472689db2..911ad38630 100644 --- a/libs/libvpx/examples/vp9cx_set_ref.c +++ b/libs/libvpx/examples/vp9cx_set_ref.c @@ -68,128 +68,6 @@ void usage_exit() { exit(EXIT_FAILURE); } -static int compare_img(const vpx_image_t *const img1, - const vpx_image_t *const img2) { - uint32_t l_w = img1->d_w; - uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - uint32_t i; - int match = 1; - - match &= (img1->fmt == img2->fmt); - match &= (img1->d_w == img2->d_w); - match &= (img1->d_h == img2->d_h); - - for (i = 0; i < img1->d_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - l_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - c_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - c_w) == 0); - - return match; -} - -#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) -static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != - *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l); - yloc[3] = *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l) != - *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l); - uloc[3] = *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l); - match = 0; - break; - } - } - } - } - } - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l) != - *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l); - vloc[3] = *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l); - match = 0; - break; - } - } - } - } - } -} - static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, unsigned int frame_out, int *mismatch_seen) { vpx_image_t enc_img, dec_img; diff --git a/libs/libvpx/examples/vpx_dec_fuzzer.cc b/libs/libvpx/examples/vpx_dec_fuzzer.cc new file mode 100644 index 0000000000..d55fe1571b --- /dev/null +++ b/libs/libvpx/examples/vpx_dec_fuzzer.cc @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Fuzzer for libvpx decoders + * ========================== + * Requirements + * -------------- + * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker + * option. + + * Steps to build + * -------------- + * Clone libvpx repository + $git clone https://chromium.googlesource.com/webm/libvpx + + * Create a directory in parallel to libvpx and change directory + $mkdir vpx_dec_fuzzer + $cd vpx_dec_fuzzer/ + + * Enable sanitizers (Supported: address integer memory thread undefined) + $source ../libvpx/tools/set_analyzer_env.sh address + + * Configure libvpx. + * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid + * out-of-memory errors when running the generated fuzzer binary + $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \ + --extra-cflags="-fsanitize=fuzzer-no-link \ + -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \ + --disable-webm-io --enable-debug --disable-vp8-encoder \ + --disable-vp9-encoder --disable-examples + + * Build libvpx + $make -j32 + + * Build vp9 fuzzer + $ $CXX $CXXFLAGS -std=c++11 -DDECODER=vp9 \ + -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \ + ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \ + ./libvpx.a -Wl,--end-group + + * DECODER should be defined as vp9 or vp8 to enable vp9/vp8 + * + * Create a corpus directory and copy some ivf files there. + * Based on which codec (vp8/vp9) is being tested, it is recommended to + * have corresponding ivf files in the corpus directory + * An empty corpus directory is also acceptable, though not recommended + $mkdir CORPUS && cp some-files CORPUS + + * Run fuzzing: + $./vpx_dec_fuzzer_vp9 CORPUS + + * References: + * http://llvm.org/docs/LibFuzzer.html + * https://github.com/google/oss-fuzz + */ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <memory> + +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" +#include "vpx_ports/mem_ops.h" + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define VPXD_INTERFACE(name) VPXD_INTERFACE_(name) +#define VPXD_INTERFACE_(name) vpx_codec_##name##_dx() + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size <= IVF_FILE_HDR_SZ) { + return 0; + } + + vpx_codec_ctx_t codec; + // Set thread count in the range [1, 64].
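An aside on the line that follows this comment: the fuzzer derives a bounded decoder thread count from the first byte past the IVF file header. Masking with 0x3f keeps the low six bits (values 0..63), and the + 1 shifts that range to [1, 64]. The same trick in isolation (the helper name is ours, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Map one fuzz input byte to a bounded parameter, here a thread count in
 * [1, 64], mirroring the line below. */
static unsigned int byte_to_threads(uint8_t b) {
  return (b & 0x3f) + 1u; /* low 6 bits give 0..63; +1 shifts to 1..64 */
}

int main(void) {
  printf("%u %u %u\n", byte_to_threads(0x00), byte_to_threads(0x3f),
         byte_to_threads(0xff)); /* prints "1 64 64" */
  return 0;
}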
+ const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + vpx_codec_dec_cfg_t cfg = { threads, 0, 0 }; + if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, 0)) { + return 0; + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + const vpx_codec_err_t err = + vpx_codec_decode(&codec, data, frame_size, nullptr, 0); + static_cast<void>(err); + vpx_codec_iter_t iter = nullptr; + vpx_image_t *img = nullptr; + while ((img = vpx_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } + vpx_codec_destroy(&codec); + return 0; +} diff --git a/libs/libvpx/examples/vpx_temporal_svc_encoder.c b/libs/libvpx/examples/vpx_temporal_svc_encoder.c index f5736ea45d..6afbee83d2 100644 --- a/libs/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/libs/libvpx/examples/vpx_temporal_svc_encoder.c @@ -19,14 +19,18 @@ #include #include "./vpx_config.h" +#include "./y4minput.h" #include "../vpx_ports/vpx_timer.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" +#include "vpx_ports/bitops.h" #include "../tools_common.h" #include "../video_writer.h" -#define VP8_ROI_MAP 0 +#define ROI_MAP 0 + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)); static const char *exec_name; @@ -89,19 +93,21 @@ struct RateControlMetrics { // in the stream. static void set_rate_control_metrics(struct RateControlMetrics *rc, vpx_codec_enc_cfg_t *cfg) { - unsigned int i = 0; + int i = 0; // Set the layer (cumulative) framerate and the target layer (non-cumulative) // per-frame-bandwidth, for the rate control encoding stats below. const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; + const int ts_number_layers = cfg->ts_number_layers; rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0]; rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] / rc->layer_framerate[0]; - for (i = 0; i < cfg->ts_number_layers; ++i) { + for (i = 0; i < ts_number_layers; ++i) { if (i > 0) { rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; - rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] - - rc->layer_target_bitrate[i - 1]) / - (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); } rc->layer_input_frames[i] = 0; rc->layer_enc_frames[i] = 0; @@ -114,6 +120,9 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc, rc->window_size = 15; rc->avg_st_encoding_bitrate = 0.0; rc->variance_st_encoding_bitrate = 0.0; + // Target bandwidth for the whole stream. + // Set to layer_target_bitrate for highest layer (total bitrate). + cfg->rc_target_bitrate = rc->layer_target_bitrate[ts_number_layers - 1]; } static void printout_rate_control_summary(struct RateControlMetrics *rc, @@ -164,38 +173,60 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! \n"); } -#if VP8_ROI_MAP -static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { +#if ROI_MAP +static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, + vpx_roi_map_t *roi) { unsigned int i, j; - memset(roi, 0, sizeof(*roi)); + int block_size = 0; + uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ?
1 : 0; + uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0; + if (!is_vp8 && !is_vp9) { + die("unsupported codec."); + } + zero(*roi); + + block_size = is_vp9 && !is_vp8 ? 8 : 16; // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for // segment is 16x16 for vp8, 8x8 for vp9. - roi->rows = (cfg->g_h + 15) / 16; - roi->cols = (cfg->g_w + 15) / 16; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; // Applies delta QP on the segment blocks, varies from -63 to 63. // Setting to negative means lower QP (better quality). // Below we set delta_q to the extreme (-63) to show strong effect. - roi->delta_q[0] = 0; + // VP8 uses the first 4 segments. VP9 uses all 8 segments. + zero(roi->delta_q); roi->delta_q[1] = -63; - roi->delta_q[2] = 0; - roi->delta_q[3] = 0; // Applies delta loopfilter strength on the segment blocks, varies from -63 to - // 63. Setting to positive means stronger loopfilter. - roi->delta_lf[0] = 0; - roi->delta_lf[1] = 0; - roi->delta_lf[2] = 0; - roi->delta_lf[3] = 0; + // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 + // segments. VP9 uses all 8 segments. + zero(roi->delta_lf); - // Applies skip encoding threshold on the segment blocks, varies from 0 to - // UINT_MAX. Larger value means more skipping of encoding is possible. - // This skip threshold only applies on delta frames. - roi->static_threshold[0] = 0; - roi->static_threshold[1] = 0; - roi->static_threshold[2] = 0; - roi->static_threshold[3] = 0; + if (is_vp8) { + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + zero(roi->static_threshold); + } + + if (is_vp9) { + // Apply skip segment. Setting to 1 means this block will be copied from + // the previous frame. + zero(roi->skip); + } + + if (is_vp9) { + // Apply ref frame segment. + // -1 : Do not apply this segment. + // 0 : Force using intra. + // 1 : Force using last. + // 2 : Force using golden. + // 3 : Force using altref; not used in non-rd pickmode for 0 lag. + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + } // Use 2 states: 1 is center square, 0 is the rest. roi->roi_map = @@ -563,12 +594,12 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VP8_ROI_MAP +#if ROI_MAP vpx_roi_map_t roi; #endif - vpx_svc_layer_id_t layer_id = { 0, 0 }; + vpx_svc_layer_id_t layer_id; const VpxInterface *encoder = NULL; - FILE *infile = NULL; + struct VpxInputContext input_ctx; struct RateControlMetrics rc; int64_t cx_time = 0; const int min_args_base = 13; @@ -583,6 +614,15 @@ int main(int argc, char **argv) { double sum_bitrate2 = 0.0; double framerate = 30.0; + zero(rc.layer_target_bitrate); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&input_ctx, 0, sizeof(input_ctx)); + /* Setup default input stream settings */ + input_ctx.framerate.numerator = 30; + input_ctx.framerate.denominator = 1; + input_ctx.only_i420 = 1; + input_ctx.bit_depth = 0; + exec_name = argv[0]; // Check usage and arguments.
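Before the argument checks resume below, a sketch of the "center square" ROI layout described above: segment 1 covers the middle half of the grid in each dimension, segment 0 the rest. This is a hypothetical helper, not part of the patch; the patch itself allocates and fills roi->roi_map inside set_roi_map():

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical sketch of the "center square" ROI map. */
static unsigned char *make_center_roi(unsigned int rows, unsigned int cols) {
  unsigned char *map = (unsigned char *)calloc((size_t)rows * cols, 1);
  unsigned int r, c;
  if (map == NULL) return NULL;
  for (r = rows / 4; r < (3 * rows) / 4; ++r)
    for (c = cols / 4; c < (3 * cols) / 4; ++c)
      map[r * cols + c] = 1; /* segment 1: receives delta_q = -63 above */
  return map;
}

int main(void) {
  unsigned int r, c;
  unsigned char *map = make_center_roi(8, 8);
  if (map == NULL) return 1;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) printf("%d", map[r * 8 + c]);
    printf("\n");
  }
  free(map);
  return 0;
}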
if (argc < min_args) { @@ -621,6 +661,9 @@ int main(int argc, char **argv) { die("Invalid number of arguments"); } + input_ctx.filename = argv[1]; + open_input_file(&input_ctx); + #if CONFIG_VP9_HIGHBITDEPTH switch (strtol(argv[argc - 1], NULL, 0)) { case 8: @@ -637,14 +680,22 @@ int main(int argc, char **argv) { break; default: die("Invalid bit depth (8, 10, 12) %s", argv[argc - 1]); } - if (!vpx_img_alloc( - &raw, bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, - width, height, 32)) { - die("Failed to allocate image", width, height); + + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc( + &raw, + bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, + width, height, 32)) { + die("Failed to allocate image", width, height); + } } #else - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { - die("Failed to allocate image", width, height); + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { + die("Failed to allocate image", width, height); + } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -675,6 +726,9 @@ int main(int argc, char **argv) { if (speed < 0) { die("Invalid speed setting: must be positive"); } + if (strncmp(encoder->name, "vp9", 3) == 0 && speed > 9) { + warn("Mapping speed %d to speed 9.\n", speed); + } for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { @@ -722,13 +776,15 @@ int main(int argc, char **argv) { set_rate_control_metrics(&rc, &cfg); - // Target bandwidth for the whole stream. - // Set to layer_target_bitrate for highest layer (total bitrate). - cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1]; - - // Open input file. 
- if (!(infile = fopen(argv[1], "rb"))) { - die("Failed to open %s for reading", argv[1]); + if (input_ctx.file_type == FILE_TYPE_Y4M) { + if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) { + die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h); + } + if (input_ctx.framerate.numerator != cfg.g_timebase.den || + input_ctx.framerate.denominator != cfg.g_timebase.num) { + die("Incorrect framerate: numerator %d denominator %d", + cfg.g_timebase.num, cfg.g_timebase.den); + } } framerate = cfg.g_timebase.den / cfg.g_timebase.num; @@ -766,8 +822,8 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); -#if VP8_ROI_MAP - vp8_set_roi_map(&cfg, &roi); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) die_codec(&codec, "Failed to set ROI map"); #endif @@ -783,7 +839,13 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); + vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0); +#endif // TODO(marpan/jianj): There is an issue with row-mt for low resolutions at // high speed settings, disable its use for those cases for now. if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) @@ -822,6 +884,7 @@ int main(int argc, char **argv) { layer_id.spatial_layer_id = 0; layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; + layer_id.temporal_layer_id_per_spatial[0] = layer_id.temporal_layer_id; if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); } else if (strncmp(encoder->name, "vp8", 3) == 0) { @@ -830,7 +893,7 @@ int main(int argc, char **argv) { } flags = layer_flags[frame_cnt % flag_periodicity]; if (layering_mode == 0) flags = 0; - frame_avail = vpx_img_read(&raw, infile); + frame_avail = read_frame(&input_ctx, &raw); if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id]; vpx_usec_timer_start(&timer); if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags, @@ -898,7 +961,7 @@ int main(int argc, char **argv) { ++frame_cnt; pts += frame_duration; } - fclose(infile); + close_input_file(&input_ctx); printout_rate_control_summary(&rc, &cfg, frame_cnt); printf("\n"); printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", @@ -910,6 +973,12 @@ int main(int argc, char **argv) { // Try to rewrite the output file headers with the actual frame count.
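For context on the layer_id assignment above: ts_layer_id is a lookup table of length ts_periodicity that the encoder cycles through by frame count. A standalone sketch with an assumed 3-temporal-layer 0-2-1-2 pattern (the actual table depends on the layering mode chosen at run time):

#include <stdio.h>

int main(void) {
  /* Assumed 3-temporal-layer pattern: TL0, TL2, TL1, TL2 repeating. */
  const int ts_periodicity = 4;
  const int ts_layer_id[4] = { 0, 2, 1, 2 };
  int frame;
  for (frame = 0; frame < 8; ++frame) {
    printf("frame %d -> temporal layer %d\n", frame,
           ts_layer_id[frame % ts_periodicity]);
  }
  return 0;
}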
for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); - vpx_img_free(&raw); + if (input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } + +#if ROI_MAP + free(roi.roi_map); +#endif return EXIT_SUCCESS; } diff --git a/libs/libvpx/ivfdec.c b/libs/libvpx/ivfdec.c index f64e594ab0..3e179bc6ed 100644 --- a/libs/libvpx/ivfdec.c +++ b/libs/libvpx/ivfdec.c @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } diff --git a/libs/libvpx/ivfdec.h b/libs/libvpx/ivfdec.h index af725572b4..847cd79f3f 100644 --- a/libs/libvpx/ivfdec.h +++ b/libs/libvpx/ivfdec.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef IVFDEC_H_ -#define IVFDEC_H_ +#ifndef VPX_IVFDEC_H_ +#define VPX_IVFDEC_H_ #include "./tools_common.h" @@ -25,4 +25,4 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, } /* extern "C" */ #endif -#endif // IVFDEC_H_ +#endif // VPX_IVFDEC_H_ diff --git a/libs/libvpx/ivfenc.h b/libs/libvpx/ivfenc.h index ebdce47be8..483f2d2c59 100644 --- a/libs/libvpx/ivfenc.h +++ b/libs/libvpx/ivfenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef IVFENC_H_ -#define IVFENC_H_ +#ifndef VPX_IVFENC_H_ +#define VPX_IVFENC_H_ #include "./tools_common.h" @@ -30,4 +30,4 @@ void ivf_write_frame_size(FILE *outfile, size_t frame_size); } /* extern "C" */ #endif -#endif // IVFENC_H_ +#endif // VPX_IVFENC_H_ diff --git a/libs/libvpx/libs.doxy_template b/libs/libvpx/libs.doxy_template index 5a8f847280..1eacc8fe2d 100644 --- a/libs/libvpx/libs.doxy_template +++ b/libs/libvpx/libs.doxy_template @@ -943,18 +943,6 @@ GENERATE_XML = NO XML_OUTPUT = xml -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. 
Note that diff --git a/libs/libvpx/libs.mk b/libs/libvpx/libs.mk index a3e2f9d0eb..67d7512abe 100644 --- a/libs/libvpx/libs.mk +++ b/libs/libvpx/libs.mk @@ -88,7 +88,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h - INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h CODEC_DOC_SECTIONS += vp9 vp9_encoder @@ -113,13 +112,6 @@ ifeq ($(CONFIG_DECODERS),yes) CODEC_DOC_SECTIONS += decoder endif -# Suppress -Wextra warnings in third party code. -$(BUILD_PFX)third_party/googletest/%.cc.o: CXXFLAGS += -Wno-missing-field-initializers -# Suppress -Wextra warnings in first party code pending investigation. -# https://bugs.chromium.org/p/webm/issues/detail?id=1069 -$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered -$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered - ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd) @@ -153,9 +145,6 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm endif CODEC_EXPORTS-yes += vpx/exports_com CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc -ifeq ($(CONFIG_SPATIAL_SVC),yes) -CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc -endif CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec INSTALL-LIBS-yes += include/vpx/vpx_codec.h @@ -206,6 +195,8 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def +vpx.$(VCPROJ_SFX): VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) + vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ @@ -218,7 +209,15 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ - $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \ + $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \ + $(filter-out $(addprefix $(SRC_PATH_BARE)/, \ + vp8/%.c vp8/%.h vp9/%.c vp9/%.h vpx/% vpx_dsp/%), \ + $(VCPROJ_SRCS)) \ --src-path-bare="$(SRC_PATH_BARE)" \ PROJECTS-yes += vpx.$(VCPROJ_SFX) @@ -233,8 +232,8 @@ OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 5 -SO_VERSION_MINOR := 0 +SO_VERSION_MAJOR := 6 +SO_VERSION_MINOR := 1 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib @@ -274,18 +273,6 @@ $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) -libvpx.ver: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)echo "{ global:" > $@ - $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done - $(qexec)echo "local: *; };" >> $@ -CLEAN-OBJS += libvpx.ver - -libvpx.syms: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)awk '{print "_"$$2}' $^ >$@ -CLEAN-OBJS += 
libvpx.syms - libvpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ @@ -345,6 +332,18 @@ INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc endif +libvpx.ver: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo "{ global:" > $@ + $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done + $(qexec)echo "local: *; };" >> $@ +CLEAN-OBJS += libvpx.ver + +libvpx.syms: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)awk '{print "_"$$2}' $^ >$@ +CLEAN-OBJS += libvpx.syms + # # Rule to make assembler configuration file from C configuration file # diff --git a/libs/libvpx/mainpage.dox b/libs/libvpx/mainpage.dox index ec202fa4fb..4b0dff0871 100644 --- a/libs/libvpx/mainpage.dox +++ b/libs/libvpx/mainpage.dox @@ -25,8 +25,10 @@ release. - The \ref readme contains instructions on recompiling the sample applications. - Read the \ref usage "usage" for a narrative on codec usage. + \if samples - Read the \ref samples "sample code" for examples of how to interact with the codec. + \endif - \ref codec reference \if encoder - \ref encoder reference diff --git a/libs/libvpx/md5_utils.c b/libs/libvpx/md5_utils.c index 093798b833..9ddb104c8a 100644 --- a/libs/libvpx/md5_utils.c +++ b/libs/libvpx/md5_utils.c @@ -163,7 +163,7 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { */ VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) { - register UWORD32 a, b, c, d; + UWORD32 a, b, c, d; a = buf[0]; b = buf[1]; diff --git a/libs/libvpx/md5_utils.h b/libs/libvpx/md5_utils.h index bd4991b3ad..e0d5a2d1fb 100644 --- a/libs/libvpx/md5_utils.h +++ b/libs/libvpx/md5_utils.h @@ -20,8 +20,8 @@ * Still in the public domain. */ -#ifndef MD5_UTILS_H_ -#define MD5_UTILS_H_ +#ifndef VPX_MD5_UTILS_H_ +#define VPX_MD5_UTILS_H_ #ifdef __cplusplus extern "C" { @@ -46,4 +46,4 @@ void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); } // extern "C" #endif -#endif // MD5_UTILS_H_ +#endif // VPX_MD5_UTILS_H_ diff --git a/libs/libvpx/rate_hist.h b/libs/libvpx/rate_hist.h index 00a1676a61..d6a4c68519 100644 --- a/libs/libvpx/rate_hist.h +++ b/libs/libvpx/rate_hist.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef RATE_HIST_H_ -#define RATE_HIST_H_ +#ifndef VPX_RATE_HIST_H_ +#define VPX_RATE_HIST_H_ #include "vpx/vpx_encoder.h" @@ -37,4 +37,4 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, } // extern "C" #endif -#endif // RATE_HIST_H_ +#endif // VPX_RATE_HIST_H_ diff --git a/libs/libvpx/test/acm_random.h b/libs/libvpx/test/acm_random.h index d915cf9133..ccfa20681a 100644 --- a/libs/libvpx/test/acm_random.h +++ b/libs/libvpx/test/acm_random.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_ACM_RANDOM_H_ -#define TEST_ACM_RANDOM_H_ +#ifndef VPX_TEST_ACM_RANDOM_H_ +#define VPX_TEST_ACM_RANDOM_H_ #include @@ -34,6 +34,24 @@ class ACMRandom { return (value >> 15) & 0xffff; } + int32_t Rand20Signed(void) { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast<int32_t>(value) - 524288; + } + + int16_t Rand16Signed(void) { + // Use 16 bits: values between 32767 and -32768. + const uint32_t value = random_.Generate(65536); + return static_cast<int32_t>(value) - 32768; + } + + int16_t Rand13Signed(void) { + // Use 13 bits: values between 4095 and -4096.
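The three RandNSigned helpers added above share one pattern: draw uniformly from [0, 2^N) and subtract 2^(N-1) to center the range on zero. A plain-C sketch of the idea, with rand() standing in for the test RNG and its modulo bias ignored for brevity (the Rand13Signed body continues below):

#include <stdio.h>
#include <stdlib.h>

/* Uniform draw in [-(2^(bits-1)), 2^(bits-1) - 1], mirroring the
 * RandNSigned helpers above. */
static int rand_n_signed(int bits) {
  const unsigned int span = 1u << bits; /* 2^bits possible values */
  return (int)((unsigned int)rand() % span) - (int)(span >> 1);
}

int main(void) {
  srand(0);
  printf("13-bit sample: %d\n", rand_n_signed(13)); /* in [-4096, 4095] */
  return 0;
}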
+ const uint32_t value = random_.Generate(8192); + return static_cast<int32_t>(value) - 4096; + } + int16_t Rand9Signed(void) { // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). const uint32_t value = random_.Generate(512); @@ -73,4 +91,4 @@ class ACMRandom { } // namespace libvpx_test -#endif // TEST_ACM_RANDOM_H_ +#endif // VPX_TEST_ACM_RANDOM_H_ diff --git a/libs/libvpx/test/active_map_refresh_test.cc b/libs/libvpx/test/active_map_refresh_test.cc index d893635505..a985ed4f11 100644 --- a/libs/libvpx/test/active_map_refresh_test.cc +++ b/libs/libvpx/test/active_map_refresh_test.cc @@ -74,7 +74,7 @@ class ActiveMapRefreshTest ::libvpx_test::Encoder *encoder) { ::libvpx_test::Y4mVideoSource *y4m_video = static_cast<::libvpx_test::Y4mVideoSource *>(video); - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh); } else if (video->frame() >= 2 && video->img()) { diff --git a/libs/libvpx/test/active_map_test.cc b/libs/libvpx/test/active_map_test.cc index 1d24f956f5..03536c81ef 100644 --- a/libs/libvpx/test/active_map_test.cc +++ b/libs/libvpx/test/active_map_test.cc @@ -35,7 +35,7 @@ class ActiveMapTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); } else if (video->frame() == 3) { vpx_active_map_t map = vpx_active_map_t(); diff --git a/libs/libvpx/test/add_noise_test.cc b/libs/libvpx/test/add_noise_test.cc index eae32c33bb..0d1893c524 100644 --- a/libs/libvpx/test/add_noise_test.cc +++ b/libs/libvpx/test/add_noise_test.cc @@ -8,8 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <math.h> +#include <tuple> + #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/util.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -25,7 +28,10 @@ typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); -class AddNoiseTest : public ::testing::TestWithParam<AddNoiseFunc> { +typedef std::tuple<double, AddNoiseFunc> AddNoiseTestFPParam; + +class AddNoiseTest : public ::testing::Test, + public ::testing::WithParamInterface<AddNoiseTestFPParam> { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } virtual ~AddNoiseTest() {} @@ -44,14 +50,14 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { const int height = 64; const int image_size = width * height; int8_t noise[kNoiseSize]; - const int clamp = vpx_setup_noise(4.4, noise, kNoiseSize); + const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize); uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, sizeof(*s))); ASSERT_TRUE(s != NULL); memset(s, 99, image_size * sizeof(*s)); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure we don't end up having either the same or no added // noise either vertically or horizontally. @@ -70,7 +76,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { memset(s, 255, image_size); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure don't roll over.
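The roll-over check that follows, and the roll-under check after it, verify the clamping contract of the add-noise kernels: a pixel is first clamped into [clamp, 255 - clamp] and only then offset by a noise sample whose magnitude is bounded by the clamp, so the sum cannot wrap around. A hedged scalar sketch of that invariant (the real implementations are SIMD and differ in detail):

#include <stdio.h>

/* Scalar sketch of the add-noise clamping invariant. */
static unsigned char add_noise_pixel(unsigned char p, int noise, int clamp) {
  int v = p;
  if (v < clamp) v = clamp;             /* guards against roll-under */
  if (v > 255 - clamp) v = 255 - clamp; /* guards against roll-over */
  return (unsigned char)(v + noise);    /* |noise| <= clamp by setup */
}

int main(void) {
  printf("%u %u\n", add_noise_pixel(255, 16, 16),
         add_noise_pixel(0, -16, 16)); /* prints "255 0", no wrap */
  return 0;
}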
for (int i = 0; i < image_size; ++i) { @@ -81,7 +87,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { memset(s, 0, image_size); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure don't roll under. for (int i = 0; i < image_size; ++i) { @@ -108,7 +114,7 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { srand(0); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); srand(0); ASM_REGISTER_STATE_CHECK( vpx_plane_add_noise_c(d, noise, clamp, clamp, width, height, width)); @@ -121,16 +127,24 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { vpx_free(s); } -INSTANTIATE_TEST_CASE_P(C, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_c)); +using std::make_tuple; + +INSTANTIATE_TEST_CASE_P( + C, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_c), + make_tuple(4.4, vpx_plane_add_noise_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_sse2)); +INSTANTIATE_TEST_CASE_P( + SSE2, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_sse2), + make_tuple(4.4, vpx_plane_add_noise_sse2))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_msa)); +INSTANTIATE_TEST_CASE_P( + MSA, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_msa), + make_tuple(4.4, vpx_plane_add_noise_msa))); #endif } // namespace diff --git a/libs/libvpx/test/alt_ref_aq_segment_test.cc b/libs/libvpx/test/alt_ref_aq_segment_test.cc index 64a3011eb9..6e03a47852 100644 --- a/libs/libvpx/test/alt_ref_aq_segment_test.cc +++ b/libs/libvpx/test/alt_ref_aq_segment_test.cc @@ -32,7 +32,7 @@ class AltRefAqSegmentTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); diff --git a/libs/libvpx/test/altref_test.cc b/libs/libvpx/test/altref_test.cc index f9308c2717..0119be4da0 100644 --- a/libs/libvpx/test/altref_test.cc +++ b/libs/libvpx/test/altref_test.cc @@ -35,7 +35,7 @@ class AltRefTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_CPUUSED, 3); } diff --git a/libs/libvpx/test/android/README b/libs/libvpx/test/android/README index 4a1adcf7f4..ee21f9b652 100644 --- a/libs/libvpx/test/android/README +++ b/libs/libvpx/test/android/README @@ -3,12 +3,12 @@ Android.mk will build vpx unittests on android. ./libvpx/configure --target=armv7-android-gcc --enable-external-build \ --enable-postproc --disable-install-srcs --enable-multi-res-encoding \ --enable-temporal-denoising --disable-unit-tests --disable-install-docs \ - --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK + --disable-examples --disable-runtime-cpu-detect 2) From the parent directory, invoke ndk-build: NDK_PROJECT_PATH=. 
ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \ APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \ - APP_STL=gnustl_static + APP_STL=c++_static Note: Both adb and ndk-build are available prebuilt at: https://chromium.googlesource.com/android_tools diff --git a/libs/libvpx/test/aq_segment_test.cc b/libs/libvpx/test/aq_segment_test.cc index 1c2147fbb2..3c4053be7f 100644 --- a/libs/libvpx/test/aq_segment_test.cc +++ b/libs/libvpx/test/aq_segment_test.cc @@ -31,7 +31,7 @@ class AqSegmentTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100); diff --git a/libs/libvpx/test/avg_test.cc b/libs/libvpx/test/avg_test.cc index ad21198e4b..3d24f1cdb6 100644 --- a/libs/libvpx/test/avg_test.cc +++ b/libs/libvpx/test/avg_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -22,40 +23,43 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { + +template class AverageTestBase : public ::testing::Test { public: - AverageTestBase(int width, int height) : width_(width), height_(height) {} + AverageTestBase(int width, int height) + : width_(width), height_(height), source_data_(NULL), source_stride_(0), + bit_depth_(8) {} - static void SetUpTestCase() { - source_data_ = reinterpret_cast( - vpx_memalign(kDataAlignment, kDataBlockSize)); - } - - static void TearDownTestCase() { + virtual void TearDown() { vpx_free(source_data_); source_data_ = NULL; + libvpx_test::ClearSystemState(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 static const int kDataAlignment = 16; static const int kDataBlockSize = 64 * 128; virtual void SetUp() { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; rnd_.Reset(ACMRandom::DeterministicSeed()); } // Sum Pixels - static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 8; ++h) { for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; @@ -63,7 +67,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 32) >> 6); } - static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 4; ++h) { for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; @@ -71,7 +75,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 8) >> 4); } - void FillConstant(uint8_t fill_constant) { + void FillConstant(Pixel fill_constant) { for (int i = 0; i < width_ * height_; ++i) { source_data_[i] = fill_constant; } @@ -79,21 +83,22 @@ class AverageTestBase : public ::testing::Test { void FillRandom() { for (int i = 0; i < width_ * height_; ++i) { - source_data_[i] = rnd_.Rand8(); + source_data_[i] = rnd_.Rand16() & ((1 << 
bit_depth_) - 1); } } int width_, height_; - static uint8_t *source_data_; + Pixel *source_data_; int source_stride_; + int bit_depth_; ACMRandom rnd_; }; typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); -typedef std::tr1::tuple AvgFunc; +typedef std::tuple AvgFunc; -class AverageTest : public AverageTestBase, +class AverageTest : public AverageTestBase, public ::testing::WithParamInterface { public: AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -119,12 +124,40 @@ class AverageTest : public AverageTestBase, } }; +#if CONFIG_VP9_HIGHBITDEPTH +class AverageTestHBD : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTestHBD() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + ASM_REGISTER_STATE_CHECK(GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_)); + unsigned int actual = GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_); + + EXPECT_EQ(expected, actual); + } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height); -typedef std::tr1::tuple IntProRowParam; +typedef std::tuple IntProRowParam; -class IntProRowTest : public AverageTestBase, +class IntProRowTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProRowTest() @@ -135,6 +168,10 @@ class IntProRowTest : public AverageTestBase, protected: virtual void SetUp() { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); + hbuf_asm_ = reinterpret_cast( vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16)); hbuf_c_ = reinterpret_cast( @@ -142,6 +179,8 @@ class IntProRowTest : public AverageTestBase, } virtual void TearDown() { + vpx_free(source_data_); + source_data_ = NULL; vpx_free(hbuf_c_); hbuf_c_ = NULL; vpx_free(hbuf_asm_); @@ -164,9 +203,9 @@ class IntProRowTest : public AverageTestBase, typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width); -typedef std::tr1::tuple IntProColParam; +typedef std::tuple IntProColParam; -class IntProColTest : public AverageTestBase, +class IntProColTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) { @@ -189,7 +228,7 @@ class IntProColTest : public AverageTestBase, }; typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); -typedef std::tr1::tuple SatdTestParam; +typedef std::tuple SatdTestParam; class SatdTest : public ::testing::Test, public ::testing::WithParamInterface { @@ -212,12 +251,7 @@ class SatdTest : public ::testing::Test, for (int i = 0; i < satd_size_; ++i) src_[i] = val; } - void FillRandom() { - for (int i = 0; i < satd_size_; ++i) { - const int16_t tmp = rnd_.Rand16(); - src_[i] = (tran_low_t)tmp; - } - } + virtual void FillRandom() = 0; void Check(const int expected) { int total; @@ -225,17 +259,29 @@ class SatdTest : public ::testing::Test, EXPECT_EQ(expected, total); } + tran_low_t *GetCoeff() const { return src_; } + int satd_size_; + ACMRandom rnd_; + tran_low_t *src_; private: - 
tran_low_t *src_; SatdFunc satd_func_; - ACMRandom rnd_; +}; + +class SatdLowbdTest : public SatdTest { + protected: + virtual void FillRandom() { + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16Signed(); + src_[i] = (tran_low_t)tmp; + } + } }; typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -typedef std::tr1::tuple BlockErrorTestFPParam; +typedef std::tuple BlockErrorTestFPParam; class BlockErrorTestFP : public ::testing::Test, @@ -279,6 +325,10 @@ class BlockErrorTestFP EXPECT_EQ(expected, total); } + tran_low_t *GetCoeff() const { return coeff_; } + + tran_low_t *GetDQCoeff() const { return dqcoeff_; } + int txfm_size_; private: @@ -288,8 +338,6 @@ class BlockErrorTestFP ACMRandom rnd_; }; -uint8_t *AverageTestBase::source_data_ = NULL; - TEST_P(AverageTest, MinValue) { FillConstant(0); CheckAverages(); @@ -308,6 +356,27 @@ TEST_P(AverageTest, Random) { CheckAverages(); } } +#if CONFIG_VP9_HIGHBITDEPTH +TEST_P(AverageTestHBD, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTestHBD, MaxValue) { + FillConstant((1 << VPX_BITS_12) - 1); + CheckAverages(); +} + +TEST_P(AverageTestHBD, Random) { + bit_depth_ = VPX_BITS_12; + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH TEST_P(IntProRowTest, MinValue) { FillConstant(0); @@ -339,27 +408,27 @@ TEST_P(IntProColTest, Random) { RunComparison(); } -TEST_P(SatdTest, MinValue) { +TEST_P(SatdLowbdTest, MinValue) { const int kMin = -32640; const int expected = -kMin * satd_size_; FillConstant(kMin); Check(expected); } -TEST_P(SatdTest, MaxValue) { +TEST_P(SatdLowbdTest, MaxValue) { const int kMax = 32640; const int expected = kMax * satd_size_; FillConstant(kMax); Check(expected); } -TEST_P(SatdTest, Random) { +TEST_P(SatdLowbdTest, Random) { int expected; switch (satd_size_) { - case 16: expected = 205298; break; - case 64: expected = 1113950; break; - case 256: expected = 4268415; break; - case 1024: expected = 16954082; break; + case 16: expected = 263252; break; + case 64: expected = 1105420; break; + case 256: expected = 4252250; break; + case 1024: expected = 16876840; break; default: FAIL() << "Invalid satd size (" << satd_size_ << ") valid: 16/64/256/1024"; @@ -368,11 +437,12 @@ TEST_P(SatdTest, Random) { Check(expected); } -TEST_P(SatdTest, DISABLED_Speed) { +TEST_P(SatdLowbdTest, DISABLED_Speed) { const int kCountSpeedTestBlock = 20000; vpx_usec_timer timer; - DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { @@ -383,6 +453,62 @@ TEST_P(SatdTest, DISABLED_Speed) { printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); } +#if CONFIG_VP9_HIGHBITDEPTH +class SatdHighbdTest : public SatdTest { + protected: + virtual void FillRandom() { + for (int i = 0; i < satd_size_; ++i) { + src_[i] = rnd_.Rand20Signed(); + } + } +}; + +TEST_P(SatdHighbdTest, MinValue) { + const int kMin = -524280; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdHighbdTest, MaxValue) { + const int kMax = 524280; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdHighbdTest, Random) { + int expected; + switch (satd_size_) { + case 16: 
expected = 5249712; break; + case 64: expected = 18362120; break; + case 256: expected = 66100520; break; + case 1024: expected = 266094734; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(SatdHighbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + TEST_P(BlockErrorTestFP, MinValue) { const int64_t kMin = -32640; const int64_t expected = kMin * kMin * txfm_size_; @@ -415,9 +541,10 @@ TEST_P(BlockErrorTestFP, Random) { TEST_P(BlockErrorTestFP, DISABLED_Speed) { const int kCountSpeedTestBlock = 20000; vpx_usec_timer timer; - DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]); const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + tran_low_t *dqcoeff = GetDQCoeff(); vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { @@ -428,14 +555,34 @@ TEST_P(BlockErrorTestFP, DISABLED_Speed) { printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); } -using std::tr1::make_tuple; +using std::make_tuple; INSTANTIATE_TEST_CASE_P( C, AverageTest, ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c), make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c))); -INSTANTIATE_TEST_CASE_P(C, SatdTest, +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); +#endif // HAVE_SSE2 + +INSTANTIATE_TEST_CASE_P(C, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_CASE_P(C, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_c), make_tuple(64, &vpx_satd_c), make_tuple(256, &vpx_satd_c), @@ -472,7 +619,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(SSE2, SatdTest, +INSTANTIATE_TEST_CASE_P(SSE2, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_sse2), make_tuple(64, &vpx_satd_sse2), make_tuple(256, &vpx_satd_sse2), @@ -487,12 +634,21 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, +INSTANTIATE_TEST_CASE_P(AVX2, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_avx2), make_tuple(64, &vpx_satd_avx2), make_tuple(256, &vpx_satd_avx2), make_tuple(1024, &vpx_satd_avx2))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX2, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2), + make_tuple(64, &vpx_highbd_satd_avx2), + make_tuple(256, &vpx_highbd_satd_avx2), + make_tuple(1024, &vpx_highbd_satd_avx2))); +#endif // CONFIG_VP9_HIGHBITDEPTH + INSTANTIATE_TEST_CASE_P( AVX2, BlockErrorTestFP, 
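BlockErrorTestFP exercises kernels that reduce to a sum of squared differences between the transform coefficients and their dequantized counterparts, which is why MinValue expects kMin * kMin * txfm_size_ when every coefficient pair differs by kMin. A scalar sketch of that reduction:

```c++
#include <cstdint>

typedef int32_t tran_low_t;  // assumption: high-bit-depth coefficient type

// Scalar model of the fixed-point block error: accumulate the squared
// difference per coefficient in 64 bits so large constants cannot overflow.
int64_t BlockErrorReference(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                            int block_size) {
  int64_t error = 0;
  for (int i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }
  return error;
}
```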
::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), @@ -525,7 +681,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(NEON, SatdTest, +INSTANTIATE_TEST_CASE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), make_tuple(64, &vpx_satd_neon), make_tuple(256, &vpx_satd_neon), @@ -570,7 +726,7 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): Remove the highbitdepth flag once the SIMD functions are // in place. #if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(MSA, SatdTest, +INSTANTIATE_TEST_CASE_P(MSA, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_msa), make_tuple(64, &vpx_satd_msa), make_tuple(256, &vpx_satd_msa), diff --git a/libs/libvpx/test/bench.cc b/libs/libvpx/test/bench.cc new file mode 100644 index 0000000000..4b883d8250 --- /dev/null +++ b/libs/libvpx/test/bench.cc @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "test/bench.h" +#include "vpx_ports/vpx_timer.h" + +void AbstractBench::RunNTimes(int n) { + for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < n; ++j) { + Run(); + } + vpx_usec_timer_mark(&timer); + times_[r] = static_cast(vpx_usec_timer_elapsed(&timer)); + } +} + +void AbstractBench::PrintMedian(const char *title) { + std::sort(times_, times_ + VPX_BENCH_ROBUST_ITER); + const int med = times_[VPX_BENCH_ROBUST_ITER >> 1]; + int sad = 0; + for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) { + sad += abs(times_[t] - med); + } + printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0, + sad / (VPX_BENCH_ROBUST_ITER * 1000.0)); +} diff --git a/libs/libvpx/test/bench.h b/libs/libvpx/test/bench.h new file mode 100644 index 0000000000..57ca9118ba --- /dev/null +++ b/libs/libvpx/test/bench.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_BENCH_H_ +#define VPX_TEST_BENCH_H_ + +// Number of iterations used to compute median run time. +#define VPX_BENCH_ROBUST_ITER 15 + +class AbstractBench { + public: + void RunNTimes(int n); + void PrintMedian(const char *title); + + protected: + // Implement this method and put the code to benchmark in it. 
+ virtual void Run() = 0; + + private: + int times_[VPX_BENCH_ROBUST_ITER]; +}; + +#endif // VPX_TEST_BENCH_H_ diff --git a/libs/libvpx/test/blockiness_test.cc b/libs/libvpx/test/blockiness_test.cc index 2fa10192f1..ced6e66c62 100644 --- a/libs/libvpx/test/blockiness_test.cc +++ b/libs/libvpx/test/blockiness_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -25,10 +26,7 @@ #include "test/util.h" #include "vpx_mem/vpx_mem.h" - -extern "C" double vp9_get_blockiness(const unsigned char *img1, int img1_pitch, - const unsigned char *img2, int img2_pitch, - int width, int height); +#include "vp9/encoder/vp9_blockiness.h" using libvpx_test::ACMRandom; @@ -141,7 +139,7 @@ class BlockinessTestBase : public ::testing::Test { }; #if CONFIG_VP9_ENCODER -typedef std::tr1::tuple BlockinessParam; +typedef std::tuple BlockinessParam; class BlockinessVP9Test : public BlockinessTestBase, public ::testing::WithParamInterface { @@ -208,15 +206,15 @@ TEST_P(BlockinessVP9Test, WorstCaseBlockiness) { } #endif // CONFIG_VP9_ENCODER -using std::tr1::make_tuple; +using std::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP9_ENCODER -const BlockinessParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), -}; +const BlockinessParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); #endif diff --git a/libs/libvpx/test/borders_test.cc b/libs/libvpx/test/borders_test.cc index e66ff02e25..b91a15b800 100644 --- a/libs/libvpx/test/borders_test.cc +++ b/libs/libvpx/test/borders_test.cc @@ -31,7 +31,7 @@ class BordersTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 1); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); diff --git a/libs/libvpx/test/buffer.h b/libs/libvpx/test/buffer.h index 2175dad9d9..b003d2f0d0 100644 --- a/libs/libvpx/test/buffer.h +++ b/libs/libvpx/test/buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_BUFFER_H_ -#define TEST_BUFFER_H_ +#ifndef VPX_TEST_BUFFER_H_ +#define VPX_TEST_BUFFER_H_ #include @@ -379,4 +379,4 @@ bool Buffer::BufferSizesMatch(const Buffer &a) const { return true; } } // namespace libvpx_test -#endif // TEST_BUFFER_H_ +#endif // VPX_TEST_BUFFER_H_ diff --git a/libs/libvpx/test/byte_alignment_test.cc b/libs/libvpx/test/byte_alignment_test.cc index 5a058b2756..0ef6c4c519 100644 --- a/libs/libvpx/test/byte_alignment_test.cc +++ b/libs/libvpx/test/byte_alignment_test.cc @@ -171,8 +171,9 @@ TEST_F(ByteAlignmentTest, SwitchByteAlignment) { TEST_P(ByteAlignmentTest, TestAlignment) { const ByteAlignmentTestParam t = GetParam(); SetByteAlignment(t.byte_alignment, t.expected_value); - if (t.decode_remaining) + if (t.decode_remaining) { ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment)); + } } INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest, diff --git a/libs/libvpx/test/clear_system_state.h b/libs/libvpx/test/clear_system_state.h index 044a5c7583..ba3c0b386a 100644 --- a/libs/libvpx/test/clear_system_state.h +++ b/libs/libvpx/test/clear_system_state.h @@ -7,23 +7,17 @@ * in the file PATENTS. 
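The AbstractBench harness added in bench.h above gives the DISABLED_Speed tests a shared pattern: derive from it, put the kernel invocation in Run(), and the base class times VPX_BENCH_ROBUST_ITER batches and reports the median together with the mean absolute deviation around it. A hypothetical subclass showing the intended usage (AvgBench and its members are illustrative, not part of the patch):

```c++
#include "./vpx_dsp_rtcd.h"  // run-time dispatch table declaring vpx_avg_8x8
#include "test/bench.h"

// Illustrative only: benchmark the 8x8 averaging kernel with the new harness.
class AvgBench : public AbstractBench {
 public:
  AvgBench(const uint8_t *src, int stride) : src_(src), stride_(stride) {}

 protected:
  // RunNTimes(n) times n consecutive calls of Run(), repeating the
  // measurement VPX_BENCH_ROBUST_ITER times so PrintMedian() can report a
  // stable median rather than a single noisy sample.
  virtual void Run() { avg_ = vpx_avg_8x8(src_, stride_); }

 private:
  const uint8_t *src_;
  int stride_;
  unsigned int avg_;
};

// Usage:
//   AvgBench bench(source, 16);
//   bench.RunNTimes(1000000);
//   bench.PrintMedian("vpx_avg_8x8");
```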
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_CLEAR_SYSTEM_STATE_H_ -#define TEST_CLEAR_SYSTEM_STATE_H_ +#ifndef VPX_TEST_CLEAR_SYSTEM_STATE_H_ +#define VPX_TEST_CLEAR_SYSTEM_STATE_H_ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif +#include "vpx_ports/system_state.h" namespace libvpx_test { // Reset system to a known state. This function should be used for all non-API // test cases. -inline void ClearSystemState() { -#if ARCH_X86 || ARCH_X86_64 - vpx_reset_mmx_state(); -#endif -} +inline void ClearSystemState() { vpx_clear_system_state(); } } // namespace libvpx_test -#endif // TEST_CLEAR_SYSTEM_STATE_H_ +#endif // VPX_TEST_CLEAR_SYSTEM_STATE_H_ diff --git a/libs/libvpx/test/codec_factory.h b/libs/libvpx/test/codec_factory.h index d5882ed9c8..17c9512ca8 100644 --- a/libs/libvpx/test/codec_factory.h +++ b/libs/libvpx/test/codec_factory.h @@ -7,8 +7,10 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_CODEC_FACTORY_H_ -#define TEST_CODEC_FACTORY_H_ +#ifndef VPX_TEST_CODEC_FACTORY_H_ +#define VPX_TEST_CODEC_FACTORY_H_ + +#include #include "./vpx_config.h" #include "vpx/vpx_decoder.h" @@ -53,23 +55,22 @@ class CodecFactory { template class CodecTestWithParam : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith2Params : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith3Params : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith4Params : public ::testing::TestWithParam< - std::tr1::tuple > { -}; + std::tuple > {}; /* * VP8 Codec Definitions @@ -264,4 +265,4 @@ const libvpx_test::VP9CodecFactory kVP9; #endif // CONFIG_VP9 } // namespace libvpx_test -#endif // TEST_CODEC_FACTORY_H_ +#endif // VPX_TEST_CODEC_FACTORY_H_ diff --git a/libs/libvpx/test/comp_avg_pred_test.cc b/libs/libvpx/test/comp_avg_pred_test.cc index 110e065836..56e701e09c 100644 --- a/libs/libvpx/test/comp_avg_pred_test.cc +++ b/libs/libvpx/test/comp_avg_pred_test.cc @@ -29,6 +29,10 @@ uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } void reference_pred(const Buffer &pred, const Buffer &ref, int width, int height, Buffer *avg) { + ASSERT_TRUE(avg->TopLeftPixel() != NULL); + ASSERT_TRUE(pred.TopLeftPixel() != NULL); + ASSERT_TRUE(ref.TopLeftPixel() != NULL); + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { avg->TopLeftPixel()[y * avg->stride() + x] = diff --git a/libs/libvpx/test/consistency_test.cc b/libs/libvpx/test/consistency_test.cc index 37b4a45e54..875b06f4aa 100644 --- a/libs/libvpx/test/consistency_test.cc +++ b/libs/libvpx/test/consistency_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -127,7 +128,7 @@ class ConsistencyTestBase : public ::testing::Test { }; #if CONFIG_VP9_ENCODER -typedef std::tr1::tuple ConsistencyParam; +typedef std::tuple ConsistencyParam; class ConsistencyVP9Test : public ConsistencyTestBase, public ::testing::WithParamInterface { @@ -198,15 +199,15 @@ TEST_P(ConsistencyVP9Test, ConsistencyIsZero) { } #endif // CONFIG_VP9_ENCODER -using std::tr1::make_tuple; +using std::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP9_ENCODER -const 
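The null-pointer assertions added to reference_pred() guard a scalar model whose one subtlety is round-to-nearest averaging: (a + b + 1) >> 1 rounds ties up, so a truncating (a + b) >> 1 would disagree with the SIMD kernels by one. A two-line worked example:

```c++
#include <cstdint>

// Rounded average: (1 + 2 + 1) >> 1 == 2 (the .5 rounds up), whereas a
// truncating (1 + 2) >> 1 == 1 would be off by one against the kernels.
inline uint8_t RoundedAverage(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((a + b + 1) >> 1);
}
```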
ConsistencyParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), -}; +const ConsistencyParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, ::testing::ValuesIn(c_vp9_tests)); #endif diff --git a/libs/libvpx/test/convolve_test.cc b/libs/libvpx/test/convolve_test.cc index 70f0b11a77..47589a9f2e 100644 --- a/libs/libvpx/test/convolve_test.cc +++ b/libs/libvpx/test/convolve_test.cc @@ -9,6 +9,7 @@ */ #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -77,7 +78,7 @@ struct ConvolveFunctions { int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. }; -typedef std::tr1::tuple ConvolveParam; +typedef std::tuple ConvolveParam; #define ALL_SIZES(convolve_fn) \ make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \ @@ -114,6 +115,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, // and filter_max_width = 16 // uint8_t intermediate_buffer[71 * kMaxDimension]; + vp9_zero(intermediate_buffer); const int intermediate_next_stride = 1 - static_cast(intermediate_height * output_width); @@ -213,6 +215,8 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, const int intermediate_next_stride = 1 - static_cast(intermediate_height * output_width); + vp9_zero(intermediate_buffer); + // Horizontal pass (src -> transposed intermediate). { uint16_t *output_ptr = intermediate_buffer; @@ -412,8 +416,14 @@ class ConvolveTest : public ::testing::TestWithParam { for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) { output_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = mask_; +#endif } else { output_[i] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = 0; +#endif } } @@ -450,7 +460,9 @@ class ConvolveTest : public ::testing::TestWithParam { void CheckGuardBlocks() { for (int i = 0; i < kOutputBufferSize; ++i) { - if (IsIndexInBorder(i)) EXPECT_EQ(255, output_[i]); + if (IsIndexInBorder(i)) { + EXPECT_EQ(255, output_[i]); + } } } @@ -672,6 +684,74 @@ TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); } +TEST_P(ConvolveTest, DISABLED_4Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
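The reference convolution now zeroes its intermediate buffer before use; that buffer exists because filter_block2d_8_c runs two passes, horizontal into intermediate rows (with extra rows above and below for the vertical taps) and then vertical into the output. Each tap sum is rounded in fixed point and clipped, as in this per-pixel sketch assuming libvpx's FILTER_BITS == 7 precision:

```c++
#include <cstdint>

enum { FILTER_BITS = 7 };  // fixed-point precision of the filter taps

static inline uint8_t ClipPixel(int v) {
  return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One output pixel of an 8-tap FIR pass: multiply-accumulate over the taps,
// round the fixed-point sum, clip to 8 bits. src_step is 1 for the
// horizontal pass and the row stride for the vertical pass.
uint8_t Convolve8Pixel(const uint8_t *src, int src_step,
                       const int16_t *filter) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += src[k * src_step] * filter[k];
  return ClipPixel((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}
```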
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { const uint8_t *const in = input(); uint8_t *const out = output(); @@ -787,7 +867,7 @@ TEST_P(ConvolveTest, Copy2D) { } } -const int kNumFilterBanks = 4; +const int kNumFilterBanks = 5; const int kNumFilters = 16; TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { @@ -1040,7 +1120,7 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { } #endif -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH #define WRAP(func, bd) \ @@ -1183,9 +1263,9 @@ const ConvolveFunctions convolve12_c( wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12); -const ConvolveParam kArrayConvolve_c[] = { - ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c) -}; +const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c), + ALL_SIZES(convolve10_c), + ALL_SIZES(convolve12_c) }; #else const ConvolveFunctions convolve8_c( @@ -1377,4 +1457,16 @@ const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_vsx)); #endif // HAVE_VSX + +#if HAVE_MMI +const ConvolveFunctions convolve8_mmi( + vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi, + vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi, + vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) }; +INSTANTIATE_TEST_CASE_P(MMI, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_mmi)); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/cpu_speed_test.cc b/libs/libvpx/test/cpu_speed_test.cc index 404b5b44f4..2fb5c10eae 100644 --- a/libs/libvpx/test/cpu_speed_test.cc +++ b/libs/libvpx/test/cpu_speed_test.cc @@ -44,7 +44,7 @@ class CpuSpeedTest 
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -152,5 +152,5 @@ VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest, ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(0, 9)); + ::testing::Range(0, 10)); } // namespace diff --git a/libs/libvpx/test/cq_test.cc b/libs/libvpx/test/cq_test.cc index 20e1f0f3de..474b9d0fa2 100644 --- a/libs/libvpx/test/cq_test.cc +++ b/libs/libvpx/test/cq_test.cc @@ -65,7 +65,7 @@ class CQTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { if (cfg_.rc_end_usage == VPX_CQ) { encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_); } diff --git a/libs/libvpx/test/datarate_test.cc b/libs/libvpx/test/datarate_test.cc deleted file mode 100644 index 31a8523d21..0000000000 --- a/libs/libvpx/test/datarate_test.cc +++ /dev/null @@ -1,1876 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vpx_config.h" -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "test/codec_factory.h" -#include "test/encode_test_driver.h" -#include "test/i420_video_source.h" -#include "test/util.h" -#include "test/y4m_video_source.h" -#include "vpx/vpx_codec.h" - -namespace { - -class DatarateTestLarge - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - - virtual ~DatarateTestLarge() {} - - protected: - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - set_cpu_used_ = GET_PARAM(2); - ResetModel(); - } - - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; - duration_ = 0.0; - denoiser_offon_test_ = 0; - denoiser_offon_period_ = -1; - gf_boost_ = 0; - use_roi_ = 0; - } - - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) { - encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); - encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); - encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); - } - -#if CONFIG_VP8_ENCODER - if (use_roi_ == 1) { - encoder->Control(VP8E_SET_ROI_MAP, &roi_); - } -#endif - - if (denoiser_offon_test_) { - ASSERT_GT(denoiser_offon_period_, 0) - << "denoiser_offon_period_ is not positive."; - if ((video->frame() + 1) % denoiser_offon_period_ == 0) { - // Flip denoiser_on_ periodically - denoiser_on_ ^= 1; - } - encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); - } - - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - // Time since last timestamp = duration. 
- vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - - // TODO(jimbankoski): Remove these lines when the issue: - // http://code.google.com/p/webm/issues/detail?id=496 is fixed. - // For now the codec assumes buffer starts at starting buffer rate - // plus one frame's time. - if (last_pts_ == 0) duration = 1; - - // Add to the buffer the bits we'd expect from a constant bitrate server. - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); - - /* Test the buffer model here before subtracting the frame. Do so because - * the way the leaky bucket model works in libvpx is to allow the buffer to - * empty - and then stop showing frames until we've got enough bits to - * show one. As noted in comment below (issue 495), this does not currently - * apply to key frames. For now exclude key frames in condition below. */ - const bool key_frame = - (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) - << "Buffer Underrun at frame " << pkt->data.frame.pts; - } - - const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; - - // Subtract from the buffer the bits associated with a played back frame. - bits_in_buffer_model_ -= frame_size_in_bits; - - // Update the running total of bits for end of test datarate checks. - bits_total_ += frame_size_in_bits; - - // If first drop not set and we have a drop set it to this time. - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - - // Update the most recent pts. - last_pts_ = pkt->data.frame.pts; - - // We update this so that we can calculate the datarate minus the last - // frame encoded in the file. - bits_in_last_frame_ = frame_size_in_bits; - - ++frame_number_; - } - - virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - - duration_ = (last_pts_ + 1) * timebase_; - - // Effective file datarate includes the time spent prebuffering. - effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 / - (cfg_.rc_buf_initial_sz / 1000.0 + duration_); - - file_datarate_ = file_size_in_kb / duration_; - } - } - - vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; - double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; - double duration_; - double file_datarate_; - double effective_datarate_; - int64_t bits_in_last_frame_; - int denoiser_on_; - int denoiser_offon_test_; - int denoiser_offon_period_; - int set_cpu_used_; - int gf_boost_; - int use_roi_; - vpx_roi_map_t roi_; -}; - -#if CONFIG_TEMPORAL_DENOISING -// Check basic datarate targeting, for a single bitrate, but loop over the -// various denoiser settings. -TEST_P(DatarateTestLarge, DenoiserLevels) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int j = 1; j < 5; ++j) { - // Run over the denoiser levels. - // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j - // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, - // denoiserOnAggressive, and denoiserOnAdaptive. 
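The removed harness modeled a constant-bitrate server as a leaky bucket: each frame interval deposits target-rate bits, each encoded frame withdraws its actual size, and outside of key frames the bucket must never underrun. A self-contained restatement of that model (names are descriptive, not the test's):

```c++
#include <cassert>
#include <cstddef>
#include <cstdint>

// Leaky-bucket model from the deleted FramePktHook: deposit what a CBR
// server would deliver over the frame's duration, withdraw the encoded
// frame, and require a non-negative level (key frames excepted, issue 495).
class CbrBufferModel {
 public:
  CbrBufferModel(double timebase_sec, int target_kbps)
      : bits_in_buffer_(0), timebase_(timebase_sec),
        target_kbps_(target_kbps) {}

  void OnFrame(int64_t duration_ticks, size_t frame_bytes, bool key_frame) {
    bits_in_buffer_ += static_cast<int64_t>(duration_ticks * timebase_ *
                                            target_kbps_ * 1000);
    bits_in_buffer_ -= static_cast<int64_t>(frame_bytes) * 8;
    if (!key_frame) assert(bits_in_buffer_ >= 0);  // buffer underrun
  }

 private:
  int64_t bits_in_buffer_;
  double timebase_;
  int target_kbps_;
};
```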
- denoiser_on_ = j; - cfg_.rc_target_bitrate = 300; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestLarge, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // CONFIG_TEMPORAL_DENOISING - -TEST_P(DatarateTestLarge, BasicBufferModel) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // 2 pass cbr datarate control has a bug hidden by the small # of - // frames selected in this encode. The problem is that even if the buffer is - // negative we produce a keyframe on a cutscene. Ignoring datarate - // constraints - // TODO(jimbankoski): ( Fix when issue - // http://code.google.com/p/webm/issues/detail?id=495 is addressed. ) - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // There is an issue for low bitrates in real-time mode, where the - // effective_datarate slightly overshoots the target bitrate. - // This is same the issue as noted about (#495). - // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), - // when the issue is resolved. - for (int i = 100; i < 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_max_quantizer = 36; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.kf_mode = VPX_KF_DISABLED; - - const int frame_count = 40; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, frame_count); - - // Here we check that the first dropped frame gets earlier and earlier - // as the drop frame threshold is increased. 
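The pass criteria in these tests read more naturally as ratios: ASSERT_GE(target, effective * 0.95) tolerates roughly 5% overshoot (effective <= target / 0.95), and ASSERT_LE(target, file * 1.4) tolerates roughly 29% undershoot (file >= target / 1.4). For the 300 kbps runs that means effective <= 315.8 kbps and file >= 214.3 kbps, as this restated predicate shows:

```c++
// The datarate checks as one predicate. For target = 300:
//   effective_datarate <= 300 / 0.95 = 315.8 kbps  (~5% overshoot allowed)
//   file_datarate      >= 300 / 1.4  = 214.3 kbps  (~29% undershoot allowed)
bool DatarateWithinBounds(double target_kbps, double effective_kbps,
                          double file_kbps) {
  return effective_kbps * 0.95 <= target_kbps &&
         target_kbps <= file_kbps * 1.4;
}
```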
- - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = frame_count; - for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - } -} - -TEST_P(DatarateTestLarge, DropFramesMultiThreads) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -class DatarateTestRealTime : public DatarateTestLarge { - public: - virtual ~DatarateTestRealTime() {} -}; - -#if CONFIG_TEMPORAL_DENOISING -// Check basic datarate targeting, for a single bitrate, but loop over the -// various denoiser settings. -TEST_P(DatarateTestRealTime, DenoiserLevels) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int j = 1; j < 5; ++j) { - // Run over the denoiser levels. - // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j - // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, - // denoiserOnAggressive, and denoiserOnAdaptive. - denoiser_on_ = j; - cfg_.rc_target_bitrate = 300; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestRealTime, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // CONFIG_TEMPORAL_DENOISING - -TEST_P(DatarateTestRealTime, BasicBufferModel) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // 2 pass cbr datarate control has a bug hidden by the small # of - // frames selected in this encode. 
The problem is that even if the buffer is - // negative we produce a keyframe on a cutscene, ignoring datarate - // constraints - // TODO(jimbankoski): Fix when issue - // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed. - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // There is an issue for low bitrates in real-time mode, where the - // effective_datarate slightly overshoots the target bitrate. - // This is same the issue as noted above (#495). - // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), - // when the issue is resolved. - for (int i = 100; i <= 700; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_max_quantizer = 36; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.kf_mode = VPX_KF_DISABLED; - - const int frame_count = 40; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, frame_count); - - // Check that the first dropped frame gets earlier and earlier - // as the drop frame threshold is increased. - - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = frame_count; - for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - } -} - -TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // Encode using multiple threads. - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -TEST_P(DatarateTestRealTime, RegionOfInterest) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 0; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // Encode using multiple threads. 
- cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - cfg_.rc_target_bitrate = 450; - cfg_.g_w = 352; - cfg_.g_h = 288; - - ResetModel(); - - // Set ROI parameters - use_roi_ = 1; - memset(&roi_, 0, sizeof(roi_)); - - roi_.rows = (cfg_.g_h + 15) / 16; - roi_.cols = (cfg_.g_w + 15) / 16; - - roi_.delta_q[0] = 0; - roi_.delta_q[1] = -20; - roi_.delta_q[2] = 0; - roi_.delta_q[3] = 0; - - roi_.delta_lf[0] = 0; - roi_.delta_lf[1] = -20; - roi_.delta_lf[2] = 0; - roi_.delta_lf[3] = 0; - - roi_.static_threshold[0] = 0; - roi_.static_threshold[1] = 1000; - roi_.static_threshold[2] = 0; - roi_.static_threshold[3] = 0; - - // Use 2 states: 1 is center square, 0 is the rest. - roi_.roi_map = - (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); - for (unsigned int i = 0; i < roi_.rows; ++i) { - for (unsigned int j = 0; j < roi_.cols; ++j) { - if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && - j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { - roi_.roi_map[i * roi_.cols + j] = 1; - } - } - } - - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - - free(roi_.roi_map); -} - -TEST_P(DatarateTestRealTime, GFBoost) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 0; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_error_resilient = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // Apply a gf boost. - gf_boost_ = 50; - - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -class DatarateTestVP9Large - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateTestVP9Large() : EncoderTest(GET_PARAM(0)) {} - - protected: - virtual ~DatarateTestVP9Large() {} - - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - set_cpu_used_ = GET_PARAM(2); - ResetModel(); - } - - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - tot_frame_number_ = 0; - first_drop_ = 0; - num_drops_ = 0; - // Denoiser is off by default. - denoiser_on_ = 0; - // For testing up to 3 layers. - for (int i = 0; i < 3; ++i) { - bits_total_[i] = 0; - } - denoiser_offon_test_ = 0; - denoiser_offon_period_ = -1; - frame_parallel_decoding_mode_ = 1; - } - - // - // Frame flags and layer id for temporal layers. - // - - // For two layers, test pattern is: - // 1 3 - // 0 2 ..... - // For three layers, test pattern is: - // 1 3 5 7 - // 2 6 - // 0 4 .... - // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. - // For this 3 layer example, the 2nd enhancement layer (layer 2) updates - // the altref frame. - int SetFrameFlags(int frame_num, int num_temp_layers) { - int frame_flags = 0; - if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - // Layer 0: predict from L and ARF, update L. 
- frame_flags = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; - } else { - // Layer 1: predict from L, G and ARF, and update G. - frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | - VP8_EFLAG_NO_UPD_ENTROPY; - } - } else if (num_temp_layers == 3) { - if (frame_num % 4 == 0) { - // Layer 0: predict from L and ARF; update L. - frame_flags = - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; - } else if ((frame_num - 2) % 4 == 0) { - // Layer 1: predict from L, G, ARF; update G. - frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; - } else if ((frame_num - 1) % 2 == 0) { - // Layer 2: predict from L, G, ARF; update ARF. - frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; - } - } - return frame_flags; - } - - int SetLayerId(int frame_num, int num_temp_layers) { - int layer_id = 0; - if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - layer_id = 0; - } else { - layer_id = 1; - } - } else if (num_temp_layers == 3) { - if (frame_num % 4 == 0) { - layer_id = 0; - } else if ((frame_num - 2) % 4 == 0) { - layer_id = 1; - } else if ((frame_num - 1) % 2 == 0) { - layer_id = 2; - } - } - return layer_id; - } - - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); - - if (denoiser_offon_test_) { - ASSERT_GT(denoiser_offon_period_, 0) - << "denoiser_offon_period_ is not positive."; - if ((video->frame() + 1) % denoiser_offon_period_ == 0) { - // Flip denoiser_on_ periodically - denoiser_on_ ^= 1; - } - } - - encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); - encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); - encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, - frame_parallel_decoding_mode_); - - if (cfg_.ts_number_layers > 1) { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_SVC, 1); - } - vpx_svc_layer_id_t layer_id; - layer_id.spatial_layer_id = 0; - frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id = - SetLayerId(video->frame(), cfg_.ts_number_layers); - encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); - } - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - // Time since last timestamp = duration. - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - - if (duration > 1) { - // If first drop not set and we have a drop set it to this time. - if (!first_drop_) first_drop_ = last_pts_ + 1; - // Update the number of frame drops. - num_drops_ += static_cast(duration - 1); - // Update counter for total number of frames (#frames input to encoder). - // Needed for setting the proper layer_id below. - tot_frame_number_ += static_cast(duration - 1); - } - - int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers); - - // Add to the buffer the bits we'd expect from a constant bitrate server. - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); - - // Buffer should not go negative. - ASSERT_GE(bits_in_buffer_model_, 0) - << "Buffer Underrun at frame " << pkt->data.frame.pts; - - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - - // Update the total encoded bits. For temporal layers, update the cumulative - // encoded bits per layer. 
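SetLayerId encodes the decimation diagram above as arithmetic: with three temporal layers the assignment cycles with period four. Spelled out:

```c++
// Frame-to-layer mapping for 3 temporal layers (period 4), matching the
// "1 3 5 7 / 2 6 / 0 4" diagram in the comment above:
//   frame: 0 1 2 3 4 5 6 7 ...
//   layer: 0 2 1 2 0 2 1 2 ...
int LayerIdFor3Layers(int frame_num) {
  if (frame_num % 4 == 0) return 0;        // base layer: updates LAST
  if ((frame_num - 2) % 4 == 0) return 1;  // middle layer: updates GOLDEN
  return 2;                                // top layer: updates ALTREF
}
```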
- for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { - bits_total_[i] += frame_size_in_bits; - } - - // Update the most recent pts. - last_pts_ = pkt->data.frame.pts; - ++frame_number_; - ++tot_frame_number_; - } - - virtual void EndPassHook(void) { - for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); - ++layer) { - duration_ = (last_pts_ + 1) * timebase_; - if (bits_total_[layer]) { - // Effective file datarate: - effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_; - } - } - } - - vpx_codec_pts_t last_pts_; - double timebase_; - int frame_number_; // Counter for number of non-dropped/encoded frames. - int tot_frame_number_; // Counter for total number of input frames. - int64_t bits_total_[3]; - double duration_; - double effective_datarate_[3]; - int set_cpu_used_; - int64_t bits_in_buffer_model_; - vpx_codec_pts_t first_drop_; - int num_drops_; - int denoiser_on_; - int denoiser_offon_test_; - int denoiser_offon_period_; - int frame_parallel_decoding_mode_; -}; - -// Check basic rate targeting for VBR mode with 0 lag. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for VBR mode with non-zero lag. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - // For non-zero lag, rate control will work (be within bounds) for - // real-time mode. - if (deadline_ == VPX_DL_REALTIME) { - cfg_.g_lag_in_frames = 15; - } else { - cfg_.g_lag_in_frames = 0; - } - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for VBR mode with non-zero lag, with -// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs -// since error_resilience is off. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - // For non-zero lag, rate control will work (be within bounds) for - // real-time mode. 
- if (deadline_ == VPX_DL_REALTIME) { - cfg_.g_lag_in_frames = 15; - } else { - cfg_.g_lag_in_frames = 0; - } - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - frame_parallel_decoding_mode_ = 0; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for CBR mode. -TEST_P(DatarateTestVP9Large, BasicRateTargeting) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int i = 150; i < 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode -// off( and error_resilience off). -TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.g_error_resilient = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int i = 150; i < 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - frame_parallel_decoding_mode_ = 0; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. -TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - // Encode using multiple threads. - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic rate targeting for CBR. 
-TEST_P(DatarateTestVP9Large, BasicRateTargeting444) { - ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); - - cfg_.g_profile = 1; - cfg_.g_timebase = video.timebase(); - - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - - for (int i = 250; i < 900; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(static_cast(cfg_.rc_target_bitrate), - effective_datarate_[0] * 0.80) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(static_cast(cfg_.rc_target_bitrate), - effective_datarate_[0] * 1.15) - << " The datarate for the file missed the target!" - << cfg_.rc_target_bitrate << " " << effective_datarate_; - } -} - -// Check that (1) the first dropped frame gets earlier and earlier -// as the drop frame threshold is increased, and (2) that the total number of -// frame drops does not decrease as we increase frame drop threshold. -// Use a lower qp-max to force some frame drops. -TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_undershoot_pct = 20; - cfg_.rc_undershoot_pct = 20; - cfg_.rc_dropframe_thresh = 10; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 50; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.g_lag_in_frames = 0; - // TODO(marpan): Investigate datarate target failures with a smaller keyframe - // interval (128). - cfg_.kf_max_dist = 9999; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - const int kDropFrameThreshTestStep = 30; - for (int j = 50; j <= 150; j += 100) { - cfg_.rc_target_bitrate = j; - vpx_codec_pts_t last_drop = 140; - int last_num_drops = 0; - for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) - << " The datarate for the file is greater than target by too much!"; - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - ASSERT_GE(num_drops_, last_num_drops * 0.85) - << " The number of dropped frames for drop_thresh " << i - << " < number of dropped frames for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - last_num_drops = num_drops_; - } - } -} - -// Check basic rate targeting for 2 temporal layers. -TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1). 
- cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 2; - cfg_.ts_rate_decimator[0] = 2; - cfg_.ts_rate_decimator[1] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - if (deadline_ == VPX_DL_REALTIME) cfg_.g_error_resilient = 1; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - // 60-40 bitrate allocation for 2 temporal layers. - cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - } - } -} - -// Check basic rate targeting for 3 temporal layers. -TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). - cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - // 40-20-40 bitrate allocation for 3 temporal layers. - cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - // TODO(yaowu): Work out more stable rc control strategy and - // Adjust the thresholds to be tighter than .75. - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - // TODO(yaowu): Work out more stable rc control strategy and - // Adjust the thresholds to be tighter than 1.25. - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - } - } -} - -// Check basic rate targeting for 3 temporal layers, with frame dropping. -// Only for one (low) bitrate with lower max_quantizer, and somewhat higher -// frame drop threshold, to force frame dropping. -TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - // Set frame drop threshold and rc_max_quantizer to force some frame drops. - cfg_.rc_dropframe_thresh = 20; - cfg_.rc_max_quantizer = 45; - cfg_.rc_min_quantizer = 0; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). 
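Note that layer_target_bitrate[] is cumulative: entry i carries layer i plus every layer below it, so the "40-20-40" split appears in the code as 40%, 60%, and 100% of the total. A worked example for a hypothetical 600 kbps target:

```c++
#include <cstdio>

// Cumulative layer targets for a 600 kbps, 3-temporal-layer encode with the
// 40-20-40 split used by the test (the 600 kbps figure is illustrative).
int main() {
  const int total_kbps = 600;
  const int layer_target_kbps[3] = {
    40 * total_kbps / 100,  // 240: base layer alone
    60 * total_kbps / 100,  // 360: base + middle layer
    total_kbps              // 600: all three layers
  };
  for (int i = 0; i < 3; ++i) {
    printf("layer %d cumulative target: %d kbps\n", i, layer_target_kbps[i]);
  }
  return 0;
}
```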
- cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - cfg_.rc_target_bitrate = 200; - ResetModel(); - // 40-20-40 bitrate allocation for 3 temporal layers. - cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - // Expect some frame drops in this test: for this 200 frames test, - // expect at least 10% and not more than 60% drops. - ASSERT_GE(num_drops_, 20); - ASSERT_LE(num_drops_, 130); - } -} - -#if CONFIG_VP9_TEMPORAL_DENOISING -class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { - public: - virtual ~DatarateTestVP9LargeDenoiser() {} -}; - -// Check basic datarate targeting, for a single bitrate, when denoiser is on. -TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 300; - ResetModel(); - // Turn on the denoiser. - denoiser_on_ = 1; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is on, -// for clip with high noise level. Use 2 threads. -TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.g_threads = 2; - - ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 1000; - ResetModel(); - // Turn on the denoiser. 
-  denoiser_on_ = 1;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is on,
-// for 1280x720 clip with 4 threads.
-TEST_P(DatarateTestVP9LargeDenoiser, 4threads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 2;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.g_threads = 4;
-
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
-
-  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
-  // there is only one denoiser mode: denoiserYonly (which is 1),
-  // but may add more modes in the future.
-  cfg_.rc_target_bitrate = 1000;
-  ResetModel();
-  // Turn on the denoiser.
-  denoiser_on_ = 1;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29)
-      << " The datarate for the file is greater than target by too much!";
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is off
-// and on.
-TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 2;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
-
-  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
-  // there is only one denoiser mode: denoiserYonly (which is 1),
-  // but may add more modes in the future.
-  cfg_.rc_target_bitrate = 300;
-  ResetModel();
-  // The denoiser is off by default.
-  denoiser_on_ = 0;
-  // Set the offon test flag.
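// [Editorial sketch, not part of the patch] From memory of the test driver,
// the off/on flag makes PreEncodeFrameHook flip denoiser_on_ once every
// denoiser_offon_period_ frames; treat the exact phase as approximate. A
// closed-form rendition of that toggle (names are ours):
static int denoiser_state_for_frame(unsigned int frame, int period,
                                    int initial_on) {
  // Flips when (frame + 1) is a multiple of `period`, starting from
  // `initial_on`; e.g. period 100, initial 0: frames 0-98 off, 99-198 on, ...
  return initial_on ^ (static_cast<int>((frame + 1) / period) & 1);
}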
-  denoiser_offon_test_ = 1;
-  denoiser_offon_period_ = 100;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-#endif  // CONFIG_VP9_TEMPORAL_DENOISING
-
-class DatarateOnePassCbrSvc
-    : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
- public:
-  DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) {
-    memset(&svc_params_, 0, sizeof(svc_params_));
-  }
-  virtual ~DatarateOnePassCbrSvc() {}
-
- protected:
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    speed_setting_ = GET_PARAM(2);
-    ResetModel();
-  }
-  virtual void ResetModel() {
-    last_pts_ = 0;
-    duration_ = 0.0;
-    mismatch_psnr_ = 0.0;
-    mismatch_nframes_ = 0;
-    denoiser_on_ = 0;
-    tune_content_ = 0;
-    base_speed_setting_ = 5;
-    spatial_layer_id_ = 0;
-    temporal_layer_id_ = 0;
-    memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_));
-    memset(bits_total_, 0, sizeof(bits_total_));
-    memset(layer_target_avg_bandwidth_, 0,
-           sizeof(layer_target_avg_bandwidth_));
-  }
-  virtual void BeginPassHook(unsigned int /*pass*/) {}
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      int i;
-      for (i = 0; i < VPX_MAX_LAYERS; ++i) {
-        svc_params_.max_quantizers[i] = 63;
-        svc_params_.min_quantizers[i] = 0;
-      }
-      svc_params_.speed_per_layer[0] = base_speed_setting_;
-      for (i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
-        svc_params_.speed_per_layer[i] = speed_setting_;
-      }
-
-      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
-      encoder->Control(VP9E_SET_SVC, 1);
-      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
-      encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
-      encoder->Control(VP9E_SET_TILE_COLUMNS, 0);
-      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
-      encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
-      encoder->Control(VP9E_SET_ROW_MT, 1);
-      encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
-      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
-    }
-    const vpx_rational_t tb = video->timebase();
-    timebase_ = static_cast<double>(tb.num) / tb.den;
-    duration_ = 0;
-  }
-
-  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
-    vpx_svc_layer_id_t layer_id;
-    encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
-    spatial_layer_id_ = layer_id.spatial_layer_id;
-    temporal_layer_id_ = layer_id.temporal_layer_id;
-    // Update buffer with per-layer target frame bandwidth, this is done
-    // for every frame passed to the encoder (encoded or dropped).
-    // For temporal layers, update the cumulative buffer level.
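// [Editorial sketch, not part of the patch] The per-layer model maintained
// below is a classic leaky bucket: every frame interval credits the layer
// with its average per-frame budget, and each encoded frame debits its
// actual size (see FramePktHook further down). In isolation:
#include <stdint.h>
#include <stddef.h>
struct LeakyBucket {
  int64_t bits;
  // Called once per source frame: add the layer's average per-frame budget.
  void Tick(int target_avg_bandwidth) { bits += target_avg_bandwidth; }
  // Called when a frame is actually encoded: subtract its size in bits.
  void Encode(size_t frame_bytes) {
    bits -= static_cast<int64_t>(frame_bytes) * 8;
  }
  bool Underrun() const { return bits < 0; }
};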
-    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
-      for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
-        const int layer = sl * number_temporal_layers_ + tl;
-        bits_in_buffer_model_[layer] +=
-            static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
-      }
-    }
-  }
-
-  vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                         uint32_t sizes[8], int *count) {
-    uint8_t marker;
-    marker = *(data + data_sz - 1);
-    *count = 0;
-    if ((marker & 0xe0) == 0xc0) {
-      const uint32_t frames = (marker & 0x7) + 1;
-      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-      const size_t index_sz = 2 + mag * frames;
-      // This chunk is marked as having a superframe index but doesn't have
-      // enough data for it, thus it's an invalid superframe index.
-      if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
-      {
-        const uint8_t marker2 = *(data + data_sz - index_sz);
-        // This chunk is marked as having a superframe index but doesn't have
-        // the matching marker byte at the front of the index therefore it's an
-        // invalid chunk.
-        if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
-      }
-      {
-        uint32_t i, j;
-        const uint8_t *x = &data[data_sz - index_sz + 1];
-        for (i = 0; i < frames; ++i) {
-          uint32_t this_sz = 0;
-
-          for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
-          sizes[i] = this_sz;
-        }
-        *count = frames;
-      }
-    }
-    return VPX_CODEC_OK;
-  }
-
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    uint32_t sizes[8] = { 0 };
-    int count = 0;
-    last_pts_ = pkt->data.frame.pts;
-    const bool key_frame =
-        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
-    parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf),
-                           pkt->data.frame.sz, sizes, &count);
-    ASSERT_EQ(count, number_spatial_layers_);
-    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
-      sizes[sl] = sizes[sl] << 3;
-      // Update the total encoded bits per layer.
-      // For temporal layers, update the cumulative encoded bits per layer.
-      for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
-        const int layer = sl * number_temporal_layers_ + tl;
-        bits_total_[layer] += static_cast<int64_t>(sizes[sl]);
-        // Update the per-layer buffer level with the encoded frame size.
-        bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]);
-        // There should be no buffer underrun, except on the base
-        // temporal layer, since there may be key frames there.
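// [Editorial sketch, not part of the patch] parse_superframe_index() above
// keys off the VP9 superframe marker: the last byte of a superframe is
// 0b110_MM_FFF, where FFF+1 is the number of sub-frames (one per spatial
// layer here) and MM+1 the bytes used per size entry. Just the marker
// decode, in isolation:
#include <stddef.h>
#include <stdint.h>
static int read_superframe_marker(const uint8_t *data, size_t data_sz,
                                  int *frames, int *mag_bytes) {
  const uint8_t marker = data[data_sz - 1];
  if ((marker & 0xe0) != 0xc0) return 0;  // Not a superframe index.
  *frames = (marker & 0x7) + 1;
  *mag_bytes = ((marker >> 3) & 0x3) + 1;
  return 1;
}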
-        if (!key_frame && tl > 0) {
-          ASSERT_GE(bits_in_buffer_model_[layer], 0)
-              << "Buffer Underrun at frame " << pkt->data.frame.pts;
-        }
-      }
-    }
-  }
-
-  virtual void EndPassHook(void) {
-    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
-      for (int tl = 0; tl < number_temporal_layers_; ++tl) {
-        const int layer = sl * number_temporal_layers_ + tl;
-        const double file_size_in_kb = bits_total_[layer] / 1000.;
-        duration_ = (last_pts_ + 1) * timebase_;
-        file_datarate_[layer] = file_size_in_kb / duration_;
-      }
-    }
-  }
-
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
-    double mismatch_psnr = compute_psnr(img1, img2);
-    mismatch_psnr_ += mismatch_psnr;
-    ++mismatch_nframes_;
-  }
-
-  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
-
-  vpx_codec_pts_t last_pts_;
-  int64_t bits_in_buffer_model_[VPX_MAX_LAYERS];
-  double timebase_;
-  int64_t bits_total_[VPX_MAX_LAYERS];
-  double duration_;
-  double file_datarate_[VPX_MAX_LAYERS];
-  size_t bits_in_last_frame_;
-  vpx_svc_extra_cfg_t svc_params_;
-  int speed_setting_;
-  double mismatch_psnr_;
-  int mismatch_nframes_;
-  int denoiser_on_;
-  int tune_content_;
-  int base_speed_setting_;
-  int spatial_layer_id_;
-  int temporal_layer_id_;
-  int number_spatial_layers_;
-  int number_temporal_layers_;
-  int layer_target_avg_bandwidth_[VPX_MAX_LAYERS];
-};
-static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
-                                  const vpx_svc_extra_cfg_t *svc_params,
-                                  int spatial_layers, int temporal_layers,
-                                  int temporal_layering_mode,
-                                  int *layer_target_avg_bandwidth,
-                                  int64_t *bits_in_buffer_model) {
-  int sl, spatial_layer_target;
-  float total = 0;
-  float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
-  float framerate = 30.0;
-  for (sl = 0; sl < spatial_layers; ++sl) {
-    if (svc_params->scaling_factor_den[sl] > 0) {
-      alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 /
-                                svc_params->scaling_factor_den[sl]);
-      total += alloc_ratio[sl];
-    }
-  }
-  for (sl = 0; sl < spatial_layers; ++sl) {
-    enc_cfg->ss_target_bitrate[sl] = spatial_layer_target =
-        (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[sl] / total);
-    const int index = sl * temporal_layers;
-    if (temporal_layering_mode == 3) {
-      enc_cfg->layer_target_bitrate[index] = spatial_layer_target >> 1;
-      enc_cfg->layer_target_bitrate[index + 1] =
-          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
-      enc_cfg->layer_target_bitrate[index + 2] = spatial_layer_target;
-    } else if (temporal_layering_mode == 2) {
-      enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
-      enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
-    } else if (temporal_layering_mode <= 1) {
-      enc_cfg->layer_target_bitrate[index] = spatial_layer_target;
-    }
-  }
-  for (sl = 0; sl < spatial_layers; ++sl) {
-    for (int tl = 0; tl < temporal_layers; ++tl) {
-      const int layer = sl * temporal_layers + tl;
-      float layer_framerate = framerate;
-      if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
-      if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
-      if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
-      layer_target_avg_bandwidth[layer] = static_cast<int>(
-          enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate);
-      bits_in_buffer_model[layer] =
-          enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz;
-    }
-  }
-}
-
-static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg,
-                                    int number_spatial_layers,
-                                    int number_temporal_layers,
-                                    double *file_datarate,
-                                    double thresh_overshoot,
-                                    double thresh_undershoot) {
-  for (int sl = 0; sl < number_spatial_layers; ++sl)
-    for (int tl = 0; tl < number_temporal_layers; ++tl) {
-      const int layer = sl * number_temporal_layers + tl;
-      ASSERT_GE(cfg->layer_target_bitrate[layer],
-                file_datarate[layer] * thresh_overshoot)
-          << " The datarate for the file exceeds the target by too much!";
-      ASSERT_LE(cfg->layer_target_bitrate[layer],
-                file_datarate[layer] * thresh_undershoot)
-          << " The datarate for the file is lower than the target by too much!";
-    }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
-// temporal layer, with screen content mode on and same speed setting for all
-// layers.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 1;
-  cfg_.ts_rate_decimator[0] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 0;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.kf_max_dist = 9999;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
-  cfg_.rc_target_bitrate = 500;
-  ResetModel();
-  tune_content_ = 1;
-  base_speed_setting_ = speed_setting_;
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
-// 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.kf_max_dist = 9999;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
-                                       1, 0, 400);
-  // TODO(marpan): Check that effective_datarate for each layer hits the
-  // layer target_bitrate.
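// [Editorial sketch, not part of the patch] The pass/fail rule applied by
// CheckLayerRateTargeting() above, reduced to a predicate. With the usual
// (0.78, 1.15) thresholds the measured per-layer datarate may range over
// roughly [target / 1.15, target / 0.78]:
static bool rate_within_tolerance(double target_kbps, double measured_kbps,
                                  double thresh_overshoot,     // e.g. 0.78
                                  double thresh_undershoot) {  // e.g. 1.15
  return target_kbps >= measured_kbps * thresh_overshoot &&
         target_kbps <= measured_kbps * thresh_undershoot;
}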
-  for (int i = 200; i <= 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                            number_temporal_layers_, file_datarate_, 0.78,
-                            1.15);
-#if CONFIG_VP9_DECODER
-    // Number of temporal layers > 1, so half of the frames in this SVC
-    // pattern will be non-reference frames and hence the encoder will avoid
-    // loopfilter. Since frame dropper is off, we can expect 200 (half of the
-    // sequence) mismatched frames.
-    EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
-#endif
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC with denoising.
-// 2 spatial layers and 3 temporal layers. Run HD clip with 2 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 2;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.kf_max_dist = 9999;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
-                                       1, 0, 400);
-  // TODO(marpan): Check that effective_datarate for each layer hits the
-  // layer target_bitrate.
-  // For SVC, noise_sen = 1 means denoising only the top spatial layer,
-  // noise_sen = 2 means denoising the two top spatial layers.
-  for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
-    for (int i = 600; i <= 1000; i += 200) {
-      cfg_.rc_target_bitrate = i;
-      ResetModel();
-      denoiser_on_ = noise_sen;
-      assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                            cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                            layer_target_avg_bandwidth_,
-                            bits_in_buffer_model_);
-      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-      CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                              number_temporal_layers_, file_datarate_, 0.78,
-                              1.15);
-#if CONFIG_VP9_DECODER
-      // Number of temporal layers > 1, so half of the frames in this SVC
-      // pattern will be non-reference frames and hence the encoder will avoid
-      // loopfilter. Since frame dropper is off, we can expect 200 (half of
-      // the sequence) mismatched frames.
-      EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
-#endif
-    }
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
-// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.rc_target_bitrate = 400;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
-                                       1, 0, 400);
-  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
-  // 4 neighboring key frame periods (so key frame will land on 0-2-1-2).
-  for (int j = 64; j <= 67; j++) {
-    cfg_.kf_max_dist = j;
-    ResetModel();
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                            number_temporal_layers_, file_datarate_, 0.78,
-                            1.15);
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
-// 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 4;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.kf_max_dist = 9999;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
-  cfg_.rc_target_bitrate = 800;
-  ResetModel();
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
-#if CONFIG_VP9_DECODER
-  // Number of temporal layers > 1, so half of the frames in this SVC pattern
-  // will be non-reference frames and hence the encoder will avoid loopfilter.
-  // Since frame dropper is off, we can expect 30 (half of the sequence)
-  // mismatched frames.
-  EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
-#endif
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
-// 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 3;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 72;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 144;
-  svc_params_.scaling_factor_den[1] = 288;
-  svc_params_.scaling_factor_num[2] = 288;
-  svc_params_.scaling_factor_den[2] = 288;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.kf_max_dist = 9999;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
-                                       1, 0, 400);
-  cfg_.rc_target_bitrate = 800;
-  ResetModel();
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
-#if CONFIG_VP9_DECODER
-  // Number of temporal layers > 1, so half of the frames in this SVC pattern
-  // will be non-reference frames and hence the encoder will avoid loopfilter.
-  // Since frame dropper is off, we can expect 200 (half of the sequence)
-  // mismatched frames.
-  EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
-#endif
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
-// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 3;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 72;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 144;
-  svc_params_.scaling_factor_den[1] = 288;
-  svc_params_.scaling_factor_num[2] = 288;
-  svc_params_.scaling_factor_den[2] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.rc_target_bitrate = 800;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
-                                       1, 0, 400);
-  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
-  // 4 neighboring key frame periods (so key frame will land on 0-2-1-2).
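// [Editorial sketch, not part of the patch] Why four adjacent kf_max_dist
// values are enough: with temporal_layering_mode = 3 the temporal-layer
// assignment repeats with period 4 as TL 0-2-1-2, so key-frame distances of
// j, j+1, j+2, j+3 make the forced key frame land on each phase in turn.
static int temporal_layer_of_frame(int frame_index) {
  static const int kPattern[4] = { 0, 2, 1, 2 };  // 3-layer, period-4 pattern.
  return kPattern[frame_index % 4];
}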
-  for (int j = 32; j <= 35; j++) {
-    cfg_.kf_max_dist = j;
-    ResetModel();
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                            number_temporal_layers_, file_datarate_, 0.78,
-                            1.15);
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
-// 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 3;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 4;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 72;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 144;
-  svc_params_.scaling_factor_den[1] = 288;
-  svc_params_.scaling_factor_num[2] = 288;
-  svc_params_.scaling_factor_den[2] = 288;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.kf_max_dist = 9999;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
-  cfg_.rc_target_bitrate = 800;
-  ResetModel();
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
-#if CONFIG_VP9_DECODER
-  // Number of temporal layers > 1, so half of the frames in this SVC pattern
-  // will be non-reference frames and hence the encoder will avoid loopfilter.
-  // Since frame dropper is off, we can expect 30 (half of the sequence)
-  // mismatched frames.
-  EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
-#endif
-}
-
-// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial
-// downscale 5x5.
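// [Editorial sketch, not part of the patch] The scaling_factor_num/den pairs
// in the test below (256/1280 and 1280/1280) turn the 1280x720 source into a
// 256x144 base layer plus a full-resolution layer, i.e. the 5x5 downscale
// named above:
static void layer_dims(int w, int h, int num, int den, int *lw, int *lh) {
  *lw = w * num / den;  // 1280 * 256 / 1280 = 256
  *lh = h * num / den;  //  720 * 256 / 1280 = 144
}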
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 1;
-  cfg_.ts_rate_decimator[0] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 3;
-  cfg_.temporal_layering_mode = 0;
-  svc_params_.scaling_factor_num[0] = 256;
-  svc_params_.scaling_factor_den[0] = 1280;
-  svc_params_.scaling_factor_num[1] = 1280;
-  svc_params_.scaling_factor_den[1] = 1280;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.kf_max_dist = 999999;
-  cfg_.kf_min_dist = 0;
-  cfg_.ss_target_bitrate[0] = 300;
-  cfg_.ss_target_bitrate[1] = 1400;
-  cfg_.layer_target_bitrate[0] = 300;
-  cfg_.layer_target_bitrate[1] = 1400;
-  cfg_.rc_target_bitrate = 1700;
-  number_spatial_layers_ = cfg_.ss_number_layers;
-  number_temporal_layers_ = cfg_.ts_number_layers;
-  ResetModel();
-  layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
-  bits_in_buffer_model_[0] =
-      cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
-  layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
-  bits_in_buffer_model_[1] =
-      cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
-                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-}
-
-VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES,
-                          ::testing::Values(0));
-VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Values(-6, -12));
-VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
-                          ::testing::Values(::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kRealTime),
-                          ::testing::Range(2, 9));
-#if CONFIG_VP9_TEMPORAL_DENOISING
-VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 9));
-#endif
-VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 9));
-}  // namespace
diff --git a/libs/libvpx/test/dct16x16_test.cc b/libs/libvpx/test/dct16x16_test.cc
index ce0bd37b3d..9ccf2b84f1 100644
--- a/libs/libvpx/test/dct16x16_test.cc
+++ b/libs/libvpx/test/dct16x16_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -229,10 +230,9 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
-    Idct16x16Param;
+typedef std::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
+typedef std::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
+typedef std::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct16x16Param;
 
 void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                    int /*tx_type*/) {
@@ -744,7 +744,7 @@ TEST_P(InvTrans16x16DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
diff --git a/libs/libvpx/test/dct32x32_test.cc b/libs/libvpx/test/dct32x32_test.cc
index a95ff97328..94d6b37fa9 100644
--- a/libs/libvpx/test/dct32x32_test.cc
+++ b/libs/libvpx/test/dct32x32_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -18,6 +19,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -66,7 +68,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
 typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+typedef std::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
     Trans32x32Param;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -79,7 +81,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
+class Trans32x32Test : public AbstractBench,
+                       public ::testing::TestWithParam<Trans32x32Param> {
  public:
   virtual ~Trans32x32Test() {}
   virtual void SetUp() {
@@ -99,8 +102,14 @@ class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
   int mask_;
   FwdTxfmFunc fwd_txfm_;
   InvTxfmFunc inv_txfm_;
+
+  int16_t *bench_in_;
+  tran_low_t *bench_out_;
+  virtual void Run();
 };
 
+void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); }
+
 TEST_P(Trans32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   uint32_t max_error = 0;
@@ -237,6 +246,19 @@ TEST_P(Trans32x32Test, MemCheck) {
   }
 }
 
+TEST_P(Trans32x32Test, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+  bench_in_ = input_extreme_block;
+  bench_out_ = output_block;
+
+  RunNTimes(INT16_MAX);
+  PrintMedian("32x32");
+}
+
 TEST_P(Trans32x32Test, InverseAccuracy) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
@@ -292,7 +314,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
   }
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -371,7 +393,7 @@ INSTANTIATE_TEST_CASE_P(
     VSX, Trans32x32Test,
     ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx,
                                  0, VPX_BITS_8),
-                      make_tuple(&vpx_fdct32x32_rd_c,
+                      make_tuple(&vpx_fdct32x32_rd_vsx,
                                  &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/libs/libvpx/test/dct_partial_test.cc b/libs/libvpx/test/dct_partial_test.cc
index 4d145f5891..c889e92d70 100644
--- a/libs/libvpx/test/dct_partial_test.cc
+++ b/libs/libvpx/test/dct_partial_test.cc
@@ -11,8 +11,8 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include <limits>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -28,8 +28,8 @@
 using libvpx_test::ACMRandom;
 using libvpx_test::Buffer;
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using std::make_tuple;
+using std::tuple;
 
 namespace {
 typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride);
@@ -39,10 +39,14 @@ typedef tuple<PartialFdctFunc, int, vpx_bit_depth_t> PartialFdctParam;
 
 tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) {
   int64_t sum = 0;
-  for (int y = 0; y < size; ++y) {
-    for (int x = 0; x < size; ++x) {
-      sum += in.TopLeftPixel()[y * in.stride() + x];
+  if (in.TopLeftPixel() != NULL) {
+    for (int y = 0; y < size; ++y) {
+      for (int x = 0; x < size; ++x) {
+        sum += in.TopLeftPixel()[y * in.stride() + x];
+      }
     }
+  } else {
+    assert(0);
   }
 
   switch (size) {
@@ -77,21 +81,25 @@ class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
     Buffer<tran_low_t> output_block =
Buffer(size_, size_, 0, 16); ASSERT_TRUE(output_block.Init()); - for (int i = 0; i < 100; ++i) { - if (i == 0) { - input_block.Set(maxvalue); - } else if (i == 1) { - input_block.Set(minvalue); - } else { - input_block.Set(&rnd, minvalue, maxvalue); + if (output_block.TopLeftPixel() != NULL) { + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); } - - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), - output_block.TopLeftPixel(), - input_block.stride())); - - EXPECT_EQ(partial_fdct_ref(input_block, size_), - output_block.TopLeftPixel()[0]); + } else { + assert(0); } } diff --git a/libs/libvpx/test/dct_test.cc b/libs/libvpx/test/dct_test.cc index addbdfb463..6053aee542 100644 --- a/libs/libvpx/test/dct_test.cc +++ b/libs/libvpx/test/dct_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -28,8 +29,8 @@ using libvpx_test::ACMRandom; using libvpx_test::Buffer; -using std::tr1::tuple; -using std::tr1::make_tuple; +using std::make_tuple; +using std::tuple; namespace { typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); @@ -40,10 +41,60 @@ typedef void (*FhtFuncRef)(const Buffer &in, Buffer *out, int size, int tx_type); typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type); +typedef void (*IhtWithBdFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd); + +template +void fdct_wrapper(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + (void)tx_type; + fn(in, out, stride); +} + +template +void idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)tx_type; + (void)bd; + fn(in, out, stride); +} + +template +void iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)bd; + fn(in, out, stride, tx_type); +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIdctFunc)(const tran_low_t *in, uint16_t *out, int stride, + int bd); + +typedef void (*HighbdIhtFunc)(const tran_low_t *in, uint16_t *out, int stride, + int tx_type, int bd); + +template +void highbd_idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + (void)tx_type; + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} + +template +void highbd_iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +struct FuncInfo { + FhtFunc ft_func; + IhtWithBdFunc it_func; + int size; + int pixel_size; +}; /* forward transform, inverse transform, size, transform type, bit depth */ -typedef tuple DctParam; -typedef tuple HtParam; +typedef tuple DctParam; void fdct_ref(const Buffer &in, Buffer *out, int size, int /*tx_type*/) { @@ -81,128 +132,123 @@ void fwht_ref(const Buffer &in, Buffer *out, int size, vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); } -#if CONFIG_VP9_HIGHBITDEPTH -#define idctNxN(n, coeffs, bitdepth) \ - void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ - int stride) { \ - vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ - stride, 
bitdepth); \ - } - -idctNxN(4, 16, 10); -idctNxN(4, 16, 12); -idctNxN(8, 64, 10); -idctNxN(8, 64, 12); -idctNxN(16, 256, 10); -idctNxN(16, 256, 12); -idctNxN(32, 1024, 10); -idctNxN(32, 1024, 12); - -#define ihtNxN(n, coeffs, bitdepth) \ - void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ - int stride, int tx_type) { \ - vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ - stride, tx_type, bitdepth); \ - } - -ihtNxN(4, 16, 10); -ihtNxN(4, 16, 12); -ihtNxN(8, 64, 10); -ihtNxN(8, 64, 12); -ihtNxN(16, 256, 10); -// ihtNxN(16, 256, 12); - -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -class TransTestBase { +class TransTestBase : public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + virtual void SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); + const int idx = GET_PARAM(0); + const FuncInfo *func_info = &(GET_PARAM(1)[idx]); + tx_type_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + fwd_txfm_ = func_info->ft_func; + inv_txfm_ = func_info->it_func; + size_ = func_info->size; + pixel_size_ = func_info->pixel_size; + max_pixel_value_ = (1 << bit_depth_) - 1; + + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. + if (stride_ > 16) { + stride_ &= ~15; + } + + block_size_ = size_ * stride_; + + src_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_TRUE(src_ != NULL); + dst_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_TRUE(dst_ != NULL); + } + + virtual void TearDown() { + vpx_free(src_); + src_ = NULL; + vpx_free(dst_); + dst_ = NULL; + libvpx_test::ClearSystemState(); + } + + void InitMem() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + if (pixel_size_ == 1) { + for (int j = 0; j < block_size_; ++j) { + src_[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst_[j] = rnd_.Rand16() & max_pixel_value_; + } + } else { + ASSERT_EQ(pixel_size_, 2); + uint16_t *const src = reinterpret_cast(src_); + uint16_t *const dst = reinterpret_cast(dst_); + for (int j = 0; j < block_size_; ++j) { + src[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst[j] = rnd_.Rand16() & max_pixel_value_; + } + } + } + + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, stride_, tx_type_, bit_depth_); + } protected: - virtual void RunFwdTxfm(const Buffer &in, - Buffer *out) = 0; - - virtual void RunInvTxfm(const Buffer &in, uint8_t *out) = 0; - void RunAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); Buffer test_input_block = Buffer(size_, size_, 8, size_ == 4 ? 
0 : 16); ASSERT_TRUE(test_input_block.Init()); + ASSERT_TRUE(test_input_block.TopLeftPixel() != NULL); Buffer test_temp_block = Buffer(size_, size_, 0, 16); ASSERT_TRUE(test_temp_block.Init()); - Buffer dst = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(dst.Init()); - Buffer src = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(src.Init()); -#if CONFIG_VP9_HIGHBITDEPTH - Buffer dst16 = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(dst16.Init()); - Buffer src16 = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(src16.Init()); -#endif // CONFIG_VP9_HIGHBITDEPTH uint32_t max_error = 0; int64_t total_error = 0; const int count_test_block = 10000; for (int i = 0; i < count_test_block; ++i) { - if (bit_depth_ == 8) { - src.Set(&rnd, &ACMRandom::Rand8); - dst.Set(&rnd, &ACMRandom::Rand8); - // Initialize a test block with input range [-255, 255]. - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { + InitMem(); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = - src.TopLeftPixel()[h * src.stride() + w] - - dst.TopLeftPixel()[h * dst.stride() + w]; + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src[h * stride_ + w] - dst[h * stride_ + w]; } } -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16.Set(&rnd, 0, max_pixel_value_); - dst16.Set(&rnd, 0, max_pixel_value_); - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { - test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = - src16.TopLeftPixel()[h * src16.stride() + w] - - dst16.TopLeftPixel()[h * dst16.stride() + w]; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH } ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, dst.TopLeftPixel())); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); -#endif // CONFIG_VP9_HIGHBITDEPTH - } + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst_)); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { int diff; -#if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth_ != 8) { - diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - - src16.TopLeftPixel()[h * src16.stride() + w]; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; } else { -#endif // CONFIG_VP9_HIGHBITDEPTH - diff = dst.TopLeftPixel()[h * dst.stride() + w] - - src.TopLeftPixel()[h * src.stride() + w]; -#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; } -#endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; if (max_error < error) max_error = error; total_error += error; @@ -211,14 +257,18 @@ class TransTestBase { } EXPECT_GE(static_cast(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has an individual round trip error > " + << limit; EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " 
per block"; + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has average round trip error > " + << limit << " per block"; } void RunCoeffCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 5000; Buffer input_block = @@ -248,6 +298,7 @@ class TransTestBase { } void RunMemCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 5000; Buffer input_extreme_block = @@ -265,6 +316,7 @@ class TransTestBase { } else if (i == 1) { input_extreme_block.Set(-max_pixel_value_); } else { + ASSERT_TRUE(input_extreme_block.TopLeftPixel() != NULL); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { input_extreme_block @@ -279,13 +331,14 @@ class TransTestBase { // The minimum quant value is 4. EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + ASSERT_TRUE(output_block.TopLeftPixel() != NULL); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { EXPECT_GE( 4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) - << "Error: 4x4 FDCT has coefficient larger than " - "4*DCT_MAX_VALUE" + << "Error: " << size_ << "x" << size_ + << " transform has coefficient larger than 4*DCT_MAX_VALUE" << " at " << w << "," << h; if (::testing::Test::HasFailure()) { printf("Size: %d Transform type: %d\n", size_, tx_type_); @@ -298,6 +351,7 @@ class TransTestBase { } void RunInvAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; Buffer in = Buffer(size_, size_, 4); @@ -314,100 +368,85 @@ class TransTestBase { ASSERT_TRUE(src16.Init()); for (int i = 0; i < count_test_block; ++i) { + InitMem(); + ASSERT_TRUE(in.TopLeftPixel() != NULL); // Initialize a test block with input range [-max_pixel_value_, // max_pixel_value_]. 
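// [Editorial sketch, not part of the patch] The input to the inverse-accuracy
// run is a residual: the difference of two pixel buffers that each lie in
// [0, (1 << bit_depth) - 1], which yields the stated range. In isolation:
#include <stdint.h>
static int16_t residual(uint16_t src, uint16_t dst) {
  // Spans [-255, 255] at 8 bits, [-1023, 1023] at 10, [-4095, 4095] at 12.
  return static_cast<int16_t>(src) - static_cast<int16_t>(dst);
}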
- if (bit_depth_ == VPX_BITS_8) { - src.Set(&rnd, &ACMRandom::Rand8); - dst.Set(&rnd, &ACMRandom::Rand8); - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { in.TopLeftPixel()[h * in.stride() + w] = - src.TopLeftPixel()[h * src.stride() + w] - - dst.TopLeftPixel()[h * dst.stride() + w]; + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + in.TopLeftPixel()[h * in.stride() + w] = + src[h * stride_ + w] - dst[h * stride_ + w]; } } -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16.Set(&rnd, 0, max_pixel_value_); - dst16.Set(&rnd, 0, max_pixel_value_); - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { - in.TopLeftPixel()[h * in.stride() + w] = - src16.TopLeftPixel()[h * src16.stride() + w] - - dst16.TopLeftPixel()[h * dst16.stride() + w]; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH } fwd_txfm_ref(in, &coeff, size_, tx_type_); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel())); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); -#endif // CONFIG_VP9_HIGHBITDEPTH - } + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst_)); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { int diff; -#if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth_ != 8) { - diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - - src16.TopLeftPixel()[h * src16.stride() + w]; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; } else { -#endif // CONFIG_VP9_HIGHBITDEPTH - diff = dst.TopLeftPixel()[h * dst.stride() + w] - - src.TopLeftPixel()[h * src.stride() + w]; -#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; } -#endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; EXPECT_GE(static_cast(limit), error) - << "Error: " << size_ << "x" << size_ << " IDCT has error " - << error << " at " << w << "," << h; + << "Error: " << size_ << "x" << size_ + << " inverse transform has error " << error << " at " << w << "," + << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + return; + } } } } } + FhtFunc fwd_txfm_; FhtFuncRef fwd_txfm_ref; + IhtWithBdFunc inv_txfm_; + ACMRandom rnd_; + uint8_t *src_; + uint8_t *dst_; vpx_bit_depth_t bit_depth_; int tx_type_; int max_pixel_value_; int size_; + int stride_; + int pixel_size_; + int block_size_; }; -class TransDCT : public TransTestBase, - public ::testing::TestWithParam { +/* -------------------------------------------------------------------------- */ + +class TransDCT : public TransTestBase { public: - TransDCT() { - fwd_txfm_ref = fdct_ref; - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - size_ = GET_PARAM(2); - tx_type_ = GET_PARAM(3); - bit_depth_ = GET_PARAM(4); - max_pixel_value_ = (1 << bit_depth_) - 1; - } - - protected: - void RunFwdTxfm(const Buffer &in, Buffer *out) { - fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); - } - - void RunInvTxfm(const Buffer &in, uint8_t *out) { - inv_txfm_(in.TopLeftPixel(), out, in.stride()); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; + 
TransDCT() { fwd_txfm_ref = fdct_ref; } }; -TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); } +TEST_P(TransDCT, AccuracyCheck) { + int t = 1; + if (size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 2; + } else if (size_ == 32 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 7; + } + RunAccuracyCheck(t); +} TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } @@ -415,177 +454,150 @@ TEST_P(TransDCT, MemCheck) { RunMemCheck(); } TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } +static const FuncInfo dct_c_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, TransDCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_10), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_10), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, TransDCT, - ::testing::Values( - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_SSE2 -#if !CONFIG_EMULATE_HARDWARE -#if CONFIG_VP9_HIGHBITDEPTH -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12), -*/ -INSTANTIATE_TEST_CASE_P( - SSE2, TransDCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0, - VPX_BITS_10), - make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0, - VPX_BITS_10), - make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12), - make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0, - VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - SSE2, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, - &vpx_idct32x32_1024_add_sse2, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_sse2, - &vpx_idct16x16_256_add_sse2, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, - 0, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // !CONFIG_EMULATE_HARDWARE -#endif // 
HAVE_SSE2 - -#if !CONFIG_VP9_HIGHBITDEPTH -#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE -#if !ARCH_X86_64 -// TODO(johannkoenig): high bit depth fdct8x8. -INSTANTIATE_TEST_CASE_P( - SSSE3, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, - 32, 0, VPX_BITS_8), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, - VPX_BITS_8))); -#else -// vpx_fdct8x8_ssse3 is only available in 64 bit builds. -INSTANTIATE_TEST_CASE_P( - SSSE3, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, - 32, 0, VPX_BITS_8), - make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, - 8, 0, VPX_BITS_8))); -#endif // !ARCH_X86_64 -#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE -#endif // !CONFIG_VP9_HIGHBITDEPTH - -#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE -// TODO(johannkoenig): high bit depth fdct32x32. -INSTANTIATE_TEST_CASE_P( - AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, - &vpx_idct32x32_1024_add_sse2, - 32, 0, VPX_BITS_8))); - -#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE - -#if HAVE_NEON -#if !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - NEON, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_neon, - &vpx_idct32x32_1024_add_neon, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_neon, - &vpx_idct16x16_256_add_neon, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4, - 0, VPX_BITS_8))); -#endif // !CONFIG_EMULATE_HARDWARE -#endif // HAVE_NEON - -#if HAVE_MSA -#if !CONFIG_VP9_HIGHBITDEPTH -#if !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - MSA, TransDCT, - ::testing::Values( - make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0, - VPX_BITS_8))); -#endif // !CONFIG_EMULATE_HARDWARE -#endif // !CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_MSA - -#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(VSX, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_vsx, 4, - 0, VPX_BITS_8))); -#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -class TransHT : public TransTestBase, public ::testing::TestWithParam { - public: - TransHT() { - fwd_txfm_ref = fht_ref; - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - size_ = GET_PARAM(2); - tx_type_ = GET_PARAM(3); - bit_depth_ = GET_PARAM(4); - max_pixel_value_ = (1 << bit_depth_) - 1; - } - - protected: - void RunFwdTxfm(const Buffer &in, Buffer *out) { - fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); - } - - void RunInvTxfm(const Buffer &in, uint8_t *out) { - inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_); - } - - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, + { &fdct_wrapper, &idct_wrapper, 16, + 1 }, + { &fdct_wrapper, &idct_wrapper, 32, + 1 } }; -TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } 
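// [Editorial sketch, not part of the patch] The refactor visible in the
// instantiations below replaces long make_tuple lists with a table of
// FuncInfo entries plus ::testing::Combine: parameter 0 is an index into the
// table passed as parameter 1. A minimal, self-contained rendition of the
// same pattern with hypothetical names (MyFuncInfo, MyTest, kFuncs):
#include <tuple>
#include "third_party/googletest/src/include/gtest/gtest.h"
struct MyFuncInfo { int size; };
static const MyFuncInfo kFuncs[] = { { 4 }, { 8 } };
class MyTest : public ::testing::TestWithParam<
                   std::tuple<int, const MyFuncInfo *, int> > {};
TEST_P(MyTest, Index) {
  // Element 0 indexes into the table carried in element 1.
  const MyFuncInfo *info = &std::get<1>(GetParam())[std::get<0>(GetParam())];
  EXPECT_GT(info->size, 0);
}
INSTANTIATE_TEST_CASE_P(
    All, MyTest,
    ::testing::Combine(::testing::Range(0, 2), ::testing::Values(kFuncs),
                       ::testing::Values(0)));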
+INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_c_func_info) / + sizeof(dct_c_func_info[0]))), + ::testing::Values(dct_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 +static const FuncInfo dct_sse2_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; + +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_sse2_func_info) / + sizeof(dct_sse2_func_info[0]))), + ::testing::Values(dct_sse2_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +static const FuncInfo dct_ssse3_func_info = { + &fdct_wrapper, &idct_wrapper, 8, 1 +}; + +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_CASE_P(SSSE3, TransDCT, + ::testing::Values(make_tuple(0, &dct_ssse3_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 + +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_avx2_func_info = { + &fdct_wrapper, &idct_wrapper, + 32, 1 +}; + +// TODO(johannkoenig): high bit depth fdct32x32. +INSTANTIATE_TEST_CASE_P(AVX2, TransDCT, + ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON +static const FuncInfo dct_neon_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; + +INSTANTIATE_TEST_CASE_P( + NEON, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_neon_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_NEON + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_msa_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; + +INSTANTIATE_TEST_CASE_P(MSA, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_msa_func_info), + ::testing::Values(0), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransDCT, + ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && + +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ + +class TransHT : public TransTestBase { + public: + TransHT() { fwd_txfm_ref = fht_ref; } +}; + +TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2 ? 
2 : 1); +} TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } @@ -593,117 +605,109 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); } TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12), - */ +static const FuncInfo ht_c_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper, + 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + INSTANTIATE_TEST_CASE_P( C, TransHT, - ::testing::Values( - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); -#else + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_c_func_info) / + sizeof(ht_c_func_info[0]))), + ::testing::Values(ht_c_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON + +static const FuncInfo ht_neon_func_info[] = { +#if 
CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + INSTANTIATE_TEST_CASE_P( - C, TransHT, - ::testing::Values( - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), - - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), - - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH + NEON, TransHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_neon_func_info) / + sizeof(ht_neon_func_info[0]))), + ::testing::Values(ht_neon_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_NEON #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, TransHT, - ::testing::Values( - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0, - VPX_BITS_8), - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1, - VPX_BITS_8), - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2, - VPX_BITS_8), - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3, - VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8), +static const FuncInfo ht_sse2_func_info[3] = { + { &vp9_fht4x4_sse2, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_sse2, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_sse2, &iht_wrapper, 16, 1 } +}; - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3, - VPX_BITS_8))); +INSTANTIATE_TEST_CASE_P(SSE2, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse2_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); #endif // HAVE_SSE2 -class TransWHT : public TransTestBase, - public ::testing::TestWithParam { +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_sse4_1_func_info[3] = { + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, + 4, 2 }, + { vp9_highbd_fht8x8_c, &highbd_iht_wrapper, + 8, 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 } +}; + +INSTANTIATE_TEST_CASE_P( + SSE4_1, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse4_1_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, + VPX_BITS_12))); +#endif 
// HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_vsx_func_info[3] = { + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_vsx_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_VSX +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ + +class TransWHT : public TransTestBase { public: - TransWHT() { - fwd_txfm_ref = fwht_ref; - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - size_ = GET_PARAM(2); - tx_type_ = GET_PARAM(3); - bit_depth_ = GET_PARAM(4); - max_pixel_value_ = (1 << bit_depth_) - 1; - } - - protected: - void RunFwdTxfm(const Buffer &in, Buffer *out) { - fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); - } - - void RunInvTxfm(const Buffer &in, uint8_t *out) { - inv_txfm_(in.TopLeftPixel(), out, in.stride()); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; + TransWHT() { fwd_txfm_ref = fwht_ref; } }; TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } @@ -714,24 +718,39 @@ TEST_P(TransWHT, MemCheck) { RunMemCheck(); } TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } +static const FuncInfo wht_c_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 } +}; + INSTANTIATE_TEST_CASE_P( C, TransWHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, TransWHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 4, - 0, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(wht_c_func_info) / + sizeof(wht_c_func_info[0]))), + ::testing::Values(wht_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +static const FuncInfo wht_sse2_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; -#if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_sse2, - &vpx_iwht4x4_16_add_sse2, - 4, 0, VPX_BITS_8))); -#endif // HAVE_SSE2 + ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/decode_api_test.cc b/libs/libvpx/test/decode_api_test.cc index 4167cf3e0f..d4b67ccdb8 100644 --- a/libs/libvpx/test/decode_api_test.cc +++ b/libs/libvpx/test/decode_api_test.cc @@ -138,8 +138,30 @@ TEST(DecodeAPI, Vp9InvalidDecode) { EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); } -TEST(DecodeAPI, Vp9PeekSI) { +void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, + uint32_t peek_size) { const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + // Verify 
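Editor's note: every instantiation in the rewritten dct_test.cc follows the same pattern — each test instance is a tuple of (row index into a per-ISA function table, pointer to that table, tx_type, bit depth), built with ::testing::Combine. A minimal, self-contained sketch of the mechanism (illustrative names only; this is not part of the patch):

#include <tuple>
#include "third_party/googletest/src/include/gtest/gtest.h"

struct FuncInfo {
  int size;        // transform dimension, e.g. 4/8/16/32
  int pixel_size;  // 1 for 8-bit pixels, 2 for high bit depth
};

static const FuncInfo kTable[] = { { 4, 1 }, { 8, 1 }, { 16, 1 }, { 32, 1 } };

class TableDrivenTest
    : public ::testing::TestWithParam<std::tuple<int, const FuncInfo *> > {};

TEST_P(TableDrivenTest, PicksRow) {
  // Range() supplies the row index, Values() the (decayed) table pointer.
  const FuncInfo &fn = std::get<1>(GetParam())[std::get<0>(GetParam())];
  EXPECT_GT(fn.size, 0);
}

INSTANTIATE_TEST_CASE_P(
    C, TableDrivenTest,
    ::testing::Combine(::testing::Range(0, 4), ::testing::Values(kTable)));

Keeping the function pointers in static tables rather than in the tuple itself is what lets one fixture cover every ISA: only the table and the Range() bound change per instantiation.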
diff --git a/libs/libvpx/test/decode_api_test.cc b/libs/libvpx/test/decode_api_test.cc
index 4167cf3e0f..d4b67ccdb8 100644
--- a/libs/libvpx/test/decode_api_test.cc
+++ b/libs/libvpx/test/decode_api_test.cc
@@ -138,8 +138,30 @@ TEST(DecodeAPI, Vp9InvalidDecode) {
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
 }
 
-TEST(DecodeAPI, Vp9PeekSI) {
+void TestPeekInfo(const uint8_t *const data, uint32_t data_sz,
+                  uint32_t peek_size) {
   const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
+  // to decoder_peek_si_internal on frames of size < 8.
+  if (data_sz >= 8) {
+    vpx_codec_ctx_t dec;
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
+    EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM
+                                    : VPX_CODEC_CORRUPT_FRAME,
+              vpx_codec_decode(&dec, data, data_sz, NULL, 0));
+    vpx_codec_iter_t iter = NULL;
+    EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
+  }
+
+  // Verify behavior of vpx_codec_peek_stream_info.
+  vpx_codec_stream_info_t si;
+  si.sz = sizeof(si);
+  EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
+            vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+}
+
+TEST(DecodeAPI, Vp9PeekStreamInfo) {
   // The first 9 bytes are valid and the rest of the bytes are made up. Until
   // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it
   // should return VPX_CODEC_CORRUPT_FRAME.
@@ -150,24 +172,18 @@ TEST(DecodeAPI, Vp9PeekSI) {
   };
 
   for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) {
-    // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
-    // to decoder_peek_si_internal on frames of size < 8.
-    if (data_sz >= 8) {
-      vpx_codec_ctx_t dec;
-      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
-      EXPECT_EQ(
-          (data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_CORRUPT_FRAME,
-          vpx_codec_decode(&dec, data, data_sz, NULL, 0));
-      vpx_codec_iter_t iter = NULL;
-      EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
-      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
-    }
+    TestPeekInfo(data, data_sz, 10);
+  }
+}
 
-    // Verify behavior of vpx_codec_peek_stream_info.
-    vpx_codec_stream_info_t si;
-    si.sz = sizeof(si);
-    EXPECT_EQ((data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
-              vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+TEST(DecodeAPI, Vp9PeekStreamInfoTruncated) {
+  // This profile 1 header requires 10.25 bytes, ensure
+  // vpx_codec_peek_stream_info doesn't over-read.
+  const uint8_t profile1_data[10] = { 0xa4, 0xe9, 0x30, 0x68, 0x53,
+                                      0xe9, 0x30, 0x68, 0x53, 0x04 };
+
+  for (uint32_t data_sz = 1; data_sz <= 10; ++data_sz) {
+    TestPeekInfo(profile1_data, data_sz, 11);
   }
 }
 #endif  // CONFIG_VP9_DECODER
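Editor's note: TestPeekInfo() above exercises both decode paths; for reference, this is roughly how an application calls the stream-info API outside the test harness (a hedged sketch, not part of the patch; error handling trimmed):

#include <stdio.h>
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static void peek_example(const uint8_t *buf, unsigned int sz) {
  vpx_codec_stream_info_t si;
  si.sz = sizeof(si);  // must be initialized before the call
  if (vpx_codec_peek_stream_info(vpx_codec_vp9_dx(), buf, sz, &si) ==
      VPX_CODEC_OK) {
    printf("%ux%u keyframe=%u\n", si.w, si.h, si.is_kf);
  }
}

A short header prefix is enough for a successful peek; the truncated-profile-1 test above exists precisely to prove the parser stops at the end of the supplied buffer instead of over-reading.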
diff --git a/libs/libvpx/test/decode_corrupted.cc b/libs/libvpx/test/decode_corrupted.cc
new file mode 100644
index 0000000000..b1495ce89f
--- /dev/null
+++ b/libs/libvpx/test/decode_corrupted.cc
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/i420_video_source.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+class DecodeCorruptedFrameTest
+    : public ::libvpx_test::EncoderTest,
+      public ::testing::TestWithParam<
+          std::tuple<const libvpx_test::CodecFactory *> > {
+ public:
+  DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {}
+
+ protected:
+  virtual ~DecodeCorruptedFrameTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+
+    // Set small key frame distance such that we insert more key frames.
+    cfg_.kf_max_dist = 3;
+    dec_cfg_.threads = 1;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7);
+  }
+
+  virtual void MismatchHook(const vpx_image_t * /*img1*/,
+                            const vpx_image_t * /*img2*/) {}
+
+  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) {
+    // Don't edit frame packet on key frame.
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt;
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
+
+    memcpy(&modified_pkt_, pkt, sizeof(*pkt));
+
+    // Halve the size so it's corrupted to decoder.
+    modified_pkt_.data.frame.sz = modified_pkt_.data.frame.sz / 2;
+
+    return &modified_pkt_;
+  }
+
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const libvpx_test::VideoSource & /*video*/,
+                                  libvpx_test::Decoder *decoder) {
+    EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError();
+    return VPX_CODEC_MEM_ERROR != res_dec;
+  }
+
+  vpx_codec_cx_pkt_t modified_pkt_;
+};
+
+TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) {
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9
+INSTANTIATE_TEST_CASE_P(
+    VP9, DecodeCorruptedFrameTest,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif  // CONFIG_VP9
+
+#if CONFIG_VP8
+INSTANTIATE_TEST_CASE_P(
+    VP8, DecodeCorruptedFrameTest,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)));
+#endif  // CONFIG_VP8
+
+}  // namespace
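Editor's note: the corruption technique in decode_corrupted.cc is packet truncation via MutateEncoderOutputHook(). The same idea in isolation (a sketch assuming the caller owns a static packet copy; not part of the patch):

#include "vpx/vpx_encoder.h"

static vpx_codec_cx_pkt_t modified;

static const vpx_codec_cx_pkt_t *TruncatePacket(const vpx_codec_cx_pkt_t *pkt) {
  // Leave key frames and non-frame packets intact so decode can resync.
  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
  if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt;
  modified = *pkt;
  modified.data.frame.sz /= 2;  // hand the decoder only half the payload
  return &modified;
}

Because kf_max_dist is set to 3 above, a fresh key frame arrives every few frames, so the decoder repeatedly hits the corrupt-inter-frame path instead of failing once and stopping.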
diff --git a/libs/libvpx/test/decode_perf_test.cc b/libs/libvpx/test/decode_perf_test.cc
index ee26c3c046..aecdd3e999 100644
--- a/libs/libvpx/test/decode_perf_test.cc
+++ b/libs/libvpx/test/decode_perf_test.cc
@@ -9,6 +9,8 @@
  */
 
 #include <string>
+#include <tuple>
+
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/encode_test_driver.h"
@@ -21,7 +23,7 @@
 #include "./ivfenc.h"
 #include "./vpx_version.h"
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 namespace {
 
@@ -34,7 +36,7 @@ const char kNewEncodeOutputFile[] = "new_encode.ivf";
 /*
  DecodePerfTest takes a tuple of filename + number of threads to decode with
  */
-typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam;
+typedef std::tuple<const char *, unsigned> DecodePerfParam;
 
 const DecodePerfParam kVP9DecodePerfVectors[] = {
   make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1),
@@ -137,7 +139,7 @@ class VP9NewEncodeDecodePerfTest
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, speed_);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
diff --git a/libs/libvpx/test/decode_svc_test.cc b/libs/libvpx/test/decode_svc_test.cc
index 69f62f13bd..c6f0873f89 100644
--- a/libs/libvpx/test/decode_svc_test.cc
+++ b/libs/libvpx/test/decode_svc_test.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
 #include "test/codec_factory.h"
@@ -53,7 +54,7 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest,
 // number of frames decoded. This results in 1/4x1/4 resolution (320x180).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::IVFVideoSource> video;
+  std::unique_ptr<libvpx_test::IVFVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
@@ -70,7 +71,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
 // number of frames decoded. This results in 1/2x1/2 resolution (640x360).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::IVFVideoSource> video;
+  std::unique_ptr<libvpx_test::IVFVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
@@ -87,7 +88,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
 // number of frames decoded. This results in the full resolution (1280x720).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::IVFVideoSource> video;
+  std::unique_ptr<libvpx_test::IVFVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
@@ -105,7 +106,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
 // the decoding should result in the full resolution (1280x720).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::IVFVideoSource> video;
+  std::unique_ptr<libvpx_test::IVFVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
diff --git a/libs/libvpx/test/decode_test_driver.cc b/libs/libvpx/test/decode_test_driver.cc
index 48680eb8e9..ae23587759 100644
--- a/libs/libvpx/test/decode_test_driver.cc
+++ b/libs/libvpx/test/decode_test_driver.cc
@@ -52,9 +52,10 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
     /* Vp8's implementation of PeekStream returns an error if the frame you
      * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
      * frame, which must be a keyframe. */
-    if (video->frame_number() == 0)
+    if (video->frame_number() == 0) {
       ASSERT_EQ(VPX_CODEC_OK, res_peek)
           << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
+    }
   } else {
     /* The Vp9 implementation of PeekStream returns an error only if the
      * data passed to it isn't a valid Vp9 chunk. */
@@ -97,7 +98,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
       const vpx_image_t *img = NULL;
 
       // Get decompressed data
-      while ((img = dec_iter.Next())) {
+      while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) {
         DecompressedFrameHook(*img, video->frame_number());
       }
     }
diff --git a/libs/libvpx/test/decode_test_driver.h b/libs/libvpx/test/decode_test_driver.h
index 644fc9e90d..04876cdd7c 100644
--- a/libs/libvpx/test/decode_test_driver.h
+++ b/libs/libvpx/test/decode_test_driver.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_DECODE_TEST_DRIVER_H_
-#define TEST_DECODE_TEST_DRIVER_H_
+#ifndef VPX_TEST_DECODE_TEST_DRIVER_H_
+#define VPX_TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
@@ -159,4 +159,4 @@ class DecoderTest {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_DECODE_TEST_DRIVER_H_
+#endif  // VPX_TEST_DECODE_TEST_DRIVER_H_
diff --git a/libs/libvpx/test/encode_perf_test.cc b/libs/libvpx/test/encode_perf_test.cc
index 0bb435502b..142d9e2da8 100644
--- a/libs/libvpx/test/encode_perf_test.cc
+++ b/libs/libvpx/test/encode_perf_test.cc
@@ -48,7 +48,7 @@ const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
   EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
 };
 
-const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 };
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 };
 const int kEncodePerfTestThreads[] = { 1, 2, 4 };
 
 #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0]))
diff --git a/libs/libvpx/test/encode_test_driver.cc b/libs/libvpx/test/encode_test_driver.cc
index b2cbc3f05b..8fdbdb62ae 100644
--- a/libs/libvpx/test/encode_test_driver.cc
+++ b/libs/libvpx/test/encode_test_driver.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -128,6 +129,8 @@ static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) {
   bool match = (img1->fmt == img2->fmt) && (img1->cs == img2->cs) &&
                (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h);
 
+  if (!match) return false;
+
   const unsigned int width_y = img1->d_w;
   const unsigned int height_y = img1->d_h;
   unsigned int i;
@@ -177,7 +180,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
     }
 
     BeginPassHook(pass);
-    testing::internal::scoped_ptr<Encoder> encoder(
+    std::unique_ptr<Encoder> encoder(
         codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_));
     ASSERT_TRUE(encoder.get() != NULL);
 
@@ -191,7 +194,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
       if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) {
         dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
       }
-      testing::internal::scoped_ptr<Decoder> decoder(
+      std::unique_ptr<Decoder> decoder(
           codec_->CreateDecoder(dec_cfg, dec_init_flags));
       bool again;
       for (again = true; again; video->Next()) {
@@ -214,6 +217,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
           case VPX_CODEC_CX_FRAME_PKT:
             has_cxdata = true;
             if (decoder.get() != NULL && DoDecode()) {
+              PreDecodeFrameHook(video, decoder.get());
               vpx_codec_err_t res_dec = decoder->DecodeFrame(
                   (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
diff --git a/libs/libvpx/test/encode_test_driver.h b/libs/libvpx/test/encode_test_driver.h
index 89a3b1767e..3edba4b926 100644
--- a/libs/libvpx/test/encode_test_driver.h
+++ b/libs/libvpx/test/encode_test_driver.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_ENCODE_TEST_DRIVER_H_
-#define TEST_ENCODE_TEST_DRIVER_H_
+#ifndef VPX_TEST_ENCODE_TEST_DRIVER_H_
+#define VPX_TEST_ENCODE_TEST_DRIVER_H_
 
 #include <string>
 #include <vector>
@@ -128,24 +128,37 @@ class Encoder {
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
   void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
+
+  void Control(int ctrl_id, struct vpx_svc_frame_drop *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct vpx_svc_spatial_layer_sync *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
   void Control(int ctrl_id, vpx_active_map_t *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
-#endif
-#if CONFIG_VP8_ENCODER
+
   void Control(int ctrl_id, vpx_roi_map_t *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 #endif
-
   void Config(const vpx_codec_enc_cfg_t *cfg) {
     const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -219,6 +232,9 @@ class EncoderTest {
   virtual void PreEncodeFrameHook(VideoSource * /*video*/,
                                   Encoder * /*encoder*/) {}
 
+  virtual void PreDecodeFrameHook(VideoSource * /*video*/,
+                                  Decoder * /*decoder*/) {}
+
   virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
 
   // Hook to be called on every compressed data packet.
@@ -273,4 +289,4 @@ class EncoderTest {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_ENCODE_TEST_DRIVER_H_
+#endif  // VPX_TEST_ENCODE_TEST_DRIVER_H_
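Editor's note: the new typed Control() overloads let SVC tests pass the vp9 control structs directly, without casts. A hedged usage sketch (control IDs and struct fields are taken from vpx/vp8cx.h as of this import; not part of the patch):

#include <cstring>
#include "test/encode_test_driver.h"
#include "vpx/vp8cx.h"

void ConfigureSvcExample(libvpx_test::Encoder *encoder) {
  vpx_svc_ref_frame_config_t ref_cfg;
  memset(&ref_cfg, 0, sizeof(ref_cfg));
  encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);

  vpx_svc_frame_drop_t drop;
  memset(&drop, 0, sizeof(drop));
  drop.framedrop_mode = FULL_SUPERFRAME_DROP;
  encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &drop);
}

Each overload simply forwards to vpx_codec_control_() and asserts VPX_CODEC_OK, so a failing control call aborts the test at the call site rather than being silently ignored.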
diff --git a/libs/libvpx/test/external_frame_buffer_test.cc b/libs/libvpx/test/external_frame_buffer_test.cc
index dbf2971198..438eeb3ecd 100644
--- a/libs/libvpx/test/external_frame_buffer_test.cc
+++ b/libs/libvpx/test/external_frame_buffer_test.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
 #include "./vpx_config.h"
@@ -113,9 +114,9 @@ class ExternalFrameBufferList {
     return 0;
   }
 
-  // Checks that the ximage data is contained within the external frame buffer
-  // private data passed back in the ximage.
-  void CheckXImageFrameBuffer(const vpx_image_t *img) {
+  // Checks that the vpx_image_t data is contained within the external frame
+  // buffer private data passed back in the vpx_image_t.
+  void CheckImageFrameBuffer(const vpx_image_t *img) {
     if (img->fb_priv != NULL) {
       const struct ExternalFrameBuffer *const ext_fb =
           reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
@@ -335,14 +336,13 @@ class ExternalFrameBufferTest : public ::testing::Test {
     return VPX_CODEC_OK;
   }
 
- protected:
   void CheckDecodedFrames() {
     libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
     const vpx_image_t *img = NULL;
 
     // Get decompressed data
     while ((img = dec_iter.Next()) != NULL) {
-      fb_list_.CheckXImageFrameBuffer(img);
+      fb_list_.CheckImageFrameBuffer(img);
     }
   }
 
@@ -393,7 +393,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
 #endif
 
   // Open compressed video file.
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   if (filename.substr(filename.length() - 3, 3) == "ivf") {
     video.reset(new libvpx_test::IVFVideoSource(filename));
   } else {
diff --git a/libs/libvpx/test/fdct8x8_test.cc b/libs/libvpx/test/fdct8x8_test.cc
index 5021dda9b3..244b9740b0 100644
--- a/libs/libvpx/test/fdct8x8_test.cc
+++ b/libs/libvpx/test/fdct8x8_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -43,9 +44,9 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
+typedef std::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
+typedef std::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
 
 void reference_8x8_dct_1d(const double in[8], double out[8]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
@@ -628,7 +629,7 @@ TEST_P(InvTrans8x8DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -675,6 +676,7 @@ INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
                                                      &vpx_idct8x8_64_add_neon,
                                                      0, VPX_BITS_8)));
+
 #if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8HT,
diff --git a/libs/libvpx/test/frame_size_tests.cc b/libs/libvpx/test/frame_size_tests.cc
index 5a9b166e5b..f66972b4a1 100644
--- a/libs/libvpx/test/frame_size_tests.cc
+++ b/libs/libvpx/test/frame_size_tests.cc
@@ -34,7 +34,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 7);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
diff --git a/libs/libvpx/test/hadamard_test.cc b/libs/libvpx/test/hadamard_test.cc
index 3b7cfeddcf..b194ace674 100644
--- a/libs/libvpx/test/hadamard_test.cc
+++ b/libs/libvpx/test/hadamard_test.cc
@@ -25,13 +25,13 @@ using ::libvpx_test::ACMRandom;
 typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
                              tran_low_t *b);
 
-void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
-  int16_t b[8];
+void hadamard_loop(const tran_low_t *a, tran_low_t *out) {
+  tran_low_t b[8];
   for (int i = 0; i < 8; i += 2) {
-    b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
-    b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
+    b[i + 0] = a[i * 8] + a[(i + 1) * 8];
+    b[i + 1] = a[i * 8] - a[(i + 1) * 8];
   }
-  int16_t c[8];
+  tran_low_t c[8];
   for (int i = 0; i < 8; i += 4) {
     c[i + 0] = b[i + 0] + b[i + 2];
     c[i + 1] = b[i + 1] + b[i + 3];
@@ -49,12 +49,15 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
 }
 
 void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
-  int16_t buf[64];
-  int16_t buf2[64];
-  for (int i = 0; i < 8; ++i) hadamard_loop(a + i, a_stride, buf + i * 8);
-  for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, 8, buf2 + i * 8);
-
-  for (int i = 0; i < 64; ++i) b[i] = (tran_low_t)buf2[i];
+  tran_low_t input[64];
+  tran_low_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+    }
+  }
+  for (int i = 0; i < 8; ++i) hadamard_loop(input + i, buf + i * 8);
+  for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, b + i * 8);
 }
 
 void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
@@ -89,205 +92,229 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
   }
 }
 
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+void reference_hadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+  reference_hadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
+  reference_hadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
+  reference_hadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
+  reference_hadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
+
+  for (int i = 0; i < 256; ++i) {
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[256];
+    const tran_low_t a2 = b[512];
+    const tran_low_t a3 = b[768];
+
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    b[0] = b0 + b2;
+    b[256] = b1 + b3;
+    b[512] = b0 - b2;
+    b[768] = b1 - b3;
+
+    ++b;
+  }
+}
+
+struct HadamardFuncWithSize {
+  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
+  HadamardFunc func;
+  int block_size;
+};
+
+std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+  return os << "block size: " << hfs.block_size;
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
  public:
   virtual void SetUp() {
-    h_func_ = GetParam();
+    h_func_ = GetParam().func;
+    bwh_ = GetParam().block_size;
+    block_size_ = bwh_ * bwh_;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
+  virtual int16_t Rand() = 0;
+
+  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
+                         int bwh) {
+    if (bwh == 32)
+      reference_hadamard32x32(a, a_stride, b);
+    else if (bwh == 16)
+      reference_hadamard16x16(a, a_stride, b);
+    else
+      reference_hadamard8x8(a, a_stride, b);
+  }
+
+  void CompareReferenceRandom() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+
+    ReferenceHadamard(a, bwh_, b_ref, bwh_);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + block_size_);
+    std::sort(b_ref, b_ref + block_size_);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+
+  void VaryStride() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+
+    tran_low_t b_ref[kMaxBlockSize];
+    for (int i = 8; i < 64; i += 8) {
+      memset(b, 0, sizeof(b));
+      memset(b_ref, 0, sizeof(b_ref));
+
+      ReferenceHadamard(a, i, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
+  void SpeedTest(int times) {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    memset(input, 1, sizeof(input));
+    memset(output, 0, sizeof(output));
+
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int i = 0; i < times; ++i) {
+      h_func_(input, bwh_, output);
+    }
+    vpx_usec_timer_mark(&timer);
+
+    const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
+           elapsed_time);
+  }
+
  protected:
+  int bwh_;
+  int block_size_;
   HadamardFunc h_func_;
   ACMRandom rnd_;
 };
 
-void HadamardSpeedTest(const char *name, HadamardFunc const func,
-                       const int16_t *input, int stride, tran_low_t *output,
-                       int times) {
-  int i;
-  vpx_usec_timer timer;
+class HadamardLowbdTest : public HadamardTestBase {
+ protected:
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
 
-  vpx_usec_timer_start(&timer);
-  for (i = 0; i < times; ++i) {
-    func(input, stride, output);
-  }
-  vpx_usec_timer_mark(&timer);
+TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
 
-  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
-  printf("%s[%12d runs]: %d us\n", name, times, elapsed_time);
+TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardLowbdTest, DISABLED_Speed) {
+  SpeedTest(10);
+  SpeedTest(10000);
+  SpeedTest(10000000);
 }
 
-class Hadamard8x8Test : public HadamardTestBase {};
-
-void HadamardSpeedTest8x8(HadamardFunc const func, int times) {
-  DECLARE_ALIGNED(16, int16_t, input[64]);
-  DECLARE_ALIGNED(16, tran_low_t, output[64]);
-  memset(input, 1, sizeof(input));
-  HadamardSpeedTest("Hadamard8x8", func, input, 8, output, times);
-}
-
-TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
-  DECLARE_ALIGNED(16, int16_t, a[64]);
-  DECLARE_ALIGNED(16, tran_low_t, b[64]);
-  tran_low_t b_ref[64];
-  for (int i = 0; i < 64; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-  memset(b, 0, sizeof(b));
-  memset(b_ref, 0, sizeof(b_ref));
-
-  reference_hadamard8x8(a, 8, b_ref);
-  ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
-
-  // The order of the output is not important. Sort before checking.
-  std::sort(b, b + 64);
-  std::sort(b_ref, b_ref + 64);
-  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-}
-
-TEST_P(Hadamard8x8Test, VaryStride) {
-  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
-  DECLARE_ALIGNED(16, tran_low_t, b[64]);
-  tran_low_t b_ref[64];
-  for (int i = 0; i < 64 * 8; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-
-  for (int i = 8; i < 64; i += 8) {
-    memset(b, 0, sizeof(b));
-    memset(b_ref, 0, sizeof(b_ref));
-
-    reference_hadamard8x8(a, i, b_ref);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + 64);
-    std::sort(b_ref, b_ref + 64);
-    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-  }
-}
-
-TEST_P(Hadamard8x8Test, DISABLED_Speed) {
-  HadamardSpeedTest8x8(h_func_, 10);
-  HadamardSpeedTest8x8(h_func_, 10000);
-  HadamardSpeedTest8x8(h_func_, 10000000);
-}
-
-INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_c));
+INSTANTIATE_TEST_CASE_P(
    C, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_c, 32)));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_sse2));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_sse2, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_sse2, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_sse2, 32)));
 #endif  // HAVE_SSE2
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_avx2, 32)));
+#endif  // HAVE_AVX2
+
 #if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_ssse3));
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_ssse3, 8)));
 #endif  // HAVE_SSSE3 && ARCH_X86_64
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_neon));
+INSTANTIATE_TEST_CASE_P(
+    NEON, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16)));
 #endif  // HAVE_NEON
 
 // TODO(jingning): Remove highbitdepth flag when the SIMD functions are
 // in place and turn on the unit test.
 #if !CONFIG_VP9_HIGHBITDEPTH
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_msa));
+INSTANTIATE_TEST_CASE_P(
+    MSA, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_msa, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_msa, 16)));
 #endif  // HAVE_MSA
 #endif  // !CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_VSX
-INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_vsx));
+INSTANTIATE_TEST_CASE_P(
+    VSX, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_vsx, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16)));
 #endif  // HAVE_VSX
 
-class Hadamard16x16Test : public HadamardTestBase {};
+#if CONFIG_VP9_HIGHBITDEPTH
+class HadamardHighbdTest : public HadamardTestBase {
+ protected:
+  virtual int16_t Rand() { return rnd_.Rand13Signed(); }
+};
 
-void HadamardSpeedTest16x16(HadamardFunc const func, int times) {
-  DECLARE_ALIGNED(16, int16_t, input[256]);
-  DECLARE_ALIGNED(16, tran_low_t, output[256]);
-  memset(input, 1, sizeof(input));
-  HadamardSpeedTest("Hadamard16x16", func, input, 16, output, times);
+TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+
+TEST_P(HadamardHighbdTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardHighbdTest, DISABLED_Speed) {
+  SpeedTest(10);
+  SpeedTest(10000);
+  SpeedTest(10000000);
 }
 
-TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
-  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
-  DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
-  tran_low_t b_ref[16 * 16];
-  for (int i = 0; i < 16 * 16; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-  memset(b, 0, sizeof(b));
-  memset(b_ref, 0, sizeof(b_ref));
-
-  reference_hadamard16x16(a, 16, b_ref);
-  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
-
-  // The order of the output is not important. Sort before checking.
-  std::sort(b, b + 16 * 16);
-  std::sort(b_ref, b_ref + 16 * 16);
-  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-}
-
-TEST_P(Hadamard16x16Test, VaryStride) {
-  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
-  DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
-  tran_low_t b_ref[16 * 16];
-  for (int i = 0; i < 16 * 16 * 8; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-
-  for (int i = 8; i < 64; i += 8) {
-    memset(b, 0, sizeof(b));
-    memset(b_ref, 0, sizeof(b_ref));
-
-    reference_hadamard16x16(a, i, b_ref);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + 16 * 16);
-    std::sort(b_ref, b_ref + 16 * 16);
-    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-  }
-}
-
-TEST_P(Hadamard16x16Test, DISABLED_Speed) {
-  HadamardSpeedTest16x16(h_func_, 10);
-  HadamardSpeedTest16x16(h_func_, 10000);
-  HadamardSpeedTest16x16(h_func_, 10000000);
-}
-
-INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_sse2));
-#endif  // HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    C, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_c, 32)));
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_avx2));
+INSTANTIATE_TEST_CASE_P(
+    AVX2, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2,
+                                           32)));
 #endif  // HAVE_AVX2
 
-#if HAVE_VSX
-INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_vsx));
-#endif  // HAVE_VSX
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_neon));
-#endif  // HAVE_NEON
-
-#if !CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_msa));
-#endif  // HAVE_MSA
-#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 }  // namespace
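Editor's note: reference_hadamard32x32() above composes the 32x32 transform from four 16x16 transforms plus one cross-combine stage; the >> 2 pre-scale keeps the combined coefficients inside the range the SIMD kernels assume. The combine step in isolation (an illustrative sketch, not part of the patch):

static void combine4(int a0, int a1, int a2, int a3, int out[4]) {
  // Normalize by 1/4 first, then apply the 2x2 Hadamard butterfly.
  const int b0 = (a0 + a1) >> 2;
  const int b1 = (a0 - a1) >> 2;
  const int b2 = (a2 + a3) >> 2;
  const int b3 = (a2 - a3) >> 2;
  out[0] = b0 + b2;
  out[1] = b1 + b3;
  out[2] = b0 - b2;
  out[3] = b1 - b3;
}

Note also why the tests sort both outputs before memcmp(): the optimized kernels are free to emit coefficients in any order, so only the multiset of values is checked, not the layout.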
diff --git a/libs/libvpx/test/i420_video_source.h b/libs/libvpx/test/i420_video_source.h
index 49573823b4..97473b5c2f 100644
--- a/libs/libvpx/test/i420_video_source.h
+++ b/libs/libvpx/test/i420_video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_I420_VIDEO_SOURCE_H_
-#define TEST_I420_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_I420_VIDEO_SOURCE_H_
+#define VPX_TEST_I420_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -30,4 +30,4 @@ class I420VideoSource : public YUVVideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_I420_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_I420_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/idct_test.cc b/libs/libvpx/test/idct_test.cc
index 3700374d7a..3564c0bd5d 100644
--- a/libs/libvpx/test/idct_test.cc
+++ b/libs/libvpx/test/idct_test.cc
@@ -72,6 +72,7 @@ TEST_P(IDCTTest, TestAllZeros) {
 
 TEST_P(IDCTTest, TestAllOnes) {
   input->Set(0);
+  ASSERT_TRUE(input->TopLeftPixel() != NULL);
   // When the first element is '4' it will fill the output buffer with '1'.
   input->TopLeftPixel()[0] = 4;
   predict->Set(0);
@@ -89,6 +90,7 @@ TEST_P(IDCTTest, TestAddOne) {
   // Set the transform output to '1' and make sure it gets added to the
   // prediction buffer.
   input->Set(0);
+  ASSERT_TRUE(input->TopLeftPixel() != NULL);
   input->TopLeftPixel()[0] = 4;
   output->Set(0);
 
@@ -174,4 +176,4 @@ INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
 INSTANTIATE_TEST_CASE_P(MMI, IDCTTest,
                         ::testing::Values(vp8_short_idct4x4llm_mmi));
 #endif  // HAVE_MMI
-}
+}  // namespace
diff --git a/libs/libvpx/test/invalid_file_test.cc b/libs/libvpx/test/invalid_file_test.cc
index 79220b0f69..8eed05eb49 100644
--- a/libs/libvpx/test/invalid_file_test.cc
+++ b/libs/libvpx/test/invalid_file_test.cc
@@ -10,6 +10,7 @@
 
 #include <cstdio>
 #include <cstring>
+#include <memory>
 #include <string>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -89,7 +90,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
     const std::string filename = input.filename;
 
     // Open compressed video file.
-    testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+    std::unique_ptr<libvpx_test::CompressedVideoSource> video;
     if (filename.substr(filename.length() - 3, 3) == "ivf") {
      video.reset(new libvpx_test::IVFVideoSource(filename));
     } else if (filename.substr(filename.length() - 4, 4) == "webm") {
@@ -123,6 +124,8 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
 #if CONFIG_VP8_DECODER
 const DecodeParam kVP8InvalidFileTests[] = {
   { 1, "invalid-bug-1443.ivf" },
+  { 1, "invalid-token-partition.ivf" },
+  { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" },
 };
 
 VP8_INSTANTIATE_TEST_CASE(InvalidFileTest,
@@ -202,6 +205,8 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
   { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
   { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
   { 2, "invalid-crbug-629481.webm" },
+  { 3, "invalid-crbug-1558.ivf" },
+  { 4, "invalid-crbug-1562.ivf" },
 };
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/libs/libvpx/test/ivf_video_source.h b/libs/libvpx/test/ivf_video_source.h
index 5862d2649f..22c05ecde9 100644
--- a/libs/libvpx/test/ivf_video_source.h
+++ b/libs/libvpx/test/ivf_video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_IVF_VIDEO_SOURCE_H_
-#define TEST_IVF_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_IVF_VIDEO_SOURCE_H_
+#define VPX_TEST_IVF_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -16,7 +16,7 @@
 #include "test/video_source.h"
 
 namespace libvpx_test {
-const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
 const unsigned int kIvfFileHdrSize = 32;
 const unsigned int kIvfFrameHdrSize = 12;
 
@@ -103,4 +103,4 @@ class IVFVideoSource : public CompressedVideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_IVF_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/keyframe_test.cc b/libs/libvpx/test/keyframe_test.cc
index ee75f401ca..582d448168 100644
--- a/libs/libvpx/test/keyframe_test.cc
+++ b/libs/libvpx/test/keyframe_test.cc
@@ -38,7 +38,7 @@ class KeyframeTest
     if (kf_do_force_kf_) {
       frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
     }
-    if (set_cpu_used_ && video->frame() == 1) {
+    if (set_cpu_used_ && video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
     }
   }
@@ -68,7 +68,9 @@ TEST_P(KeyframeTest, TestRandomVideoSource) {
   // In realtime mode - auto placed keyframes are exceedingly rare,  don't
   // bother with this check if(GetParam() > 0)
-  if (GET_PARAM(1) > 0) EXPECT_GT(kf_count_, 1);
+  if (GET_PARAM(1) > 0) {
+    EXPECT_GT(kf_count_, 1);
+  }
 }
 
 TEST_P(KeyframeTest, TestDisableKeyframes) {
@@ -128,8 +130,9 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
   // In realtime mode - auto placed keyframes are exceedingly rare,  don't
   // bother with this check
-  if (GET_PARAM(1) > 0)
+  if (GET_PARAM(1) > 0) {
     EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";
+  }
 
   // Verify that keyframes match the file keyframes in the file.
   for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
diff --git a/libs/libvpx/test/lpf_test.cc b/libs/libvpx/test/lpf_test.cc
index e04b996cd8..dfdd515992 100644
--- a/libs/libvpx/test/lpf_test.cc
+++ b/libs/libvpx/test/lpf_test.cc
@@ -11,6 +11,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -56,8 +57,8 @@ typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0,
                                const uint8_t *thresh1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-typedef std::tr1::tuple<loop_op_t, int, int> loop8_param_t;
-typedef std::tr1::tuple<dual_loop_op_t, int, int> dualloop8_param_t;
+typedef std::tuple<loop_op_t, int, int> loop8_param_t;
+typedef std::tuple<dual_loop_op_t, int, int> dualloop8_param_t;
 
 void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
                const int mask, const int32_t p, const int i) {
@@ -402,7 +403,7 @@ TEST_P(Loop8Test9Param, ValueCheck) {
       << "First failed at test case " << first_failure;
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/test/md5_helper.h b/libs/libvpx/test/md5_helper.h
index ef310a2d90..dc28dc6283 100644
--- a/libs/libvpx/test/md5_helper.h
+++ b/libs/libvpx/test/md5_helper.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_MD5_HELPER_H_
-#define TEST_MD5_HELPER_H_
+#ifndef VPX_TEST_MD5_HELPER_H_
+#define VPX_TEST_MD5_HELPER_H_
 
 #include "./md5_utils.h"
 #include "vpx/vpx_decoder.h"
@@ -72,4 +72,4 @@ class MD5 {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_MD5_HELPER_H_
+#endif  // VPX_TEST_MD5_HELPER_H_
diff --git a/libs/libvpx/test/partial_idct_test.cc b/libs/libvpx/test/partial_idct_test.cc
index f7b50f53a1..e66a695eb0 100644
--- a/libs/libvpx/test/partial_idct_test.cc
+++ b/libs/libvpx/test/partial_idct_test.cc
@@ -11,8 +11,8 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include <limits>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -51,8 +51,8 @@ void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
 }
 #endif
 
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc,
-                        TX_SIZE, int, int, vpx_bit_depth_t>
+typedef std::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc, TX_SIZE,
+                   int, int, vpx_bit_depth_t>
     PartialInvTxfmParam;
 const int kMaxNumCoeffs = 1024;
 const int kCountTestBlock = 1000;
@@ -324,7 +324,7 @@ TEST_P(PartialIDctTest, DISABLED_Speed) {
       << "Error: partial inverse transform produces different results";
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 const PartialInvTxfmParam c_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
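Editor's note: pp_filter_test.cc below is the first user of the new test/bench.h harness added by this import: a fixture derives from AbstractBench, implements Run() as one iteration of the kernel under test, and the harness times repeated calls. A hedged sketch of that shape (the RunNTimes/PrintMedian interface is inferred from the call sites below; not part of the patch):

#include "test/bench.h"

class ExampleBench : public AbstractBench {
 protected:
  virtual void Run() {
    // One iteration of the code under test goes here; the harness calls
    // this repeatedly and records per-batch timings.
  }
};

// Typical use, mirroring the DISABLED_Speed tests below:
//   ExampleBench bench;
//   bench.RunNTimes(100000);     // run and time many iterations
//   bench.PrintMedian("16x16");  // report the median elapsed time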
diff --git a/libs/libvpx/test/pp_filter_test.cc b/libs/libvpx/test/pp_filter_test.cc
index 5a2ade1ef4..1ed261bf9b 100644
--- a/libs/libvpx/test/pp_filter_test.cc
+++ b/libs/libvpx/test/pp_filter_test.cc
@@ -11,6 +11,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -32,7 +33,6 @@ typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows,
                                       int cols, int flimit);
 
 namespace {
-
 // Compute the filter level used in post proc from the loop filter strength
 int q2mbl(int x) {
   if (x < 20) x = 20;
@@ -42,33 +42,52 @@ int q2mbl(int x) {
 }
 
 class VpxPostProcDownAndAcrossMbRowTest
-    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
+  VpxPostProcDownAndAcrossMbRowTest()
+      : mb_post_proc_down_and_across_(GetParam()) {}
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  virtual void Run();
+
+  const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_;
+  // Size of the underlying data block that will be filtered.
+  int block_width_;
+  int block_height_;
+  Buffer<uint8_t> *src_image_;
+  Buffer<uint8_t> *dst_image_;
+  uint8_t *flimits_;
 };
 
+void VpxPostProcDownAndAcrossMbRowTest::Run() {
+  mb_post_proc_down_and_across_(
+      src_image_->TopLeftPixel(), dst_image_->TopLeftPixel(),
+      src_image_->stride(), dst_image_->stride(), block_width_, flimits_, 16);
+}
+
 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
-  const int block_width = 16;
-  const int block_height = 16;
+  block_width_ = 16;
+  block_height_ = 16;
 
   // 5-tap filter needs 2 padding rows above and below the block in the input.
-  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width_, block_height_, 2);
   ASSERT_TRUE(src_image.Init());
 
   // Filter extends output block by 8 samples at left and right edges.
   // Though the left padding is only 8 bytes, the assembly code tries to
   // read 16 bytes before the pointer.
   Buffer<uint8_t> dst_image =
-      Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+      Buffer<uint8_t>(block_width_, block_height_, 8, 16, 8, 8);
   ASSERT_TRUE(dst_image.Init());
 
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
-  (void)memset(flimits, 255, block_width);
+  flimits_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width_));
+  (void)memset(flimits_, 255, block_width_);
 
   // Initialize pixels in the input:
   //   block pixels to value 1,
@@ -79,37 +98,36 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
 
   // Initialize pixels in the output to 99.
   dst_image.Set(99);
 
-  ASM_REGISTER_STATE_CHECK(GetParam()(
+  ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_(
       src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(),
-      dst_image.stride(), block_width, flimits, 16));
+      dst_image.stride(), block_width_, flimits_, 16));
 
-  static const uint8_t kExpectedOutput[block_height] = {
-    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
-  };
+  static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 3, 4 };
 
   uint8_t *pixel_ptr = dst_image.TopLeftPixel();
-  for (int i = 0; i < block_height; ++i) {
-    for (int j = 0; j < block_width; ++j) {
+  for (int i = 0; i < block_height_; ++i) {
+    for (int j = 0; j < block_width_; ++j) {
       ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j])
          << "at (" << i << ", " << j << ")";
     }
     pixel_ptr += dst_image.stride();
   }
 
-  vpx_free(flimits);
+  vpx_free(flimits_);
 };
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // Size of the underlying data block that will be filtered.
   // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V
   // blocks are always a multiple of 8 wide and exactly 8 high.
-  const int block_width = 136;
-  const int block_height = 16;
+  block_width_ = 136;
+  block_height_ = 16;
 
   // 5-tap filter needs 2 padding rows above and below the block in the input.
   // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
   Buffer<uint8_t> src_image =
-      Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2);
+      Buffer<uint8_t>(block_width_, block_height_, 2, 2, 10, 2);
   ASSERT_TRUE(src_image.Init());
 
   // Filter extends output block by 8 samples at left and right edges.
@@ -118,17 +136,17 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // not a problem.
   // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
   Buffer<uint8_t> dst_image =
-      Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8);
+      Buffer<uint8_t>(block_width_, block_height_, 8, 8, 16, 8);
   ASSERT_TRUE(dst_image.Init());
-  Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8);
+  Buffer<uint8_t> dst_image_ref =
+      Buffer<uint8_t>(block_width_, block_height_, 8);
   ASSERT_TRUE(dst_image_ref.Init());
 
   // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
   // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
   // it must be padded out.
-  const int flimits_width = block_width % 16 ? block_width + 8 : block_width;
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
+  const int flimits_width = block_width_ % 16 ? block_width_ + 8 : block_width_;
+  flimits_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
 
   ACMRandom rnd;
   rnd.Reset(ACMRandom::DeterministicSeed());
@@ -138,37 +156,78 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   src_image.SetPadding(10);
   src_image.Set(&rnd, &ACMRandom::Rand8);
 
-  for (int blocks = 0; blocks < block_width; blocks += 8) {
-    (void)memset(flimits, 0, sizeof(*flimits) * flimits_width);
+  for (int blocks = 0; blocks < block_width_; blocks += 8) {
+    (void)memset(flimits_, 0, sizeof(*flimits_) * flimits_width);
 
     for (int f = 0; f < 255; f++) {
-      (void)memset(flimits + blocks, f, sizeof(*flimits) * 8);
-
+      (void)memset(flimits_ + blocks, f, sizeof(*flimits_) * 8);
       dst_image.Set(0);
       dst_image_ref.Set(0);
 
       vpx_post_proc_down_and_across_mb_row_c(
          src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(),
-          src_image.stride(), dst_image_ref.stride(), block_width, flimits,
-          block_height);
-      ASM_REGISTER_STATE_CHECK(
-          GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
-                     src_image.stride(), dst_image.stride(), block_width,
-                     flimits, block_height));
+          src_image.stride(), dst_image_ref.stride(), block_width_, flimits_,
+          block_height_);
+      ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_(
+          src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
+          src_image.stride(), dst_image.stride(), block_width_, flimits_,
+          block_height_));
 
       ASSERT_TRUE(dst_image.CheckValues(dst_image_ref));
     }
   }
 
-  vpx_free(flimits);
+  vpx_free(flimits_);
 }
 
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) {
+  // Size of the underlying data block that will be filtered.
+  block_width_ = 16;
+  block_height_ = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width_, block_height_, 2);
+  ASSERT_TRUE(src_image.Init());
+  this->src_image_ = &src_image;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width_, block_height_, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());
+  this->dst_image_ = &dst_image;
+
+  flimits_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width_));
+  (void)memset(flimits_, 255, block_width_);
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  src_image.SetPadding(10);
+  src_image.Set(1);
+
+  // Initialize pixels in the output to 99.
+  dst_image.Set(99);
+
+  RunNTimes(INT16_MAX);
+  PrintMedian("16x16");
+
+  vpx_free(flimits_);
+};
+
 class VpxMbPostProcAcrossIpTest
-    : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
  public:
+  VpxMbPostProcAcrossIpTest()
+      : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()),
+        src_(Buffer<uint8_t>(rows_, cols_, 8, 8, 17, 8)) {}
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
+  virtual void Run();
+
   void SetCols(unsigned char *s, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
       for (int c = 0; c < cols; c++) {
@@ -195,71 +254,67 @@ class VpxMbPostProcAcrossIpTest
         GetParam()(s, src_width, rows, cols, filter_level));
     RunComparison(expected_output, s, rows, cols, src_width);
   }
+
+  const int rows_;
+  const int cols_;
+  const VpxMbPostProcAcrossIpFunc mb_post_proc_across_ip_;
+  Buffer<uint8_t> src_;
 };
 
+void VpxMbPostProcAcrossIpTest::Run() {
+  mb_post_proc_across_ip_(src_.TopLeftPixel(), src_.stride(), rows_, cols_,
+                          q2mbl(0));
+}
+
 TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
-  ASSERT_TRUE(src.Init());
-  src.SetPadding(10);
-  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
-
-  Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0);
+  Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols_, rows_, 0);
   ASSERT_TRUE(expected_output.Init());
-  SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride());
+  SetCols(expected_output.TopLeftPixel(), rows_, cols_,
+          expected_output.stride());
 
-  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0),
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(0),
                  expected_output.TopLeftPixel());
 }
 
 TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
-  ASSERT_TRUE(src.Init());
-  src.SetPadding(10);
-  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
-
-  static const unsigned char kExpectedOutput[cols] = {
+  static const unsigned char kExpectedOutput[] = {
     2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13
   };
 
-  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(70),
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(70),
                  kExpectedOutput);
 }
 
 TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
-  ASSERT_TRUE(src.Init());
-  src.SetPadding(10);
-  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
-
-  static const unsigned char kExpectedOutput[cols] = {
+  static const unsigned char kExpectedOutput[] = {
    2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13
   };
 
-  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), INT_MAX,
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), INT_MAX,
                  kExpectedOutput);
 
-  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(100),
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(100),
                  kExpectedOutput);
 }
 
 TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) {
-  const int rows = 16;
-  const int cols = 16;
-
-  Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols_, rows_, 8, 8, 17, 8);
   ASSERT_TRUE(c_mem.Init());
-  Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols_, rows_, 8, 8, 17, 8);
   ASSERT_TRUE(asm_mem.Init());
 
   // When level >= 100, the filter behaves the same as the level = INT_MAX
@@ -267,24 +322,41 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) {
   for (int level = 0; level < 100; level++) {
     c_mem.SetPadding(10);
     asm_mem.SetPadding(10);
-    SetCols(c_mem.TopLeftPixel(), rows, cols, c_mem.stride());
-    SetCols(asm_mem.TopLeftPixel(), rows, cols, asm_mem.stride());
+    SetCols(c_mem.TopLeftPixel(), rows_, cols_, c_mem.stride());
+    SetCols(asm_mem.TopLeftPixel(), rows_, cols_, asm_mem.stride());
 
-    vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows,
-                                cols, q2mbl(level));
+    vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows_,
+                                cols_, q2mbl(level));
     ASM_REGISTER_STATE_CHECK(GetParam()(
-        asm_mem.TopLeftPixel(), asm_mem.stride(), rows, cols, q2mbl(level)));
+        asm_mem.TopLeftPixel(), asm_mem.stride(), rows_, cols_, q2mbl(level)));
 
     ASSERT_TRUE(asm_mem.CheckValues(c_mem));
   }
 }
 
+TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) {
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
+
+  RunNTimes(100000);
+  PrintMedian("16x16");
+}
+
 class VpxMbPostProcDownTest
-    : public ::testing::TestWithParam<VpxMbPostProcDownFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxMbPostProcDownFunc> {
  public:
+  VpxMbPostProcDownTest()
+      : rows_(16), cols_(16), mb_post_proc_down_(GetParam()),
+        src_c_(Buffer<uint8_t>(rows_, cols_, 8, 8, 8, 17)) {}
+
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
+  virtual void Run();
+
   void SetRows(unsigned char *src_c, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
       memset(src_c, r, cols);
@@ -306,22 +378,28 @@ class VpxMbPostProcDownTest
   void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width,
                       int filter_level, const unsigned char *expected_output) {
     ASM_REGISTER_STATE_CHECK(
-        GetParam()(s, src_width, rows, cols, filter_level));
+        mb_post_proc_down_(s, src_width, rows, cols, filter_level));
     RunComparison(expected_output, s, rows, cols, src_width);
   }
+
+  const int rows_;
+  const int cols_;
+  const VpxMbPostProcDownFunc mb_post_proc_down_;
+  Buffer<uint8_t> src_c_;
 };
 
+void VpxMbPostProcDownTest::Run() {
+  mb_post_proc_down_(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_,
+                     q2mbl(0));
+}
+
 TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
+  ASSERT_TRUE(src_c_.Init());
+  src_c_.SetPadding(10);
 
-  Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
-  ASSERT_TRUE(src_c.Init());
-  src_c.SetPadding(10);
+  SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
 
-  SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
-
-  static const unsigned char kExpectedOutput[rows * cols] = {
+  static const unsigned char kExpectedOutput[] = {
    2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  2,
    2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  3,  3,  3,  3,  3,  3,
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  3,  4,  4,  3,  3,  3,
@@ -338,26 +416,22 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
    13, 13, 13, 13, 14, 13, 13, 13, 13
   };
 
-  RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), INT_MAX,
+ 
RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), INT_MAX, kExpectedOutput); - src_c.SetPadding(10); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(100), - kExpectedOutput); + src_c_.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(100), kExpectedOutput); } TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - src_c.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -374,67 +448,69 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { 13, 13, 13, 13, 14, 13, 13, 13, 13 }; - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(70), - kExpectedOutput); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(70), kExpectedOutput); } TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - src_c.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - - unsigned char *expected_output = new unsigned char[rows * cols]; + unsigned char *expected_output = new unsigned char[rows_ * cols_]; ASSERT_TRUE(expected_output != NULL); - SetRows(expected_output, rows, cols, cols); + SetRows(expected_output, rows_, cols_, cols_); - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(0), + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), expected_output); delete[] expected_output; } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - Buffer src_asm = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c_.Init()); + Buffer src_asm = Buffer(cols_, rows_, 8, 8, 8, 17); ASSERT_TRUE(src_asm.Init()); for (int level = 0; level < 100; level++) { - src_c.SetPadding(10); + src_c_.SetPadding(10); src_asm.SetPadding(10); - src_c.Set(&rnd, &ACMRandom::Rand8); - src_asm.CopyFrom(src_c); + src_c_.Set(&rnd, &ACMRandom::Rand8); + src_asm.CopyFrom(src_c_); - vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( - src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); - ASSERT_TRUE(src_asm.CheckValues(src_c)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); - src_c.SetPadding(10); + src_c_.SetPadding(10); src_asm.SetPadding(10); - src_c.Set(&rnd, 
&ACMRandom::Rand8Extremes); - src_asm.CopyFrom(src_c); + src_c_.Set(&rnd, &ACMRandom::Rand8Extremes); + src_asm.CopyFrom(src_c_); - vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( - src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); - ASSERT_TRUE(src_asm.CheckValues(src_c)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); } } +TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); +} + INSTANTIATE_TEST_CASE_P( C, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); @@ -481,4 +557,16 @@ INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, ::testing::Values(vpx_mbpost_proc_down_msa)); #endif // HAVE_MSA +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); +#endif // HAVE_VSX + } // namespace diff --git a/libs/libvpx/test/predict_test.cc b/libs/libvpx/test/predict_test.cc index 9f366ae529..d40d9c755e 100644 --- a/libs/libvpx/test/predict_test.cc +++ b/libs/libvpx/test/predict_test.cc @@ -10,30 +10,34 @@ #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vp8_rtcd.h" #include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/msvc.h" namespace { using libvpx_test::ACMRandom; -using std::tr1::make_tuple; +using std::make_tuple; typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch); -typedef std::tr1::tuple PredictParam; +typedef std::tuple PredictParam; -class PredictTestBase : public ::testing::TestWithParam { +class PredictTestBase : public AbstractBench, + public ::testing::TestWithParam { public: PredictTestBase() : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), @@ -204,7 +208,20 @@ class PredictTestBase : public ::testing::TestWithParam { } } } -}; + + void Run() { + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + continue; + } + + predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_, + dst_stride_); + } + } + } +}; // namespace class SixtapPredictTest : public PredictTestBase {}; @@ -341,6 +358,14 @@ TEST_P(BilinearPredictTest, TestWithRandomData) { TEST_P(BilinearPredictTest, TestWithUnalignedDst) { TestWithUnalignedDst(vp8_bilinear_predict16x16_c); } +TEST_P(BilinearPredictTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 5000000 / (width_ * height_); + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", width_, height_); + PrintMedian(title); +} INSTANTIATE_TEST_CASE_P( C, BilinearPredictTest, @@ 
-356,17 +381,13 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, &vp8_bilinear_predict8x4_neon), make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); #endif -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P( - MMX, BilinearPredictTest, - ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), - make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); -#endif #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), - make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2))); + make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2), + make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2), + make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2))); #endif #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( diff --git a/libs/libvpx/test/quantize_test.cc b/libs/libvpx/test/quantize_test.cc index 40bb2642e4..a7497742ce 100644 --- a/libs/libvpx/test/quantize_test.cc +++ b/libs/libvpx/test/quantize_test.cc @@ -9,12 +9,14 @@ */ #include <string.h> +#include <tuple> #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" #include "./vp8_rtcd.h" +#include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -33,10 +35,10 @@ const int kNumBlockEntries = 16; typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d); -typedef std::tr1::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam; +typedef std::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam; using libvpx_test::ACMRandom; -using std::tr1::make_tuple; +using std::make_tuple; // Create and populate a VP8_COMP instance which has a complete set of // quantization inputs as well as a second MACROBLOCKD for output. @@ -116,7 +118,8 @@ class QuantizeTestBase { }; class QuantizeTest : public QuantizeTestBase, - public ::testing::TestWithParam<VP8QuantizeParam> { + public ::testing::TestWithParam<VP8QuantizeParam>, + public AbstractBench { protected: virtual void SetUp() { SetupCompressor(); @@ -124,6 +127,10 @@ class QuantizeTest : public QuantizeTestBase, c_quant_ = GET_PARAM(1); } + virtual void Run() { + asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]); + } + void RunComparison() { for (int i = 0; i < kNumBlocks; ++i) { ASM_REGISTER_STATE_CHECK( @@ -166,6 +173,13 @@ TEST_P(QuantizeTest, TestMultipleQ) { } } +TEST_P(QuantizeTest, DISABLED_Speed) { + FillCoeffRandom(); + + RunNTimes(10000000); + PrintMedian("vp8 quantize"); +} + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, QuantizeTest, diff --git a/libs/libvpx/test/register_state_check.h b/libs/libvpx/test/register_state_check.h index a779e5c06a..238508ac0e 100644 --- a/libs/libvpx/test/register_state_check.h +++ b/libs/libvpx/test/register_state_check.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_REGISTER_STATE_CHECK_H_ -#define TEST_REGISTER_STATE_CHECK_H_ +#ifndef VPX_TEST_REGISTER_STATE_CHECK_H_ +#define VPX_TEST_REGISTER_STATE_CHECK_H_ #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" @@ -28,7 +28,7 @@ // See platform implementations of RegisterStateCheckXXX for details.
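An aside on the benchmarking side of this patch: the DISABLED_Speed tests added to the quantize, predict, postproc, and SAD suites all inherit from AbstractBench, declared in the new test/bench.h, and only ever call RunNTimes() and PrintMedian() while overriding Run(). The sketch below reconstructs that harness from how the tests use it; the exact header contents, the run count, and the ms-vs-us reporting unit are assumptions, not the verbatim file.

```c++
// Minimal sketch of the AbstractBench harness assumed by the DISABLED_Speed
// tests: time Run() n times, repeat the measurement, report the median.
#include <algorithm>
#include <cstdio>

#include "vpx_ports/vpx_timer.h"

class AbstractBench {
 public:
  virtual ~AbstractBench() {}

  // Execute the kernel under test n times per measurement, kRuns times.
  void RunNTimes(int n) {
    for (int r = 0; r < kRuns; ++r) {
      vpx_usec_timer timer;
      vpx_usec_timer_start(&timer);
      for (int i = 0; i < n; ++i) Run();
      vpx_usec_timer_mark(&timer);
      times_[r] = static_cast<int>(vpx_usec_timer_elapsed(&timer));
    }
  }

  void PrintMedian(const char *title) {
    std::sort(times_, times_ + kRuns);
    printf("Median: %8d us for %s\n", times_[kRuns / 2], title);
  }

 protected:
  virtual void Run() = 0;  // supplied by each test fixture

 private:
  static const int kRuns = 15;  // assumed repetition count
  int times_[kRuns];
};
```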
// -#if defined(_WIN64) +#if defined(_WIN64) && ARCH_X86_64 #undef NOMINMAX #define NOMINMAX @@ -138,7 +138,7 @@ class RegisterStateCheck {}; } // namespace libvpx_test -#endif // _WIN64 +#endif // _WIN64 && ARCH_X86_64 #if ARCH_X86 || ARCH_X86_64 #if defined(__GNUC__) @@ -184,4 +184,4 @@ class RegisterStateCheckMMX { #define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK #endif -#endif // TEST_REGISTER_STATE_CHECK_H_ +#endif // VPX_TEST_REGISTER_STATE_CHECK_H_ diff --git a/libs/libvpx/test/resize_test.cc b/libs/libvpx/test/resize_test.cc index e95dc6651a..5f80af6fb1 100644 --- a/libs/libvpx/test/resize_test.cc +++ b/libs/libvpx/test/resize_test.cc @@ -277,12 +277,29 @@ class ResizeTest SetMode(GET_PARAM(1)); } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + virtual void DecompressedFrameHook(const vpx_image_t &img, vpx_codec_pts_t pts) { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } std::vector frame_info_list_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; }; TEST_P(ResizeTest, TestExternalResizeWorks) { @@ -296,6 +313,9 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { const unsigned int frame = static_cast(info->pts); unsigned int expected_w; unsigned int expected_h; + const size_t idx = info - frame_info_list_.begin(); + ASSERT_EQ(info->w, GetFrameWidth(idx)); + ASSERT_EQ(info->h, GetFrameHeight(idx)); ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 0); EXPECT_EQ(expected_w, info->w) @@ -464,8 +484,23 @@ class ResizeRealtimeTest ++mismatch_nframes_; } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + void DefaultConfig() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; @@ -493,6 +528,8 @@ class ResizeRealtimeTest bool change_bitrate_; double mismatch_psnr_; int mismatch_nframes_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; }; TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { @@ -582,6 +619,9 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { int resize_count = 0; for (std::vector::const_iterator info = frame_info_list_.begin(); info != frame_info_list_.end(); ++info) { + const size_t idx = info - frame_info_list_.begin(); + ASSERT_EQ(info->w, GetFrameWidth(idx)); + ASSERT_EQ(info->h, GetFrameHeight(idx)); if (info->w != last_w || info->h != last_h) { resize_count++; if (resize_count == 1) { diff --git a/libs/libvpx/test/sad_test.cc b/libs/libvpx/test/sad_test.cc index 67c3c53150..0902df70ad 100644 --- a/libs/libvpx/test/sad_test.cc +++ b/libs/libvpx/test/sad_test.cc @@ -10,19 +10,21 @@ #include #include -#include #include 
"third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/msvc.h" +#include "vpx_ports/vpx_timer.h" template struct TestParams { @@ -84,7 +86,7 @@ class SADTestBase : public ::testing::TestWithParam { #endif // CONFIG_VP9_HIGHBITDEPTH } mask_ = (1 << bit_depth_) - 1; - source_stride_ = (params_.width + 31) & ~31; + source_stride_ = (params_.width + 63) & ~63; reference_stride_ = params_.width * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); } @@ -108,7 +110,7 @@ class SADTestBase : public ::testing::TestWithParam { protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 - static const int kDataAlignment = 16; + static const int kDataAlignment = 32; static const int kDataBlockSize = 64 * 128; static const int kDataBufferSize = 4 * kDataBlockSize; @@ -264,7 +266,7 @@ class SADx4Test : public SADTestBase { } }; -class SADTest : public SADTestBase { +class SADTest : public AbstractBench, public SADTestBase { public: SADTest() : SADTestBase(GetParam()) {} @@ -284,6 +286,11 @@ class SADTest : public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } + + void Run() { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } }; class SADavgTest : public SADTestBase { @@ -350,6 +357,17 @@ TEST_P(SADTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -463,6 +481,38 @@ TEST_P(SADx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } +TEST_P(SADx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4], exp_sad[4]; + vpx_usec_timer timer; + + memset(reference_sad, 0, sizeof(reference_sad)); + SADs(exp_sad); + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(block); + } + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -971,6 +1021,9 @@ const SadMxNParam vsx_tests[] = { SadMxNParam(16, 32, &vpx_sad16x32_vsx), SadMxNParam(16, 16, &vpx_sad16x16_vsx), SadMxNParam(16, 8, 
&vpx_sad16x8_vsx), + SadMxNParam(8, 16, &vpx_sad8x16_vsx), + SadMxNParam(8, 8, &vpx_sad8x8_vsx), + SadMxNParam(8, 4, &vpx_sad8x4_vsx), }; INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); diff --git a/libs/libvpx/test/stress.sh b/libs/libvpx/test/stress.sh index a899c800ca..fdec764c7a 100755 --- a/libs/libvpx/test/stress.sh +++ b/libs/libvpx/test/stress.sh @@ -30,7 +30,7 @@ SHA1_FILE="$(dirname $0)/test-data.sha1" # Download a file from the url and check its sha1sum. download_and_check_file() { # Get the file from the file path. - local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}" + local root="${1#${LIBVPX_TEST_DATA_PATH}/}" # Download the file using curl. Trap to ensure no partial file. (trap "rm -f $1" INT TERM \ @@ -72,13 +72,13 @@ stress_verify_environment() { # This function runs tests on libvpx that run multiple encodes and decodes # in parallel in hopes of catching synchronization and/or threading issues. stress() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly codec="$1" - local readonly webm="$2" - local readonly decode_count="$3" - local readonly threads="$4" - local readonly enc_args="$5" + local decoder="$(vpx_tool_path vpxdec)" + local encoder="$(vpx_tool_path vpxenc)" + local codec="$1" + local webm="$2" + local decode_count="$3" + local threads="$4" + local enc_args="$5" local pids="" local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5} local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5} @@ -144,6 +144,19 @@ vp8_stress_test() { fi } +vp8_stress_test_token_partitions() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + for threads in 2 4 8; do + for token_partitions in 1 2 3; do + stress vp8 "${VP8}" "${vp8_max_jobs}" ${threads} \ + "--token-parts=$token_partitions" + done + done + fi +} + vp9_stress() { local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25} @@ -154,16 +167,17 @@ vp9_stress() { } vp9_stress_test() { - for threads in 4 8 100; do + for threads in 4 8 64; do vp9_stress "$threads" "--row-mt=0" done } vp9_stress_test_row_mt() { - for threads in 4 8 100; do + for threads in 4 8 64; do vp9_stress "$threads" "--row-mt=1" done } run_tests stress_verify_environment \ - "vp8_stress_test vp9_stress_test vp9_stress_test_row_mt" + "vp8_stress_test vp8_stress_test_token_partitions + vp9_stress_test vp9_stress_test_row_mt" diff --git a/libs/libvpx/test/sum_squares_test.cc b/libs/libvpx/test/sum_squares_test.cc index 9c407c649f..d2c70f4d4b 100644 --- a/libs/libvpx/test/sum_squares_test.cc +++ b/libs/libvpx/test/sum_squares_test.cc @@ -11,6 +11,7 @@ #include <cmath> #include <cstdlib> #include <string> +#include <tuple> #include "third_party/googletest/src/include/gtest/gtest.h" @@ -28,7 +29,7 @@ namespace { const int kNumIterations = 10000; typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size); -typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam; +typedef std::tuple<SSI16Func, SSI16Func> SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> { public: @@ -102,7 +103,14 @@ TEST_P(SumSquaresTest, ExtremeValues) { } } -using std::tr1::make_tuple; +using std::make_tuple; + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_neon))); +#endif // HAVE_NEON #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( @@ -112,8 +120,9 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest,
::testing::Values(make_tuple( - &vpx_sum_squares_2d_i16_c, - &vpx_sum_squares_2d_i16_msa))); +INSTANTIATE_TEST_CASE_P( + MSA, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/superframe_test.cc b/libs/libvpx/test/superframe_test.cc index 421dfccd60..8c8d1ae290 100644 --- a/libs/libvpx/test/superframe_test.cc +++ b/libs/libvpx/test/superframe_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ #include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -18,7 +20,7 @@ namespace { const int kTestMode = 0; -typedef std::tr1::tuple SuperframeTestParam; +typedef std::tuple SuperframeTestParam; class SuperframeTest : public ::libvpx_test::EncoderTest, @@ -31,7 +33,7 @@ class SuperframeTest virtual void SetUp() { InitializeConfig(); const SuperframeTestParam input = GET_PARAM(1); - const libvpx_test::TestMode mode = std::tr1::get(input); + const libvpx_test::TestMode mode = std::get(input); SetMode(mode); sf_count_ = 0; sf_count_max_ = INT_MAX; @@ -41,7 +43,7 @@ class SuperframeTest virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); } } diff --git a/libs/libvpx/test/svc_datarate_test.cc b/libs/libvpx/test/svc_datarate_test.cc new file mode 100644 index 0000000000..d6b247723f --- /dev/null +++ b/libs/libvpx/test/svc_datarate_test.cc @@ -0,0 +1,1428 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. 
+ INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +class DatarateOnePassCbrSvc : public OnePassCbrSvc { + public: + explicit DatarateOnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : OnePassCbrSvc(codec) { + inter_layer_pred_mode_ = 0; + } + + protected: + virtual ~DatarateOnePassCbrSvc() {} + + virtual void ResetModel() { + last_pts_ = 0; + duration_ = 0.0; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + denoiser_on_ = 0; + tune_content_ = 0; + base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + update_pattern_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); + dynamic_drop_layer_ = false; + change_bitrate_ = false; + last_pts_ref_ = 0; + middle_bitrate_ = 0; + top_bitrate_ = 0; + superframe_count_ = -1; + key_frame_spacing_ = 9999; + num_nonref_frames_ = 0; + layer_framedrop_ = 0; + force_key_ = 0; + force_key_test_ = 0; + insert_layer_sync_ = 0; + layer_sync_on_base_ = 0; + force_intra_only_frame_ = 0; + superframe_has_intra_only_ = 0; + use_post_encode_drop_ = 0; + denoiser_off_on_ = false; + denoiser_enable_layers_ = false; + } + virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode, except that we disable inter-layer prediction. + void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + for (int sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (int sl = 0; sl < num_spatial_layers; ++sl) { + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl - 1); + ref_frame_config->alt_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl); + } + if (!tl) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + 
ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } + } + } + } + + void CheckLayerRateTargeting(int num_spatial_layers, int num_temporal_layers, + double thresh_overshoot, + double thresh_undershoot) const { + for (int sl = 0; sl < num_spatial_layers; ++sl) + for (int tl = 0; tl < num_temporal_layers; ++tl) { + const int layer = sl * num_temporal_layers + tl; + ASSERT_GE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too " + "much!"; + } + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + PreEncodeFrameHookSetup(video, encoder); + + if (video->frame() == 0) { + if (force_intra_only_frame_) { + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tests to avoid encoder-decoder + // mismatch check on color space setting. + encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + } + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + + if (layer_framedrop_) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = LAYER_DROP; + for (int i = 0; i < number_spatial_layers_; i++) + svc_drop_frame.framedrop_thresh[i] = 30; + svc_drop_frame.max_consec_drop = 30; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } + + if (use_post_encode_drop_) { + encoder->Control(VP9E_SET_POSTENCODE_DROP, use_post_encode_drop_); + } + } + + if (denoiser_off_on_) { + encoder->Control(VP9E_SET_AQ_MODE, 3); + // Set inter_layer_pred to INTER_LAYER_PRED_OFF_NONKEY (K-SVC). + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, 2); + if (!denoiser_enable_layers_) { + if (video->frame() == 0) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + else if (video->frame() == 100) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } else { + // Cumulative bitrates for top spatial layers, for + // 3 temporal layers. + if (video->frame() == 0) { + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + // Change layer bitrates to set top spatial layer to 0. + // This is for 3 spatial, 3 temporal layers. + // This will trigger skip encoding/dropping of top spatial layer. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + for (int i = 0; i < 3; i++) + bitrate_sl3_[i] = cfg_.layer_target_bitrate[i + 6]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to non-zero on top spatial layer. + // This will trigger skip encoding of top spatial layer + // on key frame (period = 100). + for (int i = 0; i < 3; i++) + cfg_.layer_target_bitrate[i + 6] = bitrate_sl3_[i]; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[8]; + encoder->Config(&cfg_); + } else if (video->frame() == 120) { + // Enable denoiser and top spatial layer after key frame (period is + // 100).
+ encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } + } + } + + if (update_pattern_ && video->frame() >= 100) { + vpx_svc_layer_id_t layer_id; + if (video->frame() == 100) { + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + encoder->Config(&cfg_); + } + // Set layer id since the pattern changed. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } + + if (change_bitrate_ && video->frame() == 200) { + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, + 0.78, 1.15); + + memset(file_datarate_, 0, sizeof(file_datarate_)); + memset(bits_total_, 0, sizeof(bits_total_)); + int64_t bits_in_buffer_model_tmp[VPX_MAX_LAYERS]; + last_pts_ref_ = last_pts_; + // Set new target bitrate. + cfg_.rc_target_bitrate = cfg_.rc_target_bitrate >> 1; + // Buffer level should not reset on dynamic bitrate change. + memcpy(bits_in_buffer_model_tmp, bits_in_buffer_model_, + sizeof(bits_in_buffer_model_)); + AssignLayerBitrates(); + memcpy(bits_in_buffer_model_, bits_in_buffer_model_tmp, + sizeof(bits_in_buffer_model_)); + + // Change config to update encoder with new bitrate configuration. + encoder->Config(&cfg_); + } + + if (dynamic_drop_layer_) { + // TODO(jian): Disable AQ Mode for this test for now. + encoder->Control(VP9E_SET_AQ_MODE, 0); + if (video->frame() == 0) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 50) { + // Change layer bitrates to non-zero on two top spatial layers. + // This will resume encoding of the top two spatial layers. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += + cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 150) { + // Change layer bitrate on second layer to non-zero to start + // encoding it again.
+ cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 200) { + // Change layer bitrate on top layer to non-zero to start + // encoding it again. + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2]; + encoder->Config(&cfg_); + } + } + + if (force_key_test_ && force_key_) frame_flags_ = VPX_EFLAG_FORCE_KF; + + if (insert_layer_sync_) { + vpx_svc_spatial_layer_sync_t svc_layer_sync; + svc_layer_sync.base_layer_intra_only = 0; + for (int i = 0; i < number_spatial_layers_; i++) + svc_layer_sync.spatial_layer_sync[i] = 0; + if (force_intra_only_frame_) { + superframe_has_intra_only_ = 0; + if (video->frame() == 0) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } else if (video->frame() == 100) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } + } else { + layer_sync_on_base_ = 0; + if (video->frame() == 150) { + svc_layer_sync.spatial_layer_sync[1] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 240) { + svc_layer_sync.spatial_layer_sync[2] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 320) { + svc_layer_sync.spatial_layer_sync[0] = 1; + layer_sync_on_base_ = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } + } + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast<double>(tb.num) / tb.den; + duration_ = 0; + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index, therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + uint32_t sizes[8] = { 0 }; + uint32_t sizes_parsed[8] = { 0 }; + int count = 0; + int num_layers_encoded = 0; + last_pts_ = pkt->data.frame.pts; + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (key_frame) { + // For test that inserts layer sync frames: requesting a layer_sync on + // the base layer must force a key frame. So if any key frame occurs after + // the first superframe it must be due to layer sync on the base spatial layer.
+ if (superframe_count_ > 0 && insert_layer_sync_ && + !force_intra_only_frame_) { + ASSERT_EQ(layer_sync_on_base_, 1); + } + temporal_layer_id_ = 0; + superframe_count_ = 0; + } + parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf), + pkt->data.frame.sz, sizes_parsed, &count); + // Count may be less than number of spatial layers because of frame drops. + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + // For a superframe with Intra-only, count will be +1 larger + // because of the no-show frame. + if (force_intra_only_frame_ && superframe_has_intra_only_) + ASSERT_EQ(count, num_layers_encoded + 1); + else + ASSERT_EQ(count, num_layers_encoded); + + // In the constrained frame drop mode, if a given spatial layer is dropped, + // all upper layers must be dropped too. + if (!layer_framedrop_) { + int num_layers_dropped = 0; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (!pkt->data.frame.spatial_layer_encoded[sl]) { + // Check that all upper layers are dropped. + num_layers_dropped++; + for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2) + ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0); + } + } + if (num_layers_dropped == number_spatial_layers_ - 1) + force_key_ = 1; + else + force_key_ = 0; + } + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast<int64_t>(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. + // For short key frame spacing, buffer can underrun on individual frames.
+ if (!key_frame && tl > 0 && key_frame_spacing_ < 100) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } + + ASSERT_EQ(pkt->data.frame.width[sl], + top_sl_width_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); + + ASSERT_EQ(pkt->data.frame.height[sl], + top_sl_height_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); + } + } + + virtual void EndPassHook(void) { + if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + } + + virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetNonRefFrames() { return num_nonref_frames_; } + + vpx_codec_pts_t last_pts_; + double timebase_; + int64_t bits_total_[VPX_MAX_LAYERS]; + double duration_; + double file_datarate_[VPX_MAX_LAYERS]; + size_t bits_in_last_frame_; + double mismatch_psnr_; + int denoiser_on_; + int tune_content_; + int spatial_layer_id_; + bool dynamic_drop_layer_; + unsigned int top_sl_width_; + unsigned int top_sl_height_; + vpx_svc_ref_frame_config_t ref_frame_config; + int update_pattern_; + bool change_bitrate_; + vpx_codec_pts_t last_pts_ref_; + int middle_bitrate_; + int top_bitrate_; + int key_frame_spacing_; + int layer_framedrop_; + int force_key_; + int force_key_test_; + int inter_layer_pred_mode_; + int insert_layer_sync_; + int layer_sync_on_base_; + int force_intra_only_frame_; + int superframe_has_intra_only_; + int use_post_encode_drop_; + int bitrate_sl3_[3]; + // Denoiser switched on the fly. + bool denoiser_off_on_; + // Top layer enabled on the fly. + bool denoiser_enable_layers_; + + private: + virtual void SetConfig(const int num_temporal_layer) { + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + } + + unsigned int num_nonref_frames_; + unsigned int mismatch_nframes_; +}; + +// Params: speed setting. +class DatarateOnePassCbrSvcSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcSingleBR() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 +// temporal layer, with screen content mode on and same speed setting for all +// layers. 
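Before the screen-content test below, it is worth making the CheckLayerRateTargeting() bounds defined earlier concrete. The fixture asserts target >= datarate * thresh_overshoot and target <= datarate * thresh_undershoot, so for the common (0.78, 1.15) pair a layer passes only if its measured datarate lies between target / 1.15 and target / 0.78. A standalone sketch with a hypothetical 500 kbps layer target:

```c++
// Illustration of the CheckLayerRateTargeting() bounds used by these tests.
// The assertions above pass iff
//   target >= datarate * thresh_overshoot  and
//   target <= datarate * thresh_undershoot,
// i.e. the datarate may range from target / thresh_undershoot up to
// target / thresh_overshoot.
#include <cstdio>

int main() {
  const double target_kbps = 500.0;       // hypothetical layer target
  const double thresh_overshoot = 0.78;   // as in most tests below
  const double thresh_undershoot = 1.15;
  printf("datarate must lie in [%.1f, %.1f] kbps\n",
         target_kbps / thresh_undershoot,   // ~434.8 kbps: undershoot floor
         target_kbps / thresh_overshoot);   // ~641.0 kbps: overshoot ceiling
  return 0;
}
```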
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) { + SetSvcConfig(2, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 500; + ResetModel(); + tune_content_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers, with forced key frame after frame drop. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 100; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one +// generated via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables +// inter-layer prediction. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + // Change SVC pattern on the fly. + update_pattern_ = 1; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on (on at frame = 100). Key frame period is set to +// 1000 so denoising is enabled on non-key frames.
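The dynamic-pattern test above drives VP9E_SET_SVC_REF_FRAME_CONFIG with masks built by set_frame_flags_bypass_mode(). The convention, as that helper uses it, is that bit i of update_buffer_slot[sl] tells the encoder to refresh reference buffer slot i after coding spatial layer sl. A minimal sketch (the slot index here is hypothetical):

```c++
// Sketch of the update_buffer_slot bitmask convention used by
// set_frame_flags_bypass_mode() above: bit i set means reference buffer
// slot i is refreshed after this spatial layer is encoded.
#include <stdint.h>
#include <cstdio>

int main() {
  const int lst_fb_idx = 1;  // hypothetical LAST slot for spatial layer 1
  uint32_t update_buffer_slot = 0;
  update_buffer_slot |= 1u << lst_fb_idx;  // refresh only that slot

  for (int slot = 0; slot < 8; ++slot) {
    printf("slot %d: %s\n", slot,
           ((update_buffer_slot >> slot) & 1) ? "refreshed" : "kept");
  }
  return 0;
}
```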
+TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnFixedLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 1000; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on the two top spatial layers since they will + // be skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on, for dynamic layers. Start at 2 spatial layers +// and enable 3rd spatial layer at frame = 100. Use periodic key frame with +// period 100 so enabling of spatial layer occurs at key frame. Enable denoiser +// at frame > 100, after the key frame sync. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnEnableLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 100; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on the two top spatial layers since they will + // be skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on +// the fly switching to 1 and then 2 and back to 3 spatial layers. This switch +// is done by setting spatial layer bitrates to 0, and then back to non-zero, +// during the sequence.
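The two denoiser tests above, and the disable/enable test below, all "switch off" a spatial layer the same way: zero its cumulative per-layer targets and subtract its share from rc_target_bitrate, then restore to re-enable. A sketch of that bookkeeping, assuming the 3-spatial x 3-temporal layout where layer_target_bitrate[6..8] belong to the top spatial layer and index 8 holds its cumulative rate; the helper names and saved[] (playing the role of bitrate_sl3_) are hypothetical:

```c++
// Sketch of the layer-disable bookkeeping used in the tests above
// (3 spatial x 3 temporal layers assumed; helper names are illustrative).
#include "vpx/vpx_encoder.h"

void disable_top_spatial_layer(vpx_codec_enc_cfg_t *cfg, int saved[3]) {
  // Index 8 is the cumulative rate of the whole top spatial layer, so
  // removing it from the total is enough before zeroing indices 6..8.
  cfg->rc_target_bitrate -= cfg->layer_target_bitrate[8];
  for (int i = 0; i < 3; ++i) {
    saved[i] = cfg->layer_target_bitrate[i + 6];
    cfg->layer_target_bitrate[i + 6] = 0;
  }
  // The caller follows up with encoder->Config(cfg), as the tests do.
}

void enable_top_spatial_layer(vpx_codec_enc_cfg_t *cfg, const int saved[3]) {
  for (int i = 0; i < 3; ++i) cfg->layer_target_bitrate[i + 6] = saved[i];
  cfg->rc_target_bitrate += cfg->layer_target_bitrate[8];
}
```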
+// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and
+// on-the-fly switching to 1 and then 2 and back to 3 spatial layers. This
+// switch is done by setting spatial layer bitrates to 0, and then back to
+// non-zero, during the sequence.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) {
+  SetSvcConfig(3, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  dynamic_drop_layer_ = true;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Don't check rate targeting on the two top spatial layers since they will
+  // be skipped for part of the sequence.
+  CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_,
+                          0.78, 1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Run SVC encoder for 1 temporal layer, 2 spatial layers, with a spatial
+// downscale of 5x5.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 3;
+  cfg_.temporal_layering_mode = 0;
+  svc_params_.scaling_factor_num[0] = 256;
+  svc_params_.scaling_factor_den[0] = 1280;
+  svc_params_.scaling_factor_num[1] = 1280;
+  svc_params_.scaling_factor_den[1] = 1280;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 999999;
+  cfg_.kf_min_dist = 0;
+  cfg_.ss_target_bitrate[0] = 300;
+  cfg_.ss_target_bitrate[1] = 1400;
+  cfg_.layer_target_bitrate[0] = 300;
+  cfg_.layer_target_bitrate[1] = 1400;
+  cfg_.rc_target_bitrate = 1700;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ResetModel();
+  layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
+  bits_in_buffer_model_[0] =
+      cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
+  layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
+  bits_in_buffer_model_[1] =
+      cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: speed setting and index for bitrate array.
+class DatarateOnePassCbrSvcMultiBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  virtual ~DatarateOnePassCbrSvcMultiBR() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  const int bitrates[3] = { 200, 400, 600 };
+  // TODO(marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)];
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.75,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: speed setting, layer framedrop control and index for bitrate array.
+class DatarateOnePassCbrSvcFrameDropMultiBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith3Params<int, int, int> {
+ public:
+  DatarateOnePassCbrSvcFrameDropMultiBR()
+      : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.64,
+                          1.45);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
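A note on the parameter plumbing in these fixtures, under the assumption that CodecTestWith2Params / CodecTestWith3Params behave like the other libvpx test wrappers: the codec factory is always tuple element 0, so the Range() arguments of the instantiation shift up by one index. For example (the instantiation appears at the end of this file):

    // VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR,
    //                           ::testing::Range(5, 10),  // GET_PARAM(1): speed 5..9
    //                           ::testing::Range(0, 3));  // GET_PARAM(2): bitrate index 0..2
    // GET_PARAM(0) is the VP9 codec factory consumed by the
    // DatarateOnePassCbrSvc base-class constructor.
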
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: speed setting, inter-layer prediction mode.
+class DatarateOnePassCbrSvcInterLayerPredSingleBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateOnePassCbrSvcInterLayerPredSingleBR()
+      : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    inter_layer_pred_mode_ = GET_PARAM(2);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting with different inter-layer prediction modes for 1
+// pass CBR SVC: 3 spatial layers and 3 temporal layers. Run CIF clip with 1
+// thread.
+TEST_P(DatarateOnePassCbrSvcInterLayerPredSingleBR, OnePassCbrSvc3SL3TL) {
+  // Disable test for inter-layer pred off for now since simulcast_mode fails.
+  if (inter_layer_pred_mode_ == INTER_LAYER_PRED_OFF) return;
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check rate targeting with different inter-layer prediction modes for 1 pass
+// CBR SVC: 3 spatial layers and 3 temporal layers, changing the target bitrate
+// at the middle of encoding.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLDynamicBitrateChange) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  change_bitrate_ = true;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Params: speed setting, noise sensitivity, index for bitrate array and inter
+// layer pred mode.
+class DatarateOnePassCbrSvcDenoiser
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith4Params<int, int, int, int> {
+ public:
+  DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  virtual ~DatarateOnePassCbrSvcDenoiser() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    inter_layer_pred_mode_ = GET_PARAM(3);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC with denoising.
+// 2 spatial layers and 3 temporal layers. Run HD clip with 2 threads.
+TEST_P(DatarateOnePassCbrSvcDenoiser, OnePassCbrSvc2SL3TLDenoiserOn) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 2;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  const int bitrates[3] = { 600, 800, 1000 };
+  // TODO(marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  // For SVC, noise_sen = 1 means denoising only the top spatial layer,
+  // noise_sen = 2 means denoising the two top spatial layers.
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  denoiser_on_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
+// Params: speed setting, key frame dist.
+class DatarateOnePassCbrSvcSmallKF
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  virtual ~DatarateOnePassCbrSvcSmallKF() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and a few short key frame
+// periods.
+TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 800;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  // For this 3 temporal layer case, the pattern repeats every 4 frames, so
+  // choose 4 neighboring key frame periods (so the key frame will land on
+  // 0-2-1-2).
+  const int kf_dist = GET_PARAM(2);
+  cfg_.kf_max_dist = kf_dist;
+  key_frame_spacing_ = kf_dist;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(jianj): webm:1554
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and a few short key frame
+// periods.
+TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  // For this 3 temporal layer case, the pattern repeats every 4 frames, so
+  // choose 4 neighboring key frame periods (so the key frame will land on
+  // 0-2-1-2).
+  const int kf_dist = GET_PARAM(2) + 32;
+  cfg_.kf_max_dist = kf_dist;
+  key_frame_spacing_ = kf_dist;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
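To make the "key frame will land on 0-2-1-2" remark concrete: for the 3-temporal-layer pattern used here (matching the superframe_count_ logic added to svc_test.cc later in this patch), a hypothetical helper, not part of the tests, would be:

    // Temporal layer of a frame in the repeating 4-frame pattern 0-2-1-2.
    int TemporalLayerOf(int frame) {
      if (frame % 2 != 0) return 2;  // odd frames sit on the top layer
      if (frame % 4 == 2) return 1;  // frames 2, 6, 10, ... on the middle layer
      return 0;                      // frames 0, 4, 8, ... on the base layer
    }

With key frame periods of 32, 33, 34 and 35, periodic key frames therefore land on temporal layers 0, 2, 1 and 2 respectively.
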
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run VGA clip with 1 thread, and place layer sync frames:
+// one at the middle layer first, then another one for the top layer, and
+// another insert for the base spatial layer (which forces a key frame).
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLSyncFrames) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  ResetModel();
+  insert_layer_sync_ = 1;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Run SVC encoder for 3 spatial layers, 1 temporal layer, with an
+// intra-only frame as the sync frame on the base spatial layer.
+// The intra_only frame is inserted at the start and in the middle of the
+// sequence.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL1TLSyncWithIntraOnly) {
+  SetSvcConfig(3, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_target_bitrate = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  ResetModel();
+  insert_layer_sync_ = 1;
+  // Use intra_only frame for sync on base layer.
+  force_intra_only_frame_ = 1;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Run SVC encoder for 2 quality layers (same resolution, different
+// bitrates), 1 temporal layer, with screen content mode.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2QL1TLScreen) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 2;
+  svc_params_.scaling_factor_num[0] = 1;
+  svc_params_.scaling_factor_den[0] = 1;
+  svc_params_.scaling_factor_num[1] = 1;
+  svc_params_.scaling_factor_den[1] = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  ResetModel();
+  tune_content_ = 1;
+  // Set the layer bitrates, for 2 spatial layers, 1 temporal.
+  cfg_.rc_target_bitrate = 400;
+  cfg_.ss_target_bitrate[0] = 100;
+  cfg_.ss_target_bitrate[1] = 300;
+  cfg_.layer_target_bitrate[0] = 100;
+  cfg_.layer_target_bitrate[1] = 300;
+  for (int sl = 0; sl < 2; ++sl) {
+    float layer_framerate = 30.0;
+    layer_target_avg_bandwidth_[sl] = static_cast<int>(
+        cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate);
+    bits_in_buffer_model_[sl] =
+        cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+                          1.25);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: speed setting.
+class DatarateOnePassCbrSvcPostencodeDrop
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  virtual ~DatarateOnePassCbrSvcPostencodeDrop() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Run SVC encoder for 2 quality layers (same resolution, different
+// bitrates), 1 temporal layer, with screen content mode.
+TEST_P(DatarateOnePassCbrSvcPostencodeDrop, OnePassCbrSvc2QL1TLScreen) {
+  cfg_.rc_buf_initial_sz = 200;
+  cfg_.rc_buf_optimal_sz = 200;
+  cfg_.rc_buf_sz = 400;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 52;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 2;
+  svc_params_.scaling_factor_num[0] = 1;
+  svc_params_.scaling_factor_den[0] = 1;
+  svc_params_.scaling_factor_num[1] = 1;
+  svc_params_.scaling_factor_den[1] = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  top_sl_width_ = 352;
+  top_sl_height_ = 288;
+  ResetModel();
+  base_speed_setting_ = speed_setting_;
+  tune_content_ = 1;
+  use_post_encode_drop_ = 1;
+  // Set the layer bitrates, for 2 spatial layers, 1 temporal.
+  cfg_.rc_target_bitrate = 400;
+  cfg_.ss_target_bitrate[0] = 100;
+  cfg_.ss_target_bitrate[1] = 300;
+  cfg_.layer_target_bitrate[0] = 100;
+  cfg_.layer_target_bitrate[1] = 300;
+  for (int sl = 0; sl < 2; ++sl) {
+    float layer_framerate = 30.0;
+    layer_target_avg_bandwidth_[sl] = static_cast<int>(
+        cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate);
+    bits_in_buffer_model_[sl] =
+        cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+                          1.25);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
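A quick sanity check of the buffer-model arithmetic in the two loops above: layer_target_bitrate is in kbps and rc_buf_initial_sz in milliseconds, so for the 100 kbps base layer at 30 fps:

    layer_target_avg_bandwidth_[0] = 100 * 1000 / 30 ≈ 3333 bits per frame
    bits_in_buffer_model_[0]       = 100 * 500 = 50000 bits   (first test)
    bits_in_buffer_model_[0]       = 100 * 200 = 20000 bits   (post-encode-drop test)
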
+
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSingleBR,
+                          ::testing::Range(5, 10));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcPostencodeDrop,
+                          ::testing::Range(5, 6));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcInterLayerPredSingleBR,
+                          ::testing::Range(5, 10), ::testing::Range(0, 3));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 10),
+                          ::testing::Range(0, 3));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcFrameDropMultiBR,
+                          ::testing::Range(5, 10), ::testing::Range(0, 2),
+                          ::testing::Range(0, 3));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser,
+                          ::testing::Range(5, 10), ::testing::Range(1, 3),
+                          ::testing::Range(0, 3), ::testing::Range(0, 4));
+#endif
+
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 10),
+                          ::testing::Range(32, 36));
+}  // namespace
+}  // namespace svc_test
diff --git a/libs/libvpx/test/svc_end_to_end_test.cc b/libs/libvpx/test/svc_end_to_end_test.cc
new file mode 100644
index 0000000000..82259ac30c
--- /dev/null
+++ b/libs/libvpx/test/svc_end_to_end_test.cc
@@ -0,0 +1,481 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/svc_test.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace svc_test {
+namespace {
+
+typedef enum {
+  // Inter-layer prediction is on for all frames.
+  INTER_LAYER_PRED_ON,
+  // Inter-layer prediction is off for all frames.
+  INTER_LAYER_PRED_OFF,
+  // Inter-layer prediction is off on non-key frames and non-sync frames.
+  INTER_LAYER_PRED_OFF_NONKEY,
+  // Inter-layer prediction is on for all frames, but constrained such
+  // that any layer S (> 0) can only predict from previous spatial
+  // layer S-1, from the same superframe.
+  INTER_LAYER_PRED_ON_CONSTRAINED
+} INTER_LAYER_PRED;
+
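A minimal sketch of how an application would pick one of these modes, assuming an encoder context already initialized with vpx_codec_enc_init() (the fixtures below issue the same VP9E_SET_SVC_INTER_LAYER_PRED control through the Encoder wrapper):

    vpx_codec_ctx_t codec;  // assumed initialized elsewhere
    // Constrain inter-layer prediction to key/sync frames only (K-SVC).
    if (vpx_codec_control(&codec, VP9E_SET_SVC_INTER_LAYER_PRED,
                          INTER_LAYER_PRED_OFF_NONKEY) != VPX_CODEC_OK) {
      // handle the error
    }
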
+class ScalePartitionOnePassCbrSvc
+    : public OnePassCbrSvc,
+      public ::testing::TestWithParam<const ::libvpx_test::CodecFactory *> {
+ public:
+  ScalePartitionOnePassCbrSvc()
+      : OnePassCbrSvc(GetParam()), mismatch_nframes_(0), num_nonref_frames_(0) {
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+ protected:
+  virtual ~ScalePartitionOnePassCbrSvc() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    speed_setting_ = 7;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    PreEncodeFrameHookSetup(video, encoder);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1])
+      num_nonref_frames_++;
+  }
+
+  virtual void MismatchHook(const vpx_image_t * /*img1*/,
+                            const vpx_image_t * /*img2*/) {
+    ++mismatch_nframes_;
+  }
+
+  virtual void SetConfig(const int /*num_temporal_layer*/) {}
+
+  unsigned int GetMismatchFrames() const { return mismatch_nframes_; }
+  unsigned int GetNonRefFrames() const { return num_nonref_frames_; }
+
+ private:
+  unsigned int mismatch_nframes_;
+  unsigned int num_nonref_frames_;
+};
+
+TEST_P(ScalePartitionOnePassCbrSvc, OnePassCbrSvc3SL3TL1080P) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  ::libvpx_test::I420VideoSource video(
+      "slides_code_term_web_plot.1920_1080.yuv", 1920, 1080, 30, 1, 0, 100);
+  // For this 3 temporal layer case, the pattern repeats every 4 frames, so
+  // choose 4 neighboring key frame periods (so the key frame will land on
+  // 0-2-1-2).
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: Inter layer prediction modes.
+class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
+                               public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  SyncFrameOnePassCbrSvc()
+      : OnePassCbrSvc(GET_PARAM(0)), current_video_frame_(0),
+        frame_to_start_decode_(0), frame_to_sync_(0),
+        inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1),
+        decode_to_layer_after_sync_(-1), denoiser_on_(0),
+        intra_only_test_(false), mismatch_nframes_(0), num_nonref_frames_(0) {
+    SetMode(::libvpx_test::kRealTime);
+    memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_));
+  }
+
+ protected:
+  virtual ~SyncFrameOnePassCbrSvc() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    speed_setting_ = 7;
+  }
+
+  virtual bool DoDecode() const {
+    return current_video_frame_ >= frame_to_start_decode_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    current_video_frame_ = video->frame();
+    PreEncodeFrameHookSetup(video, encoder);
+    if (video->frame() == 0) {
+      // Do not turn off inter-layer pred completely because simulcast mode
+      // fails.
+      if (inter_layer_pred_mode_ != INTER_LAYER_PRED_OFF)
+        encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_);
+      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      if (intra_only_test_)
+        // Decoder sets the color_space for Intra-only frames
+        // to BT_601 (see line 1810 in vp9_decodeframe.c).
+        // So set it here in these tests to avoid an encoder-decoder
+        // mismatch check on the color space setting.
+        encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601);
+    }
+    if (video->frame() == frame_to_sync_) {
+      encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_);
+    }
+  }
+
+#if CONFIG_VP9_DECODER
+  virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Decoder *decoder) {
+    if (video->frame() < frame_to_sync_) {
+      if (decode_to_layer_before_sync_ >= 0)
+        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
+                         decode_to_layer_before_sync_);
+    } else {
+      if (decode_to_layer_after_sync_ >= 0)
+        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
+                         decode_to_layer_after_sync_);
+    }
+  }
+#endif
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1] &&
+        current_video_frame_ >= frame_to_sync_)
+      num_nonref_frames_++;
+
+    if (intra_only_test_ && current_video_frame_ == frame_to_sync_) {
+      // An intra-only frame is only generated for spatial layers > 1 and <= 3,
+      // among other conditions (see the constraint in set_intra_only_frame()).
+      // If intra-only is not allowed, the encoder will insert a key frame
+      // instead.
+      const bool key_frame =
+          (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
+      if (number_spatial_layers_ == 1 || number_spatial_layers_ > 3)
+        ASSERT_TRUE(key_frame);
+      else
+        ASSERT_FALSE(key_frame);
+    }
+  }
+
+  virtual void MismatchHook(const vpx_image_t * /*img1*/,
+                            const vpx_image_t * /*img2*/) {
+    if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() const { return mismatch_nframes_; }
+  unsigned int GetNonRefFrames() const { return num_nonref_frames_; }
+
+  unsigned int current_video_frame_;
+  unsigned int frame_to_start_decode_;
+  unsigned int frame_to_sync_;
+  int inter_layer_pred_mode_;
+  int decode_to_layer_before_sync_;
+  int decode_to_layer_after_sync_;
+  int denoiser_on_;
+  bool intra_only_test_;
+  vpx_svc_spatial_layer_sync_t svc_layer_sync_;
+
+ private:
+  virtual void SetConfig(const int num_temporal_layer) {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.g_threads = 1;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.kf_max_dist = 9999;
+    if (num_temporal_layer == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.temporal_layering_mode = 3;
+    } else if (num_temporal_layer == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.temporal_layering_mode = 2;
+    } else if (num_temporal_layer == 1) {
+      cfg_.ts_rate_decimator[0] = 1;
+      cfg_.temporal_layering_mode = 1;
+    }
+  }
+
+  unsigned int mismatch_nframes_;
+  unsigned int num_nonref_frames_;
+};
+
+// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Only start decoding on the sync layer.
+// Full sync: insert key frame on base layer.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) {
+  SetSvcConfig(3, 3);
+  // Sync is on the base layer, so the frame to sync and the frame to start
+  // decoding are the same.
+ frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = -1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA and VGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. 
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + denoiser_on_ = 1; + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have 4 frames. Thus set the + // layer to decode after sync frame to 3. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer and sync_layer on middle/VGA layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have 4 frames. Thus set the + // layer to decode after sync frame to 3. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from sync frame, insert intra-only on base/qvga layer. Decode +// all layers. For 1 spatial layer, it inserts a key frame. 
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) {
+  SetSvcConfig(1, 3);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 0;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+VP9_INSTANTIATE_TEST_CASE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3));
+
+INSTANTIATE_TEST_CASE_P(
+    VP9, ScalePartitionOnePassCbrSvc,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+
+}  // namespace
+}  // namespace svc_test
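All the sync-frame tests above share one application-side sequence; as a condensed sketch (codec is assumed to be an initialized encoder context; the struct and control are the ones the tests exercise):

    vpx_svc_spatial_layer_sync_t layer_sync;
    memset(&layer_sync, 0, sizeof(layer_sync));
    layer_sync.base_layer_intra_only = 0;  // 1 would request an intra-only base frame
    layer_sync.spatial_layer_sync[1] = 1;  // re-sync the middle spatial layer
    // Issue this on the frame chosen as the sync point (frame_to_sync_ above);
    // decoders can then start or upswitch at that superframe.
    vpx_codec_control(&codec, VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &layer_sync);
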
diff --git a/libs/libvpx/test/svc_test.cc b/libs/libvpx/test/svc_test.cc
index 482d9fffa1..4798c77183 100644
--- a/libs/libvpx/test/svc_test.cc
+++ b/libs/libvpx/test/svc_test.cc
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -8,782 +8,127 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/i420_video_source.h"
+#include "test/svc_test.h"
-#include "vp9/decoder/vp9_decoder.h"
+namespace svc_test {
+void OnePassCbrSvc::SetSvcConfig(const int num_spatial_layer,
+                                 const int num_temporal_layer) {
+  SetConfig(num_temporal_layer);
+  cfg_.ss_number_layers = num_spatial_layer;
+  cfg_.ts_number_layers = num_temporal_layer;
+  if (num_spatial_layer == 1) {
+    svc_params_.scaling_factor_num[0] = 288;
+    svc_params_.scaling_factor_den[0] = 288;
+  } else if (num_spatial_layer == 2) {
+    svc_params_.scaling_factor_num[0] = 144;
+    svc_params_.scaling_factor_den[0] = 288;
+    svc_params_.scaling_factor_num[1] = 288;
+    svc_params_.scaling_factor_den[1] = 288;
+  } else if (num_spatial_layer == 3) {
+    svc_params_.scaling_factor_num[0] = 72;
+    svc_params_.scaling_factor_den[0] = 288;
+    svc_params_.scaling_factor_num[1] = 144;
+    svc_params_.scaling_factor_den[1] = 288;
+    svc_params_.scaling_factor_num[2] = 288;
+    svc_params_.scaling_factor_den[2] = 288;
+  }
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+}
-#include "vpx/svc_context.h"
-#include "vpx/vp8cx.h"
-#include "vpx/vpx_encoder.h"
+void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video,
+                                            ::libvpx_test::Encoder *encoder) {
+  if (video->frame() == 0) {
+    for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
+      svc_params_.max_quantizers[i] = 63;
+      svc_params_.min_quantizers[i] = 0;
+    }
+    svc_params_.speed_per_layer[0] = base_speed_setting_;
+    for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
+      svc_params_.speed_per_layer[i] = speed_setting_;
+    }
-namespace {
-
-using libvpx_test::CodecFactory;
-using libvpx_test::Decoder;
-using libvpx_test::DxDataIterator;
-using libvpx_test::VP9CodecFactory;
-
-class SvcTest : public ::testing::Test {
- protected:
-  static const uint32_t kWidth = 352;
-  static const uint32_t kHeight = 288;
-
-  SvcTest()
-      : codec_iface_(0), test_file_name_("hantro_collage_w352h288.yuv"),
-        codec_initialized_(false), decoder_(0) {
-    memset(&svc_, 0, sizeof(svc_));
-    memset(&codec_, 0, sizeof(codec_));
-    memset(&codec_enc_, 0, sizeof(codec_enc_));
+    encoder->Control(VP9E_SET_SVC, 1);
+    encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+    encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
+    encoder->Control(VP9E_SET_AQ_MODE, 3);
+    encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads));
+    encoder->Control(VP9E_SET_ROW_MT, 1);
+    encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
   }
-  virtual ~SvcTest() {}
-
-  virtual void SetUp() {
-    svc_.log_level = SVC_LOG_DEBUG;
-    svc_.log_print = 0;
-
-    codec_iface_ = vpx_codec_vp9_cx();
-    const vpx_codec_err_t res =
-        vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-
-    codec_enc_.g_w = kWidth;
-    codec_enc_.g_h = kHeight;
-    codec_enc_.g_timebase.num = 1;
-    codec_enc_.g_timebase.den = 60;
-    codec_enc_.kf_min_dist = 100;
-    codec_enc_.kf_max_dist = 100;
-
-    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
-    VP9CodecFactory codec_factory;
-    decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
-
-    tile_columns_ = 0;
-    tile_rows_ = 0;
-  }
-
-  virtual void TearDown() {
-    ReleaseEncoder();
-    delete (decoder_);
-  }
-
-  void InitializeEncoder() {
-    const vpx_codec_err_t res =
-        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
-    vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
-    vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
-    codec_initialized_ = true;
-  }
-
-  void ReleaseEncoder() {
-    vpx_svc_release(&svc_);
-    if (codec_initialized_) vpx_codec_destroy(&codec_);
-    codec_initialized_ = false;
-  }
-
-  void GetStatsData(std::string *const stats_buf) {
-    vpx_codec_iter_t iter = NULL;
-    const vpx_codec_cx_pkt_t *cx_pkt;
-
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
-      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
-        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
-        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
-        stats_buf->append(static_cast<char *>(cx_pkt->data.twopass_stats.buf),
-                          cx_pkt->data.twopass_stats.sz);
-      }
+    superframe_count_++;
+    temporal_layer_id_ = 0;
+    if (number_temporal_layers_ == 2) {
+      temporal_layer_id_ = (superframe_count_ % 2 != 0);
+    } else if (number_temporal_layers_ == 3) {
+      if (superframe_count_ % 2 != 0) temporal_layer_id_ = 2;
+      if (superframe_count_ > 1) {
+        if ((superframe_count_ - 2) % 4 == 0) temporal_layer_id_ = 1;
+      }
+    }
   }
-
-  void Pass1EncodeNFrames(const int n, const int layers,
-                          std::string *const stats_buf) {
-    vpx_codec_err_t res;
+  frame_flags_ = 0;
+}
-
-    ASSERT_GT(n, 0);
-    ASSERT_GT(layers, 0);
-    svc_.spatial_layers = layers;
-    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
-    InitializeEncoder();
-
-    libvpx_test::I420VideoSource video(
-        test_file_name_, codec_enc_.g_w, codec_enc_.g_h,
-        codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30);
-    video.Begin();
-
-    for (int i = 0; i < n; ++i) {
-      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                           video.duration(), VPX_DL_GOOD_QUALITY);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-      GetStatsData(stats_buf);
-      video.Next();
-    }
-
-    // Flush encoder and test EOS packet.
-    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(), video.duration(),
-                         VPX_DL_GOOD_QUALITY);
-    ASSERT_EQ(VPX_CODEC_OK, res);
-    GetStatsData(stats_buf);
-
-    ReleaseEncoder();
-  }
-
-  void StoreFrames(const size_t max_frame_received,
-                   struct vpx_fixed_buf *const outputs,
-                   size_t *const frame_received) {
-    vpx_codec_iter_t iter = NULL;
-    const vpx_codec_cx_pkt_t *cx_pkt;
-
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
-      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
-        const size_t frame_size = cx_pkt->data.frame.sz;
-
-        EXPECT_GT(frame_size, 0U);
-        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
-        ASSERT_LT(*frame_received, max_frame_received);
-
-        if (*frame_received == 0)
-          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
-
-        outputs[*frame_received].buf = malloc(frame_size + 16);
-        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
-        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
-               frame_size);
-        outputs[*frame_received].sz = frame_size;
-        ++(*frame_received);
-      }
+void OnePassCbrSvc::PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+  vpx_svc_layer_id_t layer_id;
+  encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+  temporal_layer_id_ = layer_id.temporal_layer_id;
+  for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+    for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+      const int layer = sl * number_temporal_layers_ + tl;
+      bits_in_buffer_model_[layer] +=
+          static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
+    }
+  }
+}
-
-  void Pass2EncodeNFrames(std::string *const stats_buf, const int n,
-                          const int layers,
-                          struct vpx_fixed_buf *const outputs) {
-    vpx_codec_err_t res;
-    size_t frame_received = 0;
-
-    ASSERT_TRUE(outputs != NULL);
-    ASSERT_GT(n, 0);
-    ASSERT_GT(layers, 0);
-    svc_.spatial_layers = layers;
-    codec_enc_.rc_target_bitrate = 500;
-    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
-      ASSERT_TRUE(stats_buf != NULL);
-      ASSERT_GT(stats_buf->size(), 0U);
-      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
-      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
-    }
-    InitializeEncoder();
-
-    libvpx_test::I420VideoSource video(
-        test_file_name_, codec_enc_.g_w, codec_enc_.g_h,
-        codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30);
-    video.Begin();
-
-    for (int i = 0; i < n; ++i) {
-      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                           video.duration(), VPX_DL_GOOD_QUALITY);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-      StoreFrames(n, outputs, &frame_received);
-      video.Next();
-    }
-
-    // Flush encoder.
-    res = vpx_svc_encode(&svc_, &codec_, NULL, 0, video.duration(),
-                         VPX_DL_GOOD_QUALITY);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-    StoreFrames(n, outputs, &frame_received);
-
-    EXPECT_EQ(frame_received, static_cast<size_t>(n));
-
-    ReleaseEncoder();
-  }
-
-  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
-    int decoded_frames = 0;
-    int received_frames = 0;
-
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(n, 0);
-
-    for (int i = 0; i < n; ++i) {
-      ASSERT_TRUE(inputs[i].buf != NULL);
-      ASSERT_GT(inputs[i].sz, 0U);
-      const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-          static_cast<const uint8_t *>(inputs[i].buf), inputs[i].sz);
-      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-      ++decoded_frames;
-
-      DxDataIterator dec_iter = decoder_->GetDxData();
-      while (dec_iter.Next() != NULL) {
-        ++received_frames;
-      }
-    }
-    EXPECT_EQ(decoded_frames, n);
-    EXPECT_EQ(received_frames, n);
-  }
-
-  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
-                             const int num_super_frames,
-                             const int remained_spatial_layers) {
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(num_super_frames, 0);
-    ASSERT_GT(remained_spatial_layers, 0);
-
-    for (int i = 0; i < num_super_frames; ++i) {
-      uint32_t frame_sizes[8] = { 0 };
-      int frame_count = 0;
-      int frames_found = 0;
-      int frame;
-      ASSERT_TRUE(inputs[i].buf != NULL);
-      ASSERT_GT(inputs[i].sz, 0U);
-
-      vpx_codec_err_t res = vp9_parse_superframe_index(
-          static_cast<const uint8_t *>(inputs[i].buf), inputs[i].sz,
-          frame_sizes, &frame_count, NULL, NULL);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-
-      if (frame_count == 0) {
-        // There's no super frame but only a single frame.
-        ASSERT_EQ(1, remained_spatial_layers);
-      } else {
-        // Found a super frame.
-        uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf);
-        uint8_t *frame_start = frame_data;
-        for (frame = 0; frame < frame_count; ++frame) {
-          // Looking for a visible frame.
-          if (frame_data[0] & 0x02) {
-            ++frames_found;
-            if (frames_found == remained_spatial_layers) break;
-          }
-          frame_data += frame_sizes[frame];
-        }
-        ASSERT_LT(frame, frame_count)
-            << "Couldn't find a visible frame. "
-            << "remained_spatial_layers: " << remained_spatial_layers
-            << "  super_frame: " << i;
-        if (frame == frame_count - 1) continue;
-
-        frame_data += frame_sizes[frame];
-
-        // We need to add one more frame for multiple frame contexts.
-        uint8_t marker =
-            static_cast<uint8_t *>(inputs[i].buf)[inputs[i].sz - 1];
-        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-        const size_t index_sz = 2 + mag * frame_count;
-        const size_t new_index_sz = 2 + mag * (frame + 1);
-        marker &= 0x0f8;
-        marker |= frame;
-
-        // Copy existing frame sizes.
-        memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
-                new_index_sz - 2);
-        // New marker.
-        frame_data[0] = marker;
-        frame_data += (mag * (frame + 1) + 1);
-
-        *frame_data++ = marker;
-        inputs[i].sz = frame_data - frame_start;
-      }
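The index rewrite above leans on the VP9 superframe layout; a sketch of how the marker byte being masked with 0x0f8 is interpreted (illustrative only, mirroring the mag/index_sz computation in DropEnhancementLayers()):

    // The last byte of a superframe is a marker: bits 7-5 are 0b110,
    // bits 4-3 the per-frame size-field width minus one (mag), and
    // bits 2-0 the frame count minus one. The marker byte appears at both
    // ends of the index, with mag-byte little-endian sizes in between.
    const uint8_t marker = data[sz - 1];
    if ((marker & 0xe0) == 0xc0) {
      const int frame_count = (marker & 0x7) + 1;
      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
      const size_t index_sz = 2 + mag * frame_count;  // as computed above
    }
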
+void OnePassCbrSvc::AssignLayerBitrates() {
+  int sl, spatial_layer_target;
+  int spatial_layers = cfg_.ss_number_layers;
+  int temporal_layers = cfg_.ts_number_layers;
+  float total = 0;
+  float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
+  float framerate = 30.0;
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    if (svc_params_.scaling_factor_den[sl] > 0) {
+      alloc_ratio[sl] =
+          static_cast<float>((svc_params_.scaling_factor_num[sl] * 1.0 /
+                              svc_params_.scaling_factor_den[sl]));
+      total += alloc_ratio[sl];
+    }
+  }
-
-  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(n, 0);
-
-    for (int i = 0; i < n; ++i) {
-      free(inputs[i].buf);
-      inputs[i].buf = NULL;
-      inputs[i].sz = 0;
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    cfg_.ss_target_bitrate[sl] = spatial_layer_target =
+        static_cast<unsigned int>(cfg_.rc_target_bitrate * alloc_ratio[sl] /
+                                  total);
+    const int index = sl * temporal_layers;
+    if (cfg_.temporal_layering_mode == 3) {
+      cfg_.layer_target_bitrate[index] = spatial_layer_target >> 1;
+      cfg_.layer_target_bitrate[index + 1] =
+          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+      cfg_.layer_target_bitrate[index + 2] = spatial_layer_target;
+    } else if (cfg_.temporal_layering_mode == 2) {
+      cfg_.layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
+      cfg_.layer_target_bitrate[index + 1] = spatial_layer_target;
+    } else if (cfg_.temporal_layering_mode <= 1) {
+      cfg_.layer_target_bitrate[index] = spatial_layer_target;
+    }
+  }
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    for (int tl = 0; tl < temporal_layers; ++tl) {
+      const int layer = sl * temporal_layers + tl;
+      float layer_framerate = framerate;
+      if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
+      if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
+      if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
+      layer_target_avg_bandwidth_[layer] = static_cast<int>(
+          cfg_.layer_target_bitrate[layer] * 1000.0 / layer_framerate);
+      bits_in_buffer_model_[layer] =
+          cfg_.layer_target_bitrate[layer] * cfg_.rc_buf_initial_sz;
+    }
+  }
+}
-
-  SvcContext svc_;
-  vpx_codec_ctx_t codec_;
-  struct vpx_codec_enc_cfg codec_enc_;
-  vpx_codec_iface_t *codec_iface_;
-  std::string test_file_name_;
-  bool codec_initialized_;
-  Decoder *decoder_;
-  int tile_columns_;
-  int tile_rows_;
-};
-
-TEST_F(SvcTest, SvcInit) {
-  // test missing parameters
-  vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-  res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-  res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 6;  // too many layers
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 0;  // use default layers
-  InitializeEncoder();
-  EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
-}
-
-TEST_F(SvcTest, InitTwoLayers) {
-  svc_.spatial_layers = 2;
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, InvalidOptions) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "not-an-option=1");
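A worked example of AssignLayerBitrates() using the 3-spatial-layer scaling factors set in SetSvcConfig() above (72/288, 144/288, 288/288) with rc_target_bitrate = 800 kbps:

    alloc_ratio       = { 0.25, 0.5, 1.0 },  total = 1.75
    ss_target_bitrate = { 800 * 0.25 / 1.75, 800 * 0.5 / 1.75, 800 * 1.0 / 1.75 }
                      = { 114, 228, 457 } kbps (truncated)
    // With temporal_layering_mode == 3, the base spatial layer's 114 kbps is
    // split cumulatively across its temporal layers as 57, 85 and 114 kbps.
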
"not-an-option=1"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); -} - -TEST_F(SvcTest, SetLayersOption) { - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); - EXPECT_EQ(3, svc_.spatial_layers); -} - -TEST_F(SvcTest, SetMultipleOptions) { - vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); - EXPECT_EQ(2, svc_.spatial_layers); -} - -TEST_F(SvcTest, SetScaleFactorsOption) { - svc_.spatial_layers = 2; - vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); -} - -TEST_F(SvcTest, SetQuantizersOption) { - svc_.spatial_layers = 2; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "min-quantizers=nothing"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "min-quantizers=40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30"); - InitializeEncoder(); -} - -TEST_F(SvcTest, SetAutoAltRefOption) { - svc_.spatial_layers = 5; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0"); - InitializeEncoder(); -} - -// Test that decoder can handle an SVC frame as the first frame in a sequence. 
-TEST_F(SvcTest, OnePassEncodeOneFrame) { - codec_enc_.g_pass = VPX_RC_ONE_PASS; - vpx_fixed_buf output = vpx_fixed_buf(); - Pass2EncodeNFrames(NULL, 1, 2, &output); - DecodeNFrames(&output, 1); - FreeBitstreamBuffers(&output, 1); -} - -TEST_F(SvcTest, OnePassEncodeThreeFrames) { - codec_enc_.g_pass = VPX_RC_ONE_PASS; - codec_enc_.g_lag_in_frames = 0; - vpx_fixed_buf outputs[3]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 3); - FreeBitstreamBuffers(&outputs[0], 3); -} - -TEST_F(SvcTest, TwoPassEncode10Frames) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(20, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 5, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]); - - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 4); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 3); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2SNRLayers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); - Pass1EncodeNFrames(20, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); - Pass1EncodeNFrames(20, 3, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - 
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 2); - DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 1); - DecodeNFrames(&outputs[0], 20); - - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, SetMultipleFrameContextsOption) { - svc_.spatial_layers = 5; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 2; - res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); - InitializeEncoder(); -} - -TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1,1 scale-factors=1/1,1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); - Pass1EncodeNFrames(10, 3, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]); - - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayers) { - // First pass encode - std::string stats_buf; - 
vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - - vpx_fixed_buf base_layer[5]; - for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2]; - - DecodeNFrames(&base_layer[0], 5); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - - vpx_fixed_buf base_layer[5]; - for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2]; - - DecodeNFrames(&base_layer[0], 5); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - codec_enc_.g_w = 704; - codec_enc_.g_h = 144; - tile_columns_ = 1; - tile_rows_ = 1; - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - 
Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - codec_enc_.g_w = 704; - codec_enc_.g_h = 144; - tile_columns_ = 1; - tile_rows_ = 1; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -} // namespace +} // namespace svc_test diff --git a/libs/libvpx/test/svc_test.h b/libs/libvpx/test/svc_test.h new file mode 100644 index 0000000000..f1d727fd9d --- /dev/null +++ b/libs/libvpx/test/svc_test.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_SVC_TEST_H_ +#define VPX_TEST_SVC_TEST_H_ + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +class OnePassCbrSvc : public ::libvpx_test::EncoderTest { + public: + explicit OnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec), base_speed_setting_(0), speed_setting_(0), + superframe_count_(0), temporal_layer_id_(0), number_temporal_layers_(0), + number_spatial_layers_(0) { + memset(&svc_params_, 0, sizeof(svc_params_)); + memset(bits_in_buffer_model_, 0, + sizeof(bits_in_buffer_model_[0]) * VPX_MAX_LAYERS); + memset(layer_target_avg_bandwidth_, 0, + sizeof(layer_target_avg_bandwidth_[0]) * VPX_MAX_LAYERS); + } + + protected: + virtual ~OnePassCbrSvc() {} + + virtual void SetConfig(const int num_temporal_layer) = 0; + + virtual void SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer); + + virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder); + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder); + + virtual void AssignLayerBitrates(); + + virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {} + + vpx_svc_extra_cfg_t svc_params_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; + int base_speed_setting_; + int speed_setting_; + int superframe_count_; + int temporal_layer_id_; + int number_temporal_layers_; + int number_spatial_layers_; +}; +} // namespace svc_test + +#endif // VPX_TEST_SVC_TEST_H_ diff --git a/libs/libvpx/test/temporal_filter_test.cc b/libs/libvpx/test/temporal_filter_test.cc deleted file mode 100644 index 655a36be9a..0000000000 --- a/libs/libvpx/test/temporal_filter_test.cc +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "test/acm_random.h"
-#include "test/buffer.h"
-#include "test/register_state_check.h"
-#include "vpx_ports/vpx_timer.h"
-
-namespace {
-
-using ::libvpx_test::ACMRandom;
-using ::libvpx_test::Buffer;
-
-typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
-                                   const uint8_t *b, unsigned int w,
-                                   unsigned int h, int filter_strength,
-                                   int filter_weight, unsigned int *accumulator,
-                                   uint16_t *count);
-
-// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
-// filter based on strength and weight. Store the resulting filter amount in
-// 'count' and apply it to 'b' and store it in 'accumulator'.
-void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b,
-                      int w, int h, int filter_strength, int filter_weight,
-                      Buffer<unsigned int> *accumulator,
-                      Buffer<uint16_t> *count) {
-  Buffer<int> diff_sq = Buffer<int>(w, h, 0);
-  ASSERT_TRUE(diff_sq.Init());
-  diff_sq.Set(0);
-
-  int rounding = 0;
-  if (filter_strength > 0) {
-    rounding = 1 << (filter_strength - 1);
-  }
-
-  // Calculate all the differences. Avoids re-calculating a bunch of extra
-  // values.
-  for (int height = 0; height < h; ++height) {
-    for (int width = 0; width < w; ++width) {
-      int diff = a.TopLeftPixel()[height * a.stride() + width] -
-                 b.TopLeftPixel()[height * b.stride() + width];
-      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
-    }
-  }
-
-  // For any given point, sum the neighboring values and calculate the
-  // modifier.
-  for (int height = 0; height < h; ++height) {
-    for (int width = 0; width < w; ++width) {
-      // Determine how many values are being summed.
-      int summed_values = 9;
-
-      if (height == 0 || height == (h - 1)) {
-        summed_values -= 3;
-      }
-
-      if (width == 0 || width == (w - 1)) {
-        if (summed_values == 6) {  // corner
-          summed_values -= 2;
-        } else {
-          summed_values -= 3;
-        }
-      }
-
-      // Sum the diff_sq of the surrounding values.
-      int sum = 0;
-      for (int idy = -1; idy <= 1; ++idy) {
-        for (int idx = -1; idx <= 1; ++idx) {
-          const int y = height + idy;
-          const int x = width + idx;
-
-          // If inside the border.
-          if (y >= 0 && y < h && x >= 0 && x < w) {
-            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
-          }
-        }
-      }
-
-      sum *= 3;
-      sum /= summed_values;
-      sum += rounding;
-      sum >>= filter_strength;
-
-      // Clamp the value and invert it.
-      if (sum > 16) sum = 16;
-      sum = 16 - sum;
-
-      sum *= filter_weight;
-
-      count->TopLeftPixel()[height * count->stride() + width] += sum;
-      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
-          sum * b.TopLeftPixel()[height * b.stride() + width];
-    }
-  }
-}
-
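[Editor's note: a condensed sketch of the per-pixel modifier computed by the deleted reference_filter() above; illustrative only, restating the same rounding and clamping steps.]

    // For one pixel: sum is the 3x3 neighborhood sum of squared differences,
    // summed_values is the number of in-bounds neighbors (9, 6, or 4).
    int modifier = sum * 3 / summed_values;
    modifier += rounding;              // 1 << (filter_strength - 1), or 0
    modifier >>= filter_strength;
    if (modifier > 16) modifier = 16;  // clamp ...
    modifier = 16 - modifier;          // ... and invert: small diff, big weight
    modifier *= filter_weight;
    // count += modifier; accumulator += modifier * b[pixel];

[A pixel whose neighborhood closely matches the reference therefore contributes with close to the maximum weight of 16 * filter_weight.]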
-class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
- public:
-  virtual void SetUp() {
-    filter_func_ = GetParam();
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  TemporalFilterFunc filter_func_;
-  ACMRandom rnd_;
-};
-
-TEST_P(TemporalFilterTest, SizeCombinations) {
-  // Depending on subsampling this function may be called with values of 8 or
-  // 16 for width and height, in any combination.
-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
-  ASSERT_TRUE(a.Init());
-
-  const int filter_weight = 2;
-  const int filter_strength = 6;
-
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      // The difference between the buffers must be small to pass the threshold
-      // to apply the filter.
-      a.Set(&rnd_, 0, 7);
-      b.Set(&rnd_, 0, 7);
-
-      accum_ref.Set(rnd_.Rand8());
-      accum_chk.CopyFrom(accum_ref);
-      count_ref.Set(rnd_.Rand8());
-      count_chk.CopyFrom(count_ref);
-      reference_filter(a, b, width, height, filter_strength, filter_weight,
-                       &accum_ref, &count_ref);
-      ASM_REGISTER_STATE_CHECK(
-          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                       height, filter_strength, filter_weight,
-                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
-      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-      EXPECT_TRUE(count_chk.CheckValues(count_ref));
-      if (HasFailure()) {
-        printf("Width: %d Height: %d\n", width, height);
-        count_chk.PrintDifference(count_ref);
-        accum_chk.PrintDifference(accum_ref);
-        return;
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterTest, CompareReferenceRandom) {
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
-      ASSERT_TRUE(a.Init());
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
-        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
-          for (int repeat = 0; repeat < 100; ++repeat) {
-            if (repeat < 50) {
-              a.Set(&rnd_, 0, 7);
-              b.Set(&rnd_, 0, 7);
-            } else {
-              // Check large (but close) values as well.
-              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
-                    std::numeric_limits<uint8_t>::max());
-              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
-                    std::numeric_limits<uint8_t>::max());
-            }
-
-            accum_ref.Set(rnd_.Rand8());
-            accum_chk.CopyFrom(accum_ref);
-            count_ref.Set(rnd_.Rand8());
-            count_chk.CopyFrom(count_ref);
-            reference_filter(a, b, width, height, filter_strength,
-                             filter_weight, &accum_ref, &count_ref);
-            ASM_REGISTER_STATE_CHECK(filter_func_(
-                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
-                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
-                count_chk.TopLeftPixel()));
-            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-            EXPECT_TRUE(count_chk.CheckValues(count_ref));
-            if (HasFailure()) {
-              printf("Weight: %d Strength: %d\n", filter_weight,
-                     filter_strength);
-              count_chk.PrintDifference(count_ref);
-              accum_chk.PrintDifference(accum_ref);
-              return;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterTest, DISABLED_Speed) {
-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
-  ASSERT_TRUE(a.Init());
-
-  const int filter_weight = 2;
-  const int filter_strength = 6;
-
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      a.Set(&rnd_, 0, 7);
-      b.Set(&rnd_, 0, 7);
-
-      accum_chk.Set(0);
-      count_chk.Set(0);
-
-      vpx_usec_timer timer;
-      vpx_usec_timer_start(&timer);
-      for (int i = 0; i < 10000; ++i) {
-        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                     height, filter_strength, filter_weight,
-                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
-      }
-      vpx_usec_timer_mark(&timer);
-      const int elapsed_time =
-          static_cast<int>(vpx_usec_timer_elapsed(&timer));
-      printf("Temporal filter %dx%d time: %5d us\n", width, height,
-             elapsed_time);
-    }
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
-                        ::testing::Values(&vp9_temporal_filter_apply_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
-                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
-#endif  // HAVE_SSE4_1
-}  // namespace
diff --git a/libs/libvpx/test/test-data.mk b/libs/libvpx/test/test-data.mk
index f405e4ef14..27a955760a 100644
--- a/libs/libvpx/test/test-data.mk
+++ b/libs/libvpx/test/test-data.mk
@@ -3,14 +3,16 @@ LIBVPX_TEST_SRCS-yes += test-data.mk
 # Encoder test source
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv

-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
@@ -734,8 +736,12 @@ endif  # CONFIG_VP9_HIGHBITDEPTH

 # Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm @@ -783,8 +789,13 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.web LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # Encode / Decode test diff --git a/libs/libvpx/test/test-data.sha1 b/libs/libvpx/test/test-data.sha1 index 99b4e1e465..88f1e10d73 100644 --- a/libs/libvpx/test/test-data.sha1 +++ b/libs/libvpx/test/test-data.sha1 @@ -17,13 +17,13 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res 9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m -a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m -0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m -ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m +0936b837708ae68c034719f8e07596021c2c214f *park_joy_90p_10_420_20f.y4m +5727a853c083c1099f837d27967bc1322d50ed4f *park_joy_90p_10_422_20f.y4m +e13489470ef8e8b2a871a5640d795a42a39be58d *park_joy_90p_10_444_20f.y4m c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv -614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m -c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m -b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m +79b0dc1784635a7f291e21c4e8d66a29c496ab99 *park_joy_90p_12_420_20f.y4m +9cf22b0f809f7464c8b9058f0cfa9d905921cbd1 *park_joy_90p_12_422_20f.y4m +22b2a4abaecc4a9ade6bb503d25fb82367947e85 *park_joy_90p_12_444_20f.y4m 82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m 4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m @@ -852,5 +852,16 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res 
fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf +90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 +a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res +894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile +f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf +eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res +c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf +f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res +bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv +094be4b80fa30bd227149ea16ab6476d549ea092 *slides_code_term_web_plot.1920_1080.yuv diff --git a/libs/libvpx/test/test.mk b/libs/libvpx/test/test.mk index a3716be60c..8ab4932ce4 100644 --- a/libs/libvpx/test/test.mk +++ b/libs/libvpx/test/test.mk @@ -1,4 +1,6 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += bench.h +LIBVPX_TEST_SRCS-yes += bench.cc LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h @@ -22,7 +24,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp8_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -46,9 +49,15 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += decode_corrupted.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h @@ -67,6 +76,7 @@ LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/common/webmids.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += $(LIBWEBM_PARSER_SRCS) LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../tools_common.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) 
+= ../webmdec.cc
@@ -161,7 +171,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
 ifneq ($(CONFIG_REALTIME_ONLY),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
@@ -169,7 +179,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc

 ifeq ($(CONFIG_VP9_ENCODER),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
 endif
diff --git a/libs/libvpx/test/test_intra_pred_speed.cc b/libs/libvpx/test/test_intra_pred_speed.cc
index 1cdeda410a..0be9feefd9 100644
--- a/libs/libvpx/test/test_intra_pred_speed.cc
+++ b/libs/libvpx/test/test_intra_pred_speed.cc
@@ -313,6 +313,8 @@ INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
 #endif  // HAVE_MSA

 #if HAVE_VSX
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
 INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_4x4_vsx)
@@ -321,6 +323,7 @@ INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL,
                 NULL, NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx,
                 NULL, NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx,
                 vpx_tm_predictor_8x8_vsx)
+#endif

 INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx,
                 vpx_dc_left_predictor_16x16_vsx,
                 vpx_dc_top_predictor_16x16_vsx,
diff --git a/libs/libvpx/test/test_libvpx.cc b/libs/libvpx/test/test_libvpx.cc
index 30641ae8c8..3405e4566b 100644
--- a/libs/libvpx/test/test_libvpx.cc
+++ b/libs/libvpx/test/test_libvpx.cc
@@ -61,7 +61,6 @@ int main(int argc, char **argv) {
 #if !CONFIG_SHARED
   // Shared library builds don't support whitebox tests
   // that exercise internal symbols.
-
 #if CONFIG_VP8
   vp8_rtcd();
 #endif  // CONFIG_VP8
diff --git a/libs/libvpx/test/test_vector_test.cc b/libs/libvpx/test/test_vector_test.cc
index 1879b3d277..5a9737122f 100644
--- a/libs/libvpx/test/test_vector_test.cc
+++ b/libs/libvpx/test/test_vector_test.cc
@@ -10,8 +10,11 @@
 #include <cstdio>
 #include <cstdlib>
+#include <memory>
 #include <set>
 #include <string>
+#include <tuple>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "../tools_common.h"
 #include "./vpx_config.h"
@@ -29,9 +32,10 @@ namespace {

 const int kThreads = 0;
-const int kFileName = 1;
+const int kMtMode = 1;
+const int kFileName = 2;

-typedef std::tr1::tuple<int, const char *> DecodeParam;
+typedef std::tuple<int, int, const char *> DecodeParam;

 class TestVectorTest : public ::libvpx_test::DecoderTest,
                        public ::libvpx_test::CodecTestWithParam<DecodeParam> {
@@ -54,6 +58,25 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }

+#if CONFIG_VP9_DECODER
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (video.frame_number() == 0 && mt_mode_ >= 0) {
+      if (mt_mode_ == 1) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      } else if (mt_mode_ == 2) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 1);
+      } else {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      }
+    }
+  }
+#endif
+
   virtual void DecompressedFrameHook(const vpx_image_t &img,
                                      const unsigned int frame_number) {
     ASSERT_TRUE(md5_file_ != NULL);
@@ -77,6 +100,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 #if CONFIG_VP9_DECODER
   std::set<std::string> resize_clips_;
 #endif
+  int mt_mode_;

  private:
   FILE *md5_file_;
@@ -88,19 +112,20 @@
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
   const DecodeParam input = GET_PARAM(1);
-  const std::string filename = std::tr1::get<kFileName>(input);
+  const std::string filename = std::get<kFileName>(input);
   vpx_codec_flags_t flags = 0;
   vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   char str[256];

-  cfg.threads = std::tr1::get<kThreads>(input);
-
-  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
-           filename.c_str(), cfg.threads);
+  cfg.threads = std::get<kThreads>(input);
+  mt_mode_ = std::get<kMtMode>(input);
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
+           "file: %s threads: %d MT mode: %d", filename.c_str(), cfg.threads,
+           mt_mode_);
   SCOPED_TRACE(str);

   // Open compressed video file.
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   if (filename.substr(filename.length() - 3, 3) == "ivf") {
     video.reset(new libvpx_test::IVFVideoSource(filename));
   } else if (filename.substr(filename.length() - 4, 4) == "webm") {
@@ -131,7 +156,8 @@ TEST_P(TestVectorTest, MD5Match) {
 VP8_INSTANTIATE_TEST_CASE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT is not applicable
         ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                             libvpx_test::kVP8TestVectors +
                                 libvpx_test::kNumVP8TestVectors)));
@@ -144,6 +170,7 @@ INSTANTIATE_TEST_CASE_P(
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
         ::testing::Combine(
             ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::Values(-1),   // LPF opt and Row MT is not applicable
             ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                                 libvpx_test::kVP8TestVectors +
                                     libvpx_test::kNumVP8TestVectors))));
@@ -154,7 +181,8 @@
 VP9_INSTANTIATE_TEST_CASE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT is not applicable
        ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                            libvpx_test::kVP9TestVectors +
                                libvpx_test::kNumVP9TestVectors)));
@@ -166,6 +194,10 @@ INSTANTIATE_TEST_CASE_P(
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
         ::testing::Combine(
             ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::Range(0, 3),  // With multi threads modes 0 ~ 2
+                                     // 0: LPF opt and Row MT disabled
+                                     // 1: LPF opt enabled
+                                     // 2: Row MT enabled
             ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                                 libvpx_test::kVP9TestVectors +
                                     libvpx_test::kNumVP9TestVectors))));
diff --git a/libs/libvpx/test/test_vectors.h b/libs/libvpx/test/test_vectors.h
index 3df3e81133..0a4be0f1a2 100644
--- a/libs/libvpx/test/test_vectors.h
+++ b/libs/libvpx/test/test_vectors.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef TEST_TEST_VECTORS_H_
-#define TEST_TEST_VECTORS_H_
+#ifndef VPX_TEST_TEST_VECTORS_H_
+#define VPX_TEST_TEST_VECTORS_H_

 #include "./vpx_config.h"
@@ -31,4 +31,4 @@ extern const char *const kVP9TestVectorsResize[];

 }  // namespace libvpx_test

-#endif  // TEST_TEST_VECTORS_H_
+#endif  // VPX_TEST_TEST_VECTORS_H_
diff --git a/libs/libvpx/test/tile_independence_test.cc b/libs/libvpx/test/tile_independence_test.cc
index e24981c68d..1d1020a9d3 100644
--- a/libs/libvpx/test/tile_independence_test.cc
+++ b/libs/libvpx/test/tile_independence_test.cc
@@ -48,7 +48,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,

   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                   libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);
     }
   }
diff --git a/libs/libvpx/test/timestamp_test.cc b/libs/libvpx/test/timestamp_test.cc
new file mode 100644
index 0000000000..20240fb77d
--- /dev/null
+++ b/libs/libvpx/test/timestamp_test.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 3;
+
+// A video source that exposes functions to set the timebase, framerate and
+// starting pts.
+class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  // Parameters num and den set the timebase for the video source.
+  DummyTimebaseVideoSource(int num, int den)
+      : timebase_({ num, den }), framerate_numerator_(30),
+        framerate_denominator_(1), starting_pts_(0) {
+    SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    set_limit(kFramesToEncode);
+  }
+
+  void SetFramerate(int numerator, int denominator) {
+    framerate_numerator_ = numerator;
+    framerate_denominator_ = denominator;
+  }
+
+  // Returns one frame's duration in timebase units as a double.
+  double FrameDuration() const {
+    return (static_cast<double>(timebase_.den) / timebase_.num) /
+           (static_cast<double>(framerate_numerator_) /
+            framerate_denominator_);
+  }
+
+  virtual vpx_codec_pts_t pts() const {
+    return static_cast<vpx_codec_pts_t>(frame_ * FrameDuration() +
+                                        starting_pts_ + 0.5);
+  }
+
+  virtual unsigned long duration() const {
+    return static_cast<unsigned long>(FrameDuration() + 0.5);
+  }
+
+  virtual vpx_rational_t timebase() const { return timebase_; }
+
+  void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
+
+ private:
+  vpx_rational_t timebase_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  int64_t starting_pts_;
+};
+
+class TimestampTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  TimestampTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~TimestampTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+};
+
+class TimestampTestVp9Only : public TimestampTest {};
+
+// Tests encoding in millisecond timebase.
+TEST_P(TimestampTest, EncodeFrames) {
+  DummyTimebaseVideoSource video(1, 1000);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// TODO(fgalligan): Enable test when
+// https://bugs.chromium.org/p/webm/issues/detail?id=1614 is fixed.
+TEST_P(TimestampTest, DISABLED_TestMicrosecondTimebase) {
+  // Set the timebase to microseconds.
+  DummyTimebaseVideoSource video(1, 1000000);
+  video.set_limit(1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// TODO(webm:701): Enable VP8 test when the overflow issue in
+// TestVpxRollover is fixed.
+TEST_P(TimestampTestVp9Only, TestVpxRollover) {
+  DummyTimebaseVideoSource video(1, 1000);
+  video.set_starting_pts(922337170351ll);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP8_INSTANTIATE_TEST_CASE(TimestampTest,
+                          ::testing::Values(::libvpx_test::kTwoPassGood));
+VP9_INSTANTIATE_TEST_CASE(TimestampTest,
+                          ::testing::Values(::libvpx_test::kTwoPassGood));
+VP9_INSTANTIATE_TEST_CASE(TimestampTestVp9Only,
+                          ::testing::Values(::libvpx_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libvpx/test/tools_common.sh b/libs/libvpx/test/tools_common.sh
index 0bdcc08d78..844a12534d 100755
--- a/libs/libvpx/test/tools_common.sh
+++ b/libs/libvpx/test/tools_common.sh
@@ -150,7 +150,7 @@ is_windows_target() {
 # empty string. Caller is responsible for testing the string once the function
 # returns.
 vpx_tool_path() {
-  local readonly tool_name="$1"
+  local tool_name="$1"
   local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}"

   if [ ! -x "${tool_path}" ]; then
     # Try one directory up: when running via examples.sh the tool could be in
@@ -404,12 +404,16 @@
 VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
 VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
 VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
+VP9_RAW_FILE="${LIBVPX_TEST_DATA_PATH}/crbug-1539.rawfile"
+
 YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
 YUV_RAW_INPUT_WIDTH=352
 YUV_RAW_INPUT_HEIGHT=288

 Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
 Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+Y4M_720P_INPUT_WIDTH=1280
+Y4M_720P_INPUT_HEIGHT=720

 # Setup a trap function to clean up after tests complete.
 trap cleanup EXIT
diff --git a/libs/libvpx/test/user_priv_test.cc b/libs/libvpx/test/user_priv_test.cc
index 4b5de094e9..7bea76b0a9 100644
--- a/libs/libvpx/test/user_priv_test.cc
+++ b/libs/libvpx/test/user_priv_test.cc
@@ -27,8 +27,8 @@

 namespace {

-using std::string;
 using libvpx_test::ACMRandom;
+using std::string;

 #if CONFIG_WEBM_IO
@@ -73,7 +73,7 @@ string DecodeFile(const string &filename) {
       CheckUserPrivateData(img->user_priv, &frame_num);

       // Also test ctrl_get_reference api.
-      struct vp9_ref_frame ref;
+      struct vp9_ref_frame ref = vp9_ref_frame();

       // Randomly fetch a reference frame.
       ref.idx = rnd.Rand8() % 3;
       decoder.Control(VP9_GET_REFERENCE, &ref);
diff --git a/libs/libvpx/test/util.h b/libs/libvpx/test/util.h
index 1f2540ecf2..985f487094 100644
--- a/libs/libvpx/test/util.h
+++ b/libs/libvpx/test/util.h
@@ -8,16 +8,18 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef TEST_UTIL_H_
-#define TEST_UTIL_H_
+#ifndef VPX_TEST_UTIL_H_
+#define VPX_TEST_UTIL_H_

 #include <stdio.h>
 #include <math.h>
+#include <tuple>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_image.h"

 // Macros
-#define GET_PARAM(k) std::tr1::get<k>(GetParam())
+#define GET_PARAM(k) std::get<k>(GetParam())

 inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
   assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
@@ -43,4 +45,4 @@ inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
   return psnr;
 }

-#endif  // TEST_UTIL_H_
+#endif  // VPX_TEST_UTIL_H_
diff --git a/libs/libvpx/test/variance_test.cc b/libs/libvpx/test/variance_test.cc
index 421024ad88..e9fa03c680 100644
--- a/libs/libvpx/test/variance_test.cc
+++ b/libs/libvpx/test/variance_test.cc
@@ -20,24 +20,13 @@
 #include "test/register_state_check.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/variance.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"

 namespace {

-typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
-                                        const uint8_t *b, int b_stride,
-                                        unsigned int *sse);
-typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
-                                         int xoffset, int yoffset,
-                                         const uint8_t *b, int b_stride,
-                                         unsigned int *sse);
-typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse,
-                                            const uint8_t *second_pred);
 typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride);
 typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
@@ -572,15 +561,16 @@ class SubpelVarianceTest
     if (!use_high_bit_depth()) {
       src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
       sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
-      ref_ = new uint8_t[block_size() + width() + height() + 1];
+      ref_ = reinterpret_cast<uint8_t *>(
+          vpx_malloc(block_size() + width() + height() + 1));
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
           vpx_memalign(16, block_size() * sizeof(uint16_t))));
       sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
          vpx_memalign(16, block_size() * sizeof(uint16_t))));
-      ref_ = CONVERT_TO_BYTEPTR(
-          new uint16_t[block_size() + width() + height() + 1]);
+      ref_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(vpx_malloc(
+          (block_size() + width() + height() + 1) * sizeof(uint16_t))));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
     ASSERT_TRUE(src_ != NULL);
@@ -591,12 +581,12 @@ virtual void TearDown() {
     if (!use_high_bit_depth()) {
       vpx_free(src_);
-      delete[] ref_;
       vpx_free(sec_);
+      vpx_free(ref_);
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       vpx_free(CONVERT_TO_SHORTPTR(src_));
-      delete[] CONVERT_TO_SHORTPTR(ref_);
+      vpx_free(CONVERT_TO_SHORTPTR(ref_));
       vpx_free(CONVERT_TO_SHORTPTR(sec_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
@@ -692,7 +682,7 @@ void SubpelVarianceTest<FunctionType>::ExtremeRefTest() {
 }

 template <>
-void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
       if (!use_high_bit_depth()) {
@@ -728,10 +718,10 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
 }

 typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxMseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxVarianceTest;
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
-typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxMseTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest;
+typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest;

 TEST_P(VpxSseTest, RefSse) { RefTestSse(); }
 TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); }
@@ -756,14 +746,14 @@
 INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
                         ::testing::Values(SseParams(2, 2,
                                                     &vpx_get4x4sse_cs_c)));

-typedef TestParams<VarianceMxNFunc> MseParams;
+typedef TestParams<vpx_variance_fn_t> MseParams;
 INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c),
                                           MseParams(4, 3, &vpx_mse16x8_c),
                                           MseParams(3, 4, &vpx_mse8x16_c),
                                           MseParams(3, 3, &vpx_mse8x8_c)));

-typedef TestParams<VarianceMxNFunc> VarianceParams;
+typedef TestParams<vpx_variance_fn_t> VarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c),
                       VarianceParams(6, 5, &vpx_variance64x32_c),
                       VarianceParams(5, 6, &vpx_variance32x64_c),
                       VarianceParams(5, 5, &vpx_variance32x32_c),
                       VarianceParams(5, 4, &vpx_variance32x16_c),
                       VarianceParams(4, 5, &vpx_variance16x32_c),
                       VarianceParams(4, 4, &vpx_variance16x16_c),
                       VarianceParams(4, 3, &vpx_variance16x8_c),
                       VarianceParams(3, 4, &vpx_variance8x16_c),
                       VarianceParams(3, 3, &vpx_variance8x8_c),
                       VarianceParams(3, 2, &vpx_variance8x4_c),
                       VarianceParams(2, 3, &vpx_variance4x8_c),
                       VarianceParams(2, 2, &vpx_variance4x4_c)));

-typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
+typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
         SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));

-typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
+typedef TestParams<vpx_subp_avg_variance_fn_t> SubpelAvgVarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));

 #if CONFIG_VP9_HIGHBITDEPTH
-typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxHBDVarianceTest;
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
-typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxHBDSubpelAvgVarianceTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
+    VpxHBDSubpelAvgVarianceTest;

 TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
 TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
@@ -1384,15 +1375,19 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
-                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2)));
+                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2),
+                                          MseParams(4, 3, &vpx_mse16x8_avx2)));

 INSTANTIATE_TEST_CASE_P(
     AVX2, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_avx2),
                       VarianceParams(6, 5, &vpx_variance64x32_avx2),
+                      VarianceParams(5, 6, &vpx_variance32x64_avx2),
                       VarianceParams(5, 5, &vpx_variance32x32_avx2),
                       VarianceParams(5, 4, &vpx_variance32x16_avx2),
-                      VarianceParams(4, 4, &vpx_variance16x16_avx2)));
+                      VarianceParams(4, 5, &vpx_variance16x32_avx2),
+
VarianceParams(4, 4, &vpx_variance16x16_avx2), + VarianceParams(4, 3, &vpx_variance16x8_avx2))); INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelVarianceTest, @@ -1539,6 +1534,27 @@ INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_CASE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_CASE_P( + VSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), + VarianceParams(6, 5, &vpx_variance64x32_vsx), + VarianceParams(5, 6, &vpx_variance32x64_vsx), + VarianceParams(5, 5, &vpx_variance32x32_vsx), + VarianceParams(5, 4, &vpx_variance32x16_vsx), + VarianceParams(4, 5, &vpx_variance16x32_vsx), + VarianceParams(4, 4, &vpx_variance16x16_vsx), + VarianceParams(4, 3, &vpx_variance16x8_vsx), + VarianceParams(3, 4, &vpx_variance8x16_vsx), + VarianceParams(3, 3, &vpx_variance8x8_vsx), + VarianceParams(3, 2, &vpx_variance8x4_vsx), + VarianceParams(2, 3, &vpx_variance4x8_vsx), + VarianceParams(2, 2, &vpx_variance4x4_vsx))); #endif // HAVE_VSX #if HAVE_MMI diff --git a/libs/libvpx/test/video_source.h b/libs/libvpx/test/video_source.h index 54f692865b..e9340f21e9 100644 --- a/libs/libvpx/test/video_source.h +++ b/libs/libvpx/test/video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_VIDEO_SOURCE_H_ -#define TEST_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_VIDEO_SOURCE_H_ +#define VPX_TEST_VIDEO_SOURCE_H_ #if defined(_WIN32) #undef NOMINMAX @@ -255,4 +255,4 @@ class CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_VIDEO_SOURCE_H_ +#endif // VPX_TEST_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/test/vp8_datarate_test.cc b/libs/libvpx/test/vp8_datarate_test.cc new file mode 100644 index 0000000000..95a1157f6c --- /dev/null +++ b/libs/libvpx/test/vp8_datarate_test.cc @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+
+namespace {
+
+class DatarateTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~DatarateTestLarge() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+    gf_boost_ = 0;
+    use_roi_ = false;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_);
+    }
+
+    if (use_roi_) {
+      encoder->Control(VP8E_SET_ROI_MAP, &roi_);
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    // TODO(jimbankoski): Remove these lines when the issue:
+    // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
+    // For now the codec assumes buffer starts at starting buffer rate
+    // plus one frame's time.
+    if (last_pts_ == 0) duration = 1;
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    /* Test the buffer model here before subtracting the frame. Do so because
+     * the way the leaky bucket model works in libvpx is to allow the buffer to
+     * empty - and then stop showing frames until we've got enough bits to
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    const bool key_frame =
+        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0)
+          << "Buffer Underrun at frame " << pkt->data.frame.pts;
+    }
+
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Subtract from the buffer the bits associated with a played back frame.
+    bits_in_buffer_model_ -= frame_size_in_bits;
+
+    // Update the running total of bits for end of test datarate checks.
+    bits_total_ += frame_size_in_bits;
+
+    // If first drop not set and we have a drop set it to this time.
+    if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
+
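[Editor's note: a worked example of the constant-bitrate buffer model credited in FramePktHook() above; the numbers are illustrative and not part of the patch.]

    // With a 30 fps source (timebase 1/30, so each frame's pts delta is 1)
    // and rc_target_bitrate = 300 kbps, each frame interval credits:
    //   duration * timebase_ * rc_target_bitrate * 1000
    //   = 1 * (1.0 / 30) * 300 * 1000 = 10000 bits
    // A 9000-bit encoded frame leaves the model 1000 bits fuller; an
    // 11000-bit frame drains it by 1000 bits, and the ASSERT_GE above fails
    // only if the model goes negative on a non-key frame.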
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+
+    // We update this so that we can calculate the datarate minus the last
+    // frame encoded in the file.
+    bits_in_last_frame_ = frame_size_in_bits;
+
+    ++frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    if (bits_total_) {
+      const double file_size_in_kb = bits_total_ / 1000.;  // in kilobits
+
+      duration_ = (last_pts_ + 1) * timebase_;
+
+      // Effective file datarate includes the time spent prebuffering.
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 /
+                            (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+
+  virtual void DenoiserLevelsTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    for (int j = 1; j < 5; ++j) {
+      // Run over the denoiser levels.
+      // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+      // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+      // denoiserOnAggressive, and denoiserOnAdaptive.
+      denoiser_on_ = j;
+      cfg_.rc_target_bitrate = 300;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+          << " The datarate for the file exceeds the target!";
+
+      ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+          << " The datarate for the file missed the target!";
+    }
+  }
+
+  virtual void DenoiserOffOnTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 299);
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    // The denoiser is off by default.
+    denoiser_on_ = 0;
+    // Set the offon test flag.
+    denoiser_offon_test_ = 1;
+    denoiser_offon_period_ = 100;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+
+  virtual void BasicBufferModelTest() {
+    denoiser_on_ = 0;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    // 2 pass cbr datarate control has a bug hidden by the small # of
+    // frames selected in this encode. The problem is that even if the buffer
+    // is negative we produce a keyframe on a cutscene, ignoring datarate
+    // constraints.
+    // TODO(jimbankoski): Fix when issue
+    // http://code.google.com/p/webm/issues/detail?id=495 is addressed.
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+
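[Editor's note: a worked example of the EndPassHook() datarate math above; the values are invented for illustration only.]

    // 140 frames at 30 fps: duration_ = (last_pts_ + 1) * timebase_ ~= 4.67 s.
    // With bits_total_ = 1,500,000 bits and a 12,000-bit final frame:
    //   file_datarate_      = (1500000 / 1000) / 4.67           ~= 321 kbps
    //   effective_datarate_ = (1500000 - 12000) / 1000.0 /
    //                         (500 / 1000.0 + 4.67)             ~= 288 kbps
    // rc_buf_initial_sz (500 ms) counts as prebuffering time in the effective
    // rate, which is why it comes out below the raw file rate.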
+    for (int i = 100; i < 800; i += 200) {
+      cfg_.rc_target_bitrate = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+          << " The datarate for the file exceeds the target!";
+      ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+          << " The datarate for the file missed the target!";
+    }
+  }
+
+  virtual void ChangingDropFrameThreshTest() {
+    denoiser_on_ = 0;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_max_quantizer = 36;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.kf_mode = VPX_KF_DISABLED;
+
+    const int frame_count = 40;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, frame_count);
+
+    // Here we check that the first dropped frame gets earlier and earlier
+    // as the drop frame threshold is increased.
+
+    const int kDropFrameThreshTestStep = 30;
+    vpx_codec_pts_t last_drop = frame_count;
+    for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_LE(first_drop_, last_drop)
+          << " The first dropped frame for drop_thresh " << i
+          << " > first dropped frame for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+    }
+  }
+
+  virtual void DropFramesMultiThreadsTest() {
+    denoiser_on_ = 0;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_threads = 2;
+
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    cfg_.rc_target_bitrate = 200;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+
+  vpx_codec_pts_t last_pts_;
+  int64_t bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  int64_t bits_in_last_frame_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  int set_cpu_used_;
+  int gf_boost_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) { DenoiserLevelsTest(); }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) { DenoiserOffOnTest(); }
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestLarge, BasicBufferModel) { BasicBufferModelTest(); }
+
+TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
+  DropFramesMultiThreadsTest();
+}
+
+class DatarateTestRealTime : public DatarateTestLarge {
+ public:
+  virtual ~DatarateTestRealTime() {}
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestRealTime, DenoiserLevels) { DenoiserLevelsTest(); }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestRealTime, DenoiserOffOn) { DenoiserOffOnTest(); }
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestRealTime, BasicBufferModel) { BasicBufferModelTest(); }
+
+TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
+  DropFramesMultiThreadsTest();
+}
+
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 15) / 16;
+  roi_.cols = (cfg_.g_w + 15) / 16;
+
+  roi_.delta_q[0] = 0;
+  roi_.delta_q[1] = -20;
+  roi_.delta_q[2] = 0;
+  roi_.delta_q[3] = 0;
+
+  roi_.delta_lf[0] = 0;
+  roi_.delta_lf[1] = -20;
+  roi_.delta_lf[2] = 0;
+  roi_.delta_lf[3] = 0;
+
+  roi_.static_threshold[0] = 0;
+  roi_.static_threshold[1] = 1000;
+  roi_.static_threshold[2] = 0;
+  roi_.static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map =
+      (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
+TEST_P(DatarateTestRealTime, GFBoost) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Apply a gf boost.
+  gf_boost_ = 50;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES,
+                          ::testing::Values(0));
+VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Values(-6, -12));
+}  // namespace
diff --git a/libs/libvpx/test/vp8_multi_resolution_encoder.sh b/libs/libvpx/test/vp8_multi_resolution_encoder.sh
index a8b7fe78ee..bd45b5381f 100755
--- a/libs/libvpx/test/vp8_multi_resolution_encoder.sh
+++ b/libs/libvpx/test/vp8_multi_resolution_encoder.sh
@@ -22,7 +22,7 @@ vp8_multi_resolution_encoder_verify_environment() {
     elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
-  local readonly app="vp8_multi_resolution_encoder"
+  local app="vp8_multi_resolution_encoder"
   if [ -z "$(vpx_tool_path "${app}")" ]; then
     elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
     return 1
@@ -33,7 +33,7 @@ vp8_multi_resolution_encoder_verify_environment() {
 # Runs vp8_multi_resolution_encoder. Simply forwards all arguments to
 # vp8_multi_resolution_encoder after building path to the executable.
 vp8_mre() {
-  local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+  local encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
   if [ ! -x "${encoder}" ]; then
     elog "${encoder} does not exist or is not executable."
     return 1
@@ -43,22 +43,34 @@ vp8_mre() {
 }
 
 vp8_multi_resolution_encoder_three_formats() {
-  local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
-                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
-                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+  local output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+                      ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+                      ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+  local layer_bitrates="150 80 50"
+  local keyframe_insert="200"
+  local temporal_layers="3 3 3"
+  local framerate="30"
   if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
     if [ "$(vp8_encode_available)" = "yes" ]; then
       # Param order:
       #  Input width
       #  Input height
+      #  Framerate
       #  Input file path
       #  Output file names
+      #  Layer bitrates
+      #  Temporal layers
+      #  Keyframe insert
       #  Output PSNR
       vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
         "${YUV_RAW_INPUT_HEIGHT}" \
+        "${framerate}" \
         "${YUV_RAW_INPUT}" \
         ${output_files} \
+        ${layer_bitrates} \
+        ${temporal_layers} \
+        "${keyframe_insert}" \
         0
 
       for output_file in ${output_files}; do
diff --git a/libs/libvpx/test/vp9_arf_freq_test.cc b/libs/libvpx/test/vp9_arf_freq_test.cc
index 48a4ca7392..9a3455b4aa 100644
--- a/libs/libvpx/test/vp9_arf_freq_test.cc
+++ b/libs/libvpx/test/vp9_arf_freq_test.cc
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
@@ -190,7 +192,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) {
   init_flags_ = VPX_CODEC_USE_PSNR;
   if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  std::unique_ptr<libvpx_test::VideoSource> video;
   if (is_extension_y4m(test_video_param_.filename)) {
     video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 kFrames));
diff --git a/libs/libvpx/test/vp9_block_error_test.cc b/libs/libvpx/test/vp9_block_error_test.cc
index 0b4d1df992..71a0686d7a 100644
--- a/libs/libvpx/test/vp9_block_error_test.cc
+++ b/libs/libvpx/test/vp9_block_error_test.cc
@@ -11,6 +11,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -35,7 +36,7 @@ typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff,
                                      intptr_t block_size, int64_t *ssz,
                                      int bps);
 
-typedef std::tr1::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
+typedef std::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
     BlockErrorParam;
 
 typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
@@ -168,7 +169,7 @@ TEST_P(BlockErrorTest, ExtremeValues) {
       << "First failed at test case " << first_failure;
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
 const BlockErrorParam sse2_block_error_tests[] = {
diff --git a/libs/libvpx/test/vp9_datarate_test.cc b/libs/libvpx/test/vp9_datarate_test.cc
new file mode 100644
index 0000000000..b8be275eaf
--- /dev/null
+++ b/libs/libvpx/test/vp9_datarate_test.cc
@@ -0,0 +1,901 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
+ public:
+  explicit DatarateTestVP9(const ::libvpx_test::CodecFactory *codec)
+      : EncoderTest(codec) {
+    tune_content_ = 0;
+  }
+
+ protected:
+  virtual ~DatarateTestVP9() {}
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    aq_mode_ = 3;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    // For testing up to 3 layers.
+    for (int i = 0; i < 3; ++i) {
+      bits_total_[i] = 0;
+    }
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+    frame_parallel_decoding_mode_ = 1;
+    use_roi_ = false;
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  //
+
+  // For two layers, test pattern is:
+  //   1     3
+  // 0     2     .....
+  // For three layers, test pattern is:
+  //   1      3    5      7
+  //      2           6
+  // 0            4            ....
+  // LAST is always updated on base/layer 0, GOLDEN is updated on layer 1.
+  // For this 3 layer example, the 2nd enhancement layer (layer 2) updates
+  // the altref frame.
+  static int GetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L and ARF; update L.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G, ARF; update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
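+        // (A rough reading of the flag combination below: NO_UPD_GF and
+        // NO_UPD_LAST leave only the alt-ref buffer writable, and all three
+        // references stay readable because no NO_REF_* flag is set. With
+        // three layers, frames 0..7 thus map to layer ids 0,2,1,2,0,2,1,2;
+        // see SetLayerId() below.)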
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  static int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads));
+    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+                     frame_parallel_decoding_mode_);
+
+    if (use_roi_) {
+      encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+      encoder->Control(VP9E_SET_AQ_MODE, 0);
+    }
+
+    if (cfg_.ts_number_layers > 1) {
+      if (video->frame() == 0) {
+        encoder->Control(VP9E_SET_SVC, 1);
+      }
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      layer_id.temporal_layer_id =
+          SetLayerId(video->frame(), cfg_.ts_number_layers);
+      layer_id.temporal_layer_id_per_spatial[0] =
+          SetLayerId(video->frame(), cfg_.ts_number_layers);
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    if (duration > 1) {
+      // If first drop not set and we have a drop set it to this time.
+      if (!first_drop_) first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // Buffer should not go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Update the total encoded bits. For temporal layers, update the
+    // cumulative encoded bits per layer.
+    for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
+      bits_total_[i] += frame_size_in_bits;
+    }
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
+         ++layer) {
+      duration_ = (last_pts_ + 1) * timebase_;
+      if (bits_total_[layer]) {
+        // Effective file datarate:
+        effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
+      }
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int tune_content_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+  int64_t bits_total_[3];
+  double duration_;
+  double effective_datarate_[3];
+  int set_cpu_used_;
+  int64_t bits_in_buffer_model_;
+  vpx_codec_pts_t first_drop_;
+  int num_drops_;
+  int aq_mode_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  int frame_parallel_decoding_mode_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
+};
+
+// Params: speed setting and index for bitrate array.
+class DatarateTestVP9RealTimeMultiBR
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Params: speed setting and index for bitrate array.
+class DatarateTestVP9LargeVBR
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for VBR mode with 0 lag.
+TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagZero) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag.
+TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZero) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  // For non-zero lag, rate control will work (be within bounds) for
+  // real-time mode.
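+  // (In this test class SetUp() fixes the mode to kRealTime, so deadline_ is
+  // VPX_DL_REALTIME and the branch below enables a 15-frame lag; the else
+  // branch is only a safeguard for other deadlines.)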
+  if (deadline_ == VPX_DL_REALTIME) {
+    cfg_.g_lag_in_frames = 15;
+  } else {
+    cfg_.g_lag_in_frames = 0;
+  }
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag, with
+// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs
+// since error_resilience is off.
+TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZeroFrameParDecOff) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  // For non-zero lag, rate control will work (be within bounds) for
+  // real-time mode.
+  if (deadline_ == VPX_DL_REALTIME) {
+    cfg_.g_lag_in_frames = 15;
+  } else {
+    cfg_.g_lag_in_frames = 0;
+  }
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  frame_parallel_decoding_mode_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 150, 350, 550, 750 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode
+// off (and error_resilience off).
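+// (Background: with frame-parallel decoding mode and error resilience both
+// off, backward adaptation of the coefficient/mode/mv probabilities is
+// enabled, which typically helps compression slightly; the test below only
+// checks that CBR rate targeting still holds in that configuration.)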
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargetingFrameParDecOff) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 150, 350, 550, 750 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  frame_parallel_decoding_mode_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting444) {
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  const int bitrates[4] = { 250, 450, 650, 850 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+            effective_datarate_[0] * 0.80)
+      << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+            effective_datarate_[0] * 1.15)
+      << " The datarate for the file missed the target!"
+      << cfg_.rc_target_bitrate << " " << effective_datarate_[0];
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestVP9RealTimeMultiBR, ChangingDropFrameThresh) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  // TODO(marpan): Investigate datarate target failures with a smaller
+  // keyframe interval (128).
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  const int kDropFrameThreshTestStep = 30;
+  const int bitrates[2] = { 50, 150 };
+  const int bitrate_index = GET_PARAM(2);
+  if (bitrate_index > 1) return;
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  vpx_codec_pts_t last_drop = 140;
+  int last_num_drops = 0;
+  for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    ASSERT_GE(num_drops_, last_num_drops * 0.85)
+        << " The number of dropped frames for drop_thresh " << i
+        << " < number of dropped frames for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+    last_num_drops = num_drops_;
+  }
+}
+
+// Check basic rate targeting for 2 temporal layers.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting2TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 200, 400, 600, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  // 60-40 bitrate allocation for 2 temporal layers.
+  cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+  }
+}
+
+// Check basic rate targeting for 3 temporal layers.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting3TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
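+  // (Concretely, for a 30 fps input: layer 0 alone plays at 30/4 = 7.5 fps,
+  // layers 0+1 give 30/2 = 15 fps, and all three layers give the full 30 fps;
+  // each ts_rate_decimator entry divides the input framerate for that layer.)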
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 200, 400, 600, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    // TODO(yaowu): Work out more stable rc control strategy and
+    // adjust the thresholds to be tighter than .75.
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    // TODO(yaowu): Work out more stable rc control strategy and
+    // adjust the thresholds to be tighter than 1.25.
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+  }
+}
+
+// Params: speed setting.
+class DatarateTestVP9RealTime : public DatarateTestVP9,
+                                public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {}
+  virtual ~DatarateTestVP9RealTime() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
+TEST_P(DatarateTestVP9RealTime, BasicRateTargetingDropFramesMultiThreads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for 3 temporal layers, with frame dropping.
+// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
+// frame drop threshold, to force frame dropping.
+TEST_P(DatarateTestVP9RealTime,
+       BasicRateTargeting3TemporalLayersFrameDropping) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
+  cfg_.rc_dropframe_thresh = 20;
+  cfg_.rc_max_quantizer = 45;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
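+  // (Same 7.5/15/30 fps layer split as in the decimation note above.)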
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.20)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+    // Expect some frame drops in this test: for this 200 frames test,
+    // expect at least 10% and not more than 60% drops.
+    ASSERT_GE(num_drops_, 20);
+    ASSERT_LE(num_drops_, 280);
+  }
+}
+
+// Check VP9 region of interest feature.
+TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
+  if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 7) / 8;
+  roi_.cols = (cfg_.g_w + 7) / 8;
+
+  roi_.delta_q[1] = -20;
+  roi_.delta_lf[1] = -20;
+  memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+  roi_.ref_frame[1] = 1;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map = reinterpret_cast<uint8_t *>(
+      calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
+  ASSERT_TRUE(roi_.roi_map != NULL);
+
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
+// Params: speed setting.
+class DatarateTestVP9PostEncodeDrop
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check post-encode frame dropping for screen content in CBR mode, with
+// 2 threads.
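+// (Rough background, not a normative description: post-encode dropping lets
+// the encoder discard a frame after encoding it, when the actual encoded size
+// would badly overshoot the rate control buffer, instead of relying only on
+// the pre-encode drop decision; the screen-content tuning selected via
+// tune_content_ = 1 below exercises this path.)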
+TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+  cfg_.g_error_resilient = 0;
+  tune_content_ = 1;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Params: speed setting.
+class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime {
+ public:
+  virtual ~DatarateTestVP9RealTimeDenoiser() {}
+};
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on.
+TEST_P(DatarateTestVP9RealTimeDenoiser, LowNoise) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly (which is 1),
+  // but more modes may be added in the future.
+  cfg_.rc_target_bitrate = 400;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for clip with high noise level. Use 2 threads.
+TEST_P(DatarateTestVP9RealTimeDenoiser, HighNoise) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: kDenoiserOnYOnly (which is 1),
+  // but more modes may be added in the future.
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for 1280x720 clip with 4 threads.
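+// (For the 4-thread run below, PreEncodeFrameHook requests
+// get_msb(g_threads) = get_msb(4) = 2 as the log2 tile-column count, i.e.
+// 2^2 = 4 tile columns, so each thread can work on its own column.)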
+TEST_P(DatarateTestVP9RealTimeDenoiser, 4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 4;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly (which is 1),
+  // but more modes may be added in the future.
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9RealTimeDenoiser, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly (which is 1),
+  // but more modes may be added in the future.
+  cfg_.rc_target_bitrate = 400;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
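+  // (Schedule sketch: with a period of 100 on this 400-frame clip, the hook
+  // flips the denoiser whenever (frame + 1) % 100 == 0, i.e. at frames 99,
+  // 199, and 299, so it runs in alternating stretches of roughly 100 frames.)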
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeMultiBR,
+                          ::testing::Range(5, 10), ::testing::Range(0, 4));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9),
+                          ::testing::Range(0, 2));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 10));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9PostEncodeDrop,
+                          ::testing::Range(5, 6));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeDenoiser,
+                          ::testing::Range(5, 10));
+#endif
+}  // namespace
diff --git a/libs/libvpx/test/vp9_denoiser_test.cc b/libs/libvpx/test/vp9_denoiser_test.cc
index 56ca257c59..47fa587fca 100644
--- a/libs/libvpx/test/vp9_denoiser_test.cc
+++ b/libs/libvpx/test/vp9_denoiser_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -35,7 +36,7 @@ typedef int (*Vp9DenoiserFilterFunc)(const uint8_t *sig, int sig_stride,
                                      uint8_t *avg, int avg_stride,
                                      int increase_denoising, BLOCK_SIZE bs,
                                      int motion_magnitude);
-typedef std::tr1::tuple<Vp9DenoiserFilterFunc, BLOCK_SIZE> VP9DenoiserTestParam;
+typedef std::tuple<Vp9DenoiserFilterFunc, BLOCK_SIZE> VP9DenoiserTestParam;
 
 class VP9DenoiserTest
     : public ::testing::Test,
@@ -99,7 +100,7 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
   }
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 // Test for all block size.
 #if HAVE_SSE2
diff --git a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index 62e8dcb9b5..fade08bbd4 100644
--- a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
@@ -74,7 +76,7 @@ class VpxEncoderParmsGetToDecoder
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
       encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
       encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
@@ -138,7 +140,7 @@ class VpxEncoderParmsGetToDecoder
 TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
   init_flags_ = VPX_CODEC_USE_PSNR;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video(
+  std::unique_ptr<libvpx_test::VideoSource> video(
       new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
   ASSERT_TRUE(video.get() != NULL);
 
diff --git a/libs/libvpx/test/vp9_end_to_end_test.cc b/libs/libvpx/test/vp9_end_to_end_test.cc
index 955f567ce2..7cb716f226 100644
--- a/libs/libvpx/test/vp9_end_to_end_test.cc
+++ b/libs/libvpx/test/vp9_end_to_end_test.cc
@@ -8,10 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
@@ -21,14 +24,14 @@ namespace {
 const unsigned int kWidth = 160;
 const unsigned int kHeight = 90;
 const unsigned int kFramerate = 50;
-const unsigned int kFrames = 10;
+const unsigned int kFrames = 20;
 const int kBitrate = 500;
 // List of psnr thresholds for speed settings 0-7 and 5 encoding modes
 const double kPsnrThreshold[][5] = {
   { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
   { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
-  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 },
+  { 28.5, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
 };
 
 typedef struct {
@@ -45,13 +48,13 @@ const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1 },
   { "park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1 },
 #if CONFIG_VP9_HIGHBITDEPTH
-  { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 },
-  { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 },
-  { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 },
+  { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 },
+  { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 },
+  { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 },
   { "park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3 },
-  { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 },
-  { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 },
-  { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 },
+  { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 },
+  { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 },
+  { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 },
   { "park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3 },
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
@@ -59,11 +62,11 @@ const TestVideoParam kTestVectors[] = {
 // Encoding modes tested
 const libvpx_test::TestMode kEncodingModeVectors[] = {
   ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };
 
 // Speed settings tested
-const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6, 7 };
 
 int is_extension_y4m(const char *filename) {
   const char *dot = strrchr(filename, '.');
@@ -74,6 +77,43 @@ int is_extension_y4m(const char *filename) {
   }
 }
 
+class EndToEndTestAdaptiveRDThresh
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ protected:
+  EndToEndTestAdaptiveRDThresh()
+      : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)),
+        cpu_used_end_(GET_PARAM(2)) {}
+
+  virtual ~EndToEndTestAdaptiveRDThresh() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_);
+      encoder->Control(VP9E_SET_ROW_MT, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+    }
+    if (video->frame() == 100)
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_end_);
+  }
+
+ private:
+  int cpu_used_start_;
+  int cpu_used_end_;
+};
+
 class EndToEndTestLarge
     : public ::libvpx_test::EncoderTest,
       public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode,
                                                  TestVideoParam, int> {
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
@@ -123,6 +166,9 @@ class EndToEndTestLarge
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
       encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    } else {
+      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      encoder->Control(VP9E_SET_AQ_MODE, cyclic_refresh_);
     }
   }
 }
@@ -138,6 +184,8 @@ class EndToEndTestLarge
 
   TestVideoParam test_video_param_;
   int cpu_used_;
+  int cyclic_refresh_;
+  int denoiser_on_;
 
  private:
   double psnr_;
@@ -145,6 +193,50 @@ class EndToEndTestLarge
   libvpx_test::TestMode encoding_mode_;
 };
 
+#if CONFIG_VP9_DECODER
+// The test parameters control VP9D_SET_LOOP_FILTER_OPT and the number of
+// decoder threads.
+class EndToEndTestLoopFilterThreading
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<bool, int> {
+ protected:
+  EndToEndTestLoopFilterThreading()
+      : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {}
+
+  virtual ~EndToEndTestLoopFilterThreading() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_threads = 2;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.kf_min_dist = 1;
+    cfg_.kf_max_dist = 1;
+    dec_cfg_.threads = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 8);
+    }
+    encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5);
+  }
+
+  virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Decoder *decoder) {
+    if (video->frame() == 0) {
+      decoder->Control(VP9D_SET_LOOP_FILTER_OPT,
+                       use_loop_filter_opt_ ? 1 : 0);
+    }
+  }
+
+ private:
+  const bool use_loop_filter_opt_;
+};
+#endif  // CONFIG_VP9_DECODER
+
 TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
   cfg_.rc_target_bitrate = kBitrate;
   cfg_.g_error_resilient = 0;
@@ -154,7 +246,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
   init_flags_ = VPX_CODEC_USE_PSNR;
   if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  std::unique_ptr<libvpx_test::VideoSource> video;
   if (is_extension_y4m(test_video_param_.filename)) {
     video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 kFrames));
@@ -170,8 +262,63 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
   EXPECT_GT(psnr, GetPsnrThreshold());
 }
 
+TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cyclic_refresh_ = 3;
+  denoiser_on_ = 1;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+  } else {
+    video.reset(new libvpx_test::YUVVideoSource(
+        test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+        kFramerate, 1, 0, kFrames));
+  }
+  ASSERT_TRUE(video.get() != NULL);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, GetPsnrThreshold());
+}
+
+TEST_P(EndToEndTestAdaptiveRDThresh, EndtoEndAdaptiveRDThreshRowMT) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_threads = 2;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9_DECODER
+TEST_P(EndToEndTestLoopFilterThreading, TileCountChange) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(4096, 2160);
+  video.set_limit(10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#endif  // CONFIG_VP9_DECODER
+
 VP9_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
                           ::testing::ValuesIn(kEncodingModeVectors),
                           ::testing::ValuesIn(kTestVectors),
                           ::testing::ValuesIn(kCpuUsedVectors));
+
+VP9_INSTANTIATE_TEST_CASE(EndToEndTestAdaptiveRDThresh,
+                          ::testing::Values(5, 6, 7), ::testing::Values(8, 9));
+
+#if CONFIG_VP9_DECODER
+VP9_INSTANTIATE_TEST_CASE(EndToEndTestLoopFilterThreading, ::testing::Bool(),
+                          ::testing::Range(2, 6));
+#endif  // CONFIG_VP9_DECODER
 }  // namespace
diff --git a/libs/libvpx/test/vp9_ethread_test.cc b/libs/libvpx/test/vp9_ethread_test.cc
index 6b7e512116..6de76e9e55 100644
--- a/libs/libvpx/test/vp9_ethread_test.cc
+++ b/libs/libvpx/test/vp9_ethread_test.cc
@@ -387,7 +387,7 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double multi_thr_psnr = GetAveragePsnr();
 
-  EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.1);
+  EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.2);
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -409,7 +409,7 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(::libvpx_test::kTwoPassGood,
                       ::libvpx_test::kOnePassGood,
                       ::libvpx_test::kRealTime),
-    ::testing::Range(3, 9),    // cpu_used
+    ::testing::Range(3, 10),   // cpu_used
     ::testing::Range(0, 3),    // tile_columns
     ::testing::Range(2, 5)));  // threads
 
diff --git a/libs/libvpx/test/vp9_intrapred_test.cc b/libs/libvpx/test/vp9_intrapred_test.cc
index 39c5e79ebd..58091f875b 100644
--- a/libs/libvpx/test/vp9_intrapred_test.cc
+++ b/libs/libvpx/test/vp9_intrapred_test.cc
@@ -130,6 +130,12 @@ TEST_P(VP9IntraPredTest, IntraPredTests) {
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
+// Instantiate a token test to avoid -Wuninitialized warnings when none of the
+// other tests are enabled.
+INSTANTIATE_TEST_CASE_P(
+    C, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c,
+                                     &vpx_d45_predictor_4x4_c, 4, 8)));
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9IntraPredTest,
@@ -378,58 +384,61 @@ INSTANTIATE_TEST_CASE_P(
                            8)));
 #endif  // HAVE_MSA
 
-#if HAVE_VSX
-INSTANTIATE_TEST_CASE_P(
-    VSX, VP9IntraPredTest,
-    ::testing::Values(
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
         IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8,
                        8),
-        IntraPredParam(&vpx_d45_predictor_16x16_vsx, &vpx_d45_predictor_16x16_c,
-                       16, 8),
-        IntraPredParam(&vpx_d45_predictor_32x32_vsx, &vpx_d45_predictor_32x32_c,
-                       32, 8),
        IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8,
                        8),
-        IntraPredParam(&vpx_d63_predictor_16x16_vsx, &vpx_d63_predictor_16x16_c,
-                       16, 8),
-        IntraPredParam(&vpx_d63_predictor_32x32_vsx, &vpx_d63_predictor_32x32_c,
-                       32, 8),
-        IntraPredParam(&vpx_dc_128_predictor_16x16_vsx,
-                       &vpx_dc_128_predictor_16x16_c, 16, 8),
-        IntraPredParam(&vpx_dc_128_predictor_32x32_vsx,
-                       &vpx_dc_128_predictor_32x32_c, 32, 8),
-        IntraPredParam(&vpx_dc_left_predictor_16x16_vsx,
-                       &vpx_dc_left_predictor_16x16_c, 16, 8),
-        IntraPredParam(&vpx_dc_left_predictor_32x32_vsx,
-                       &vpx_dc_left_predictor_32x32_c, 32, 8),
        IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8,
                        8),
-        IntraPredParam(&vpx_dc_predictor_16x16_vsx, &vpx_dc_predictor_16x16_c,
-                       16, 8),
-        IntraPredParam(&vpx_dc_predictor_32x32_vsx, &vpx_dc_predictor_32x32_c,
-                       32, 8),
-        IntraPredParam(&vpx_dc_top_predictor_16x16_vsx,
-                       &vpx_dc_top_predictor_16x16_c, 16, 8),
-        IntraPredParam(&vpx_dc_top_predictor_32x32_vsx,
-                       &vpx_dc_top_predictor_32x32_c, 32, 8),
        IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8),
        IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8),
-        IntraPredParam(&vpx_h_predictor_16x16_vsx, &vpx_h_predictor_16x16_c, 16,
-                       8),
-        IntraPredParam(&vpx_h_predictor_32x32_vsx, &vpx_h_predictor_32x32_c, 32,
-                       8),
        IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4,
                        8),
        IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8,
                        8),
-        IntraPredParam(&vpx_tm_predictor_16x16_vsx, &vpx_tm_predictor_16x16_c,
-                       16, 8),
-        IntraPredParam(&vpx_tm_predictor_32x32_vsx, &vpx_tm_predictor_32x32_c,
-                       32, 8),
-        IntraPredParam(&vpx_v_predictor_16x16_vsx, &vpx_v_predictor_16x16_c, 16,
-                       8),
-        IntraPredParam(&vpx_v_predictor_32x32_vsx, &vpx_v_predictor_32x32_c, 32,
-                       8)));
+#endif
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(
+    VSX, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_vsx,
+                                     &vpx_d45_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_d45_predictor_32x32_vsx,
+                                     &vpx_d45_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_d63_predictor_16x16_vsx,
+                                     &vpx_d63_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_d63_predictor_32x32_vsx,
+                                     &vpx_d63_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_128_predictor_16x16_vsx,
+                                     &vpx_dc_128_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_128_predictor_32x32_vsx,
+                                     &vpx_dc_128_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_left_predictor_16x16_vsx,
+                                     &vpx_dc_left_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_left_predictor_32x32_vsx,
+                                     &vpx_dc_left_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_predictor_16x16_vsx,
+                                     &vpx_dc_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_predictor_32x32_vsx,
+                                     &vpx_dc_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_top_predictor_16x16_vsx,
+                                     &vpx_dc_top_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_top_predictor_32x32_vsx,
+                                     &vpx_dc_top_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_h_predictor_16x16_vsx,
+                                     &vpx_h_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_h_predictor_32x32_vsx,
+                                     &vpx_h_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_tm_predictor_16x16_vsx,
+                                     &vpx_tm_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_tm_predictor_32x32_vsx,
+                                     &vpx_tm_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_v_predictor_16x16_vsx,
+                                     &vpx_v_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_v_predictor_32x32_vsx,
+                                     &vpx_v_predictor_32x32_c, 32, 8)));
 #endif  // HAVE_VSX
 
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/test/vp9_lossless_test.cc b/libs/libvpx/test/vp9_lossless_test.cc
index 703b55e9bd..5cf0a41da4 100644
--- a/libs/libvpx/test/vp9_lossless_test.cc
+++ b/libs/libvpx/test/vp9_lossless_test.cc
@@ -38,7 +38,7 @@ class LosslessTest
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       // Only call Control if quantizer > 0 to verify that using quantizer
       // alone will activate lossless
       if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
diff --git a/libs/libvpx/test/vp9_motion_vector_test.cc b/libs/libvpx/test/vp9_motion_vector_test.cc
index 1030204ae3..b556a1c378 100644
--- a/libs/libvpx/test/vp9_motion_vector_test.cc
+++ b/libs/libvpx/test/vp9_motion_vector_test.cc
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
 */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
@@ -22,7 +24,7 @@ namespace {
 // Encoding modes
 const libvpx_test::TestMode kEncodingModeVectors[] = {
   ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };
 
 // Encoding speeds
@@ -59,7 +61,7 @@ class MotionVectorTestLarge
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -81,7 +83,7 @@ TEST_P(MotionVectorTestLarge, OverallTest) {
   cfg_.g_profile = 0;
   init_flags_ = VPX_CODEC_USE_PSNR;
 
-  testing::internal::scoped_ptr<libvpx_test::YUVVideoSource> video;
+  std::unique_ptr<libvpx_test::YUVVideoSource> video;
   video.reset(new libvpx_test::YUVVideoSource(
       "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
       30, 1, 0, 5));
diff --git a/libs/libvpx/test/vp9_quantize_test.cc b/libs/libvpx/test/vp9_quantize_test.cc
index b18d4522ce..cce6b6f198 100644
--- a/libs/libvpx/test/vp9_quantize_test.cc
+++ b/libs/libvpx/test/vp9_quantize_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -18,6 +19,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -26,6 +28,7 @@
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"
 #include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
@@ -41,8 +44,8 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                              tran_low_t *dqcoeff, const int16_t *dequant,
                              uint16_t *eob, const int16_t *scan,
                              const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
+typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
     QuantizeParam;
 
 // Wrapper for FP version which does not use zbin or quant_shift.
@@ -67,10 +70,13 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
                 scan, iscan);
 }
 
-class VP9QuantizeBase {
+class VP9QuantizeBase : public AbstractBench {
  public:
   VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
-      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp),
+        coeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 16)),
+        qcoeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)),
+        dqcoeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)) {
     max_value_ = (1 << bit_depth_) - 1;
     zbin_ptr_ =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
@@ -86,6 +92,9 @@ class VP9QuantizeBase {
         vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
     dequant_ptr_ = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
+
+    r_ptr_ = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    q_ptr_ = (is_fp_) ?
quant_fp_ptr_ : quant_ptr_; } ~VP9QuantizeBase() { @@ -118,6 +127,15 @@ class VP9QuantizeBase { int max_value_; const int max_size_; const bool is_fp_; + Buffer coeff_; + Buffer qcoeff_; + Buffer dqcoeff_; + int16_t *r_ptr_; + int16_t *q_ptr_; + int count_; + int skip_block_; + const scan_order *scan_; + uint16_t eob_; }; class VP9QuantizeTest : public VP9QuantizeBase, @@ -128,21 +146,29 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: + virtual void Run(); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; +void VP9QuantizeTest::Run() { + quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, + scan_->iscan); +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). -void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { +inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, int is_32x32) { int i, eob = -1; - const int thr = dequant_ptr[1] >> 1; + const int thr = dequant_ptr[1] >> (1 + is_32x32); (void)iscan; (void)skip_block; assert(!skip_block); @@ -172,11 +198,24 @@ void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // If all of the AC coeffs in a row has magnitude less than the // quantization step_size/2, quantize to zero. 
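// (Context for the check below: coefficients are processed in groups of 16,
// and nzflag_cnt counts how many of the 16 magnitudes fall below thr, which
// is dequant / 2 here, tightened to dequant / 4 when is_32x32 halves the
// effective step. Only a group containing at least one at-or-above-threshold
// coefficient takes the multiply/round path; a group that sits entirely below
// it is zeroed wholesale, mirroring the vector compare-and-skip that
// vp9_quantize_fp_sse2() performs.)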
if (nzflag_cnt < 16) { - int tmp = - clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; + int tmp; + int _round; + + if (is_32x32) { + _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + } else { + _round = round_ptr[rc != 0]; + } + tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32); qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (is_32x32) { + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } else { + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + } } else { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; @@ -195,6 +234,26 @@ void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } +void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); +} + +void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); +} + void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, int16_t *quant, int16_t *quant_shift, int16_t *dequant, int16_t *round_fp, @@ -236,19 +295,17 @@ void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer coeff = Buffer(max_size_, max_size_, 0, 16); - ASSERT_TRUE(coeff.Init()); - Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); - ASSERT_TRUE(qcoeff.Init()); - Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); - ASSERT_TRUE(dqcoeff.Init()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); Buffer ref_qcoeff = Buffer(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_qcoeff.Init()); Buffer ref_dqcoeff = Buffer(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_dqcoeff.Init()); - uint16_t eob, ref_eob; + uint16_t ref_eob = 0; + eob_ = 0; for (int i = 0; i < number_of_iterations; ++i) { // Test skip block for the first three iterations to catch all the different @@ -261,33 +318,31 @@ TEST_P(VP9QuantizeTest, OperationCheck) { sz = TX_32X32; } const TX_TYPE tx_type = static_cast((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); - coeff.Set(&rnd, -max_value_, max_value_); + scan_ = &vp9_scan_orders[sz][tx_type]; + count_ = (4 << sz) * (4 << sz); + coeff_.Set(&rnd, -max_value_, max_value_); GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; - ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, - q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_order->scan, scan_order->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, + r_ptr_, q_ptr_, quant_shift_ptr_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, - quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), - dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); - EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); - EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); - EXPECT_EQ(eob, ref_eob); + EXPECT_EQ(eob_, ref_eob); if (HasFailure()) { printf("Failure on iteration %d.\n", i); - qcoeff.PrintDifference(ref_qcoeff); - dqcoeff.PrintDifference(ref_dqcoeff); + qcoeff_.PrintDifference(ref_qcoeff); + dqcoeff_.PrintDifference(ref_dqcoeff); return; } } @@ -295,22 +350,21 @@ TEST_P(VP9QuantizeTest, OperationCheck) { TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer coeff = Buffer(max_size_, max_size_, 0, 16); - ASSERT_TRUE(coeff.Init()); - Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); - ASSERT_TRUE(qcoeff.Init()); - Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); - ASSERT_TRUE(dqcoeff.Init()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); Buffer ref_qcoeff = Buffer(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_qcoeff.Init()); Buffer ref_dqcoeff = Buffer(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_dqcoeff.Init()); - uint16_t eob, ref_eob; + uint16_t ref_eob = 0; + eob_ = 0; + const uint32_t max_index = max_size_ * max_size_ - 1; for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = 0; + skip_block_ = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -318,38 +372,36 @@ TEST_P(VP9QuantizeTest, EOBCheck) { sz = TX_32X32; } const TX_TYPE tx_type = static_cast((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + count_ = (4 << sz) * (4 << sz); // Two random entries - coeff.Set(0); - coeff.TopLeftPixel()[rnd(count)] = + coeff_.Set(0); + coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] = static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; - coeff.TopLeftPixel()[rnd(count)] = + coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] = static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; - ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, - q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_order->scan, scan_order->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, + r_ptr_, q_ptr_, quant_shift_ptr_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, - quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), - dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); - EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); - EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); - EXPECT_EQ(eob, ref_eob); + EXPECT_EQ(eob_, ref_eob); if (HasFailure()) { printf("Failure on iteration %d.\n", i); - qcoeff.PrintDifference(ref_qcoeff); - dqcoeff.PrintDifference(ref_dqcoeff); + qcoeff_.PrintDifference(ref_qcoeff); + dqcoeff_.PrintDifference(ref_dqcoeff); return; } } @@ -357,13 +409,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer coeff = Buffer(max_size_, max_size_, 0, 16); - ASSERT_TRUE(coeff.Init()); - Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); - ASSERT_TRUE(qcoeff.Init()); - Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); - ASSERT_TRUE(dqcoeff.Init()); - uint16_t eob; + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); TX_SIZE starting_sz, ending_sz; if (max_size_ == 16) { @@ -377,18 +425,16 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { // zbin > coeff, zbin < coeff. for (int i = 0; i < 2; ++i) { - const int skip_block = 0; + skip_block_ = 0; // TX_TYPE defines the scan order. That is not relevant to the speed test. // Pick the first one. const TX_TYPE tx_type = DCT_DCT; - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; if (i == 0) { // When |coeff values| are less than zbin the results are 0. 
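// (The speed test now defers its timing to the AbstractBench harness this
// patch introduces in test/bench.h: RunNTimes(n), used below, repeatedly
// invokes the virtual Run() - which VP9QuantizeTest overrides to call
// quantize_op_ with the members prepared here - and PrintMedian() appears to
// report the median of the timed repetitions, replacing the hand-rolled
// vpx_usec_timer loop that used to follow. The two scenarios are deliberate
// extremes: i == 0 keeps every |coeff| below zbin so the early-skip path
// dominates, while i == 1 forces full quantization arithmetic on most
// coefficients, separating the cost of the skip logic from the math itself.)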
@@ -399,40 +445,33 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { threshold = 200; } for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; - coeff.Set(&rnd, -99, 99); + coeff_.Set(&rnd, -99, 99); } else if (i == 1) { for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; - coeff.Set(&rnd, -500, 500); + coeff_.Set(&rnd, -500, 500); } - vpx_usec_timer timer; - vpx_usec_timer_start(&timer); - for (int j = 0; j < 100000000 / count; ++j) { - quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, - q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), - dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, - scan_order->scan, scan_order->iscan); - } - vpx_usec_timer_mark(&timer); - const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); - if (i == 0) printf("Bypass calculations.\n"); - if (i == 1) printf("Full calculations.\n"); - printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, - elapsed_time / 1000); + RunNTimes(10000000 / count_); + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + PrintMedian(title); } - printf("\n"); } } -using std::tr1::make_tuple; +using std::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds. -// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8), INSTANTIATE_TEST_CASE_P( SSE2, VP9QuantizeTest, ::testing::Values( + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, @@ -457,51 +496,52 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 -#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 #if ARCH_X86_64 INSTANTIATE_TEST_CASE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, - 16, true))); -#else -INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false))); -#endif - -#if ARCH_X86_64 -// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. -INSTANTIATE_TEST_CASE_P( - DISABLED_SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3, + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), make_tuple(&QuantFPWrapper, - &QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, true))); -#endif // ARCH_X86_64 -#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH - -// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or -// highbitdepth configurations. -#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +#else INSTANTIATE_TEST_CASE_P( - AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - // Even though SSSE3 and AVX do not match the reference - // code, we can keep them in sync with each other. 
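// (The instantiation shuffle below reads as follows: per the dropped TODO
// comments, the SSSE3/AVX 32x32 kernels now agree with the C reference, so
// the DISABLED_ cross-check that compared one SIMD flavor against another is
// removed and every variant is tested straight against vpx_quantize_b_32x32_c;
// most of the !CONFIG_VP9_HIGHBITDEPTH exclusions disappear at the same time,
// and AVX2 and VSX coverage is added.)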
- make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, + make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); -#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH -// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds. -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +#endif // ARCH_X86_64 +#endif // HAVE_SSSE3 + +#if HAVE_AVX +INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#endif // HAVE_AVX + +#if ARCH_X86_64 && HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); +#endif // HAVE_AVX2 + +#if HAVE_NEON INSTANTIATE_TEST_CASE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -515,7 +555,23 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_vsx, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH // Only useful to compare "Speed" test results. INSTANTIATE_TEST_CASE_P( @@ -528,6 +584,9 @@ INSTANTIATE_TEST_CASE_P( &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); diff --git a/libs/libvpx/test/vp9_scale_test.cc b/libs/libvpx/test/vp9_scale_test.cc index 5d7d38e89a..f3e7f0a0e2 100644 --- a/libs/libvpx/test/vp9_scale_test.cc +++ b/libs/libvpx/test/vp9_scale_test.cc @@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase, scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); } - void RunTest() { + void RunTest(INTERP_FILTER filter_type) { static const int kNumSizesToTest = 20; static const int kNumScaleFactorsToTest = 4; static const int kSizesToTest[] = { @@ -55,50 +55,48 @@ class ScaleTest : public VpxScaleBase, 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 }; static const int kScaleFactors[] = { 1, 2, 3, 4 }; - for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { - for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { - for (int h = 0; h < kNumSizesToTest; ++h) { - const int src_height = kSizesToTest[h]; - for (int w = 0; w < kNumSizesToTest; ++w) { - const int src_width = kSizesToTest[w]; - for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; - ++sf_up_idx) { - const int sf_up = kScaleFactors[sf_up_idx]; - for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; - ++sf_down_idx) { - const int sf_down = kScaleFactors[sf_down_idx]; - const int dst_width = src_width * sf_up / sf_down; - const int dst_height = src_height * sf_up / sf_down; - if (sf_up == sf_down && sf_up != 1) { - continue; - } - // I420 frame width and height must be even. 
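// (On the vp9_scale_test refactor in progress here: RunTest() now takes the
// INTERP_FILTER as a parameter instead of looping over all four filters
// itself, and the single ScaleFrame test becomes four per-filter TEST_Ps
// below. A failure then names the offending filter directly, and sharded
// test runs can schedule the four cases in parallel instead of serializing
// one long loop.)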
- if (!dst_width || !dst_height || dst_width & 1 || - dst_height & 1) { - continue; - } - // vpx_convolve8_c() has restriction on the step which cannot - // exceed 64 (ratio 1 to 4). - if (src_width > 4 * dst_width || src_height > 4 * dst_height) { - continue; - } - ASSERT_NO_FATAL_FAILURE(ResetScaleImages( - src_width, src_height, dst_width, dst_height)); - ReferenceScaleFrame(filter_type, phase_scaler); - ScaleFrame(filter_type, phase_scaler); - if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, - ref_img_.frame_size)) { - printf( - "filter_type = %d, phase_scaler = %d, src_width = %4d, " - "src_height = %4d, dst_width = %4d, dst_height = %4d, " - "scale factor = %d:%d\n", - filter_type, phase_scaler, src_width, src_height, - dst_width, dst_height, sf_down, sf_up); - PrintDiff(); - } - CompareImages(dst_img_); - DeallocScaleImages(); + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). + if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, + dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); } } } @@ -145,7 +143,10 @@ class ScaleTest : public VpxScaleBase, ScaleFrameFunc scale_fn_; }; -TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } +TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } +TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); } +TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); } +TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); } TEST_P(ScaleTest, DISABLED_Speed) { static const int kCountSpeedTestBlock = 100; diff --git a/libs/libvpx/test/vp9_spatial_svc_encoder.sh b/libs/libvpx/test/vp9_spatial_svc_encoder.sh deleted file mode 100755 index 65031073f8..0000000000 --- a/libs/libvpx/test/vp9_spatial_svc_encoder.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/sh -## -## Copyright (c) 2014 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. 
All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## -## This file tests the libvpx vp9_spatial_svc_encoder example. To add new -## tests to to this file, do the following: -## 1. Write a shell function (this is your test). -## 2. Add the function to vp9_spatial_svc_tests (on a new line). -## -. $(dirname $0)/tools_common.sh - -# Environment check: $YUV_RAW_INPUT is required. -vp9_spatial_svc_encoder_verify_environment() { - if [ ! -e "${YUV_RAW_INPUT}" ]; then - echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." - return 1 - fi -} - -# Runs vp9_spatial_svc_encoder. $1 is the test name. -vp9_spatial_svc_encoder() { - local readonly \ - encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}" - local readonly test_name="$1" - local readonly \ - output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" - local readonly frames_to_encode=10 - local readonly max_kf=9999 - - shift - - if [ ! -x "${encoder}" ]; then - elog "${encoder} does not exist or is not executable." - return 1 - fi - - eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \ - -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ - "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} - - [ -e "${output_file}" ] || return 1 -} - -# Each test is run with layer count 1-$vp9_ssvc_test_layers. -vp9_ssvc_test_layers=5 - -vp9_spatial_svc() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly test_name="vp9_spatial_svc" - for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -sl ${layers} - done - fi -} - -readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i - DISABLED_vp9_spatial_svc_mode_altip - DISABLED_vp9_spatial_svc_mode_ip - DISABLED_vp9_spatial_svc_mode_gf - vp9_spatial_svc" - -if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then - run_tests \ - vp9_spatial_svc_encoder_verify_environment \ - "${vp9_spatial_svc_tests}" -fi diff --git a/libs/libvpx/test/vp9_subtract_test.cc b/libs/libvpx/test/vp9_subtract_test.cc index 62845ad615..67e8de6c74 100644 --- a/libs/libvpx/test/vp9_subtract_test.cc +++ b/libs/libvpx/test/vp9_subtract_test.cc @@ -14,9 +14,11 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "vp9/common/vp9_blockd.h" +#include "vpx_ports/msvc.h" #include "vpx_mem/vpx_mem.h" typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, @@ -26,62 +28,101 @@ typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, namespace vp9 { -class VP9SubtractBlockTest : public ::testing::TestWithParam { +class VP9SubtractBlockTest : public AbstractBench, + public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + virtual void Run() { + GetParam()(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_); + } + + void SetupBlocks(BLOCK_SIZE bsize) { + block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize]; + block_height_ = 4 * num_4x4_blocks_high_lookup[bsize]; + diff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2)); + pred_ = reinterpret_cast( + vpx_memalign(16, block_width_ * block_height_ * 2)); + src_ = reinterpret_cast( + vpx_memalign(16, block_width_ * block_height_ * 2)); + } + + int block_width_; + int block_height_; + int16_t 
*diff_; + uint8_t *pred_; + uint8_t *src_; }; using libvpx_test::ACMRandom; +TEST_P(VP9SubtractBlockTest, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast(static_cast(bsize) + 1)) { + SetupBlocks(bsize); + + RunNTimes(100000000 / (block_height_ * block_width_)); + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", block_height_, + block_width_); + char title[100]; + snprintf(title, sizeof(title), "%8s ", block_size); + PrintMedian(title); + + vpx_free(diff_); + vpx_free(pred_); + vpx_free(src_); + } +} + TEST_P(VP9SubtractBlockTest, SimpleSubtract) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - // FIXME(rbultje) split in its own file for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; bsize = static_cast(static_cast(bsize) + 1)) { - const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize]; - const int block_height = 4 * num_4x4_blocks_high_lookup[bsize]; - int16_t *diff = reinterpret_cast( - vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2)); - uint8_t *pred = reinterpret_cast( - vpx_memalign(16, block_width * block_height * 2)); - uint8_t *src = reinterpret_cast( - vpx_memalign(16, block_width * block_height * 2)); + SetupBlocks(bsize); for (int n = 0; n < 100; n++) { - for (int r = 0; r < block_height; ++r) { - for (int c = 0; c < block_width * 2; ++c) { - src[r * block_width * 2 + c] = rnd.Rand8(); - pred[r * block_width * 2 + c] = rnd.Rand8(); + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_ * 2; ++c) { + src_[r * block_width_ * 2 + c] = rnd.Rand8(); + pred_[r * block_width_ * 2 + c] = rnd.Rand8(); } } - GetParam()(block_height, block_width, diff, block_width, src, block_width, - pred, block_width); + GetParam()(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_); - for (int r = 0; r < block_height; ++r) { - for (int c = 0; c < block_width; ++c) { - EXPECT_EQ(diff[r * block_width + c], - (src[r * block_width + c] - pred[r * block_width + c])) - << "r = " << r << ", c = " << c << ", bs = " << bsize; + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (src_[r * block_width_ + c] - pred_[r * block_width_ + c])) + << "r = " << r << ", c = " << c + << ", bs = " << static_cast(bsize); } } - GetParam()(block_height, block_width, diff, block_width * 2, src, - block_width * 2, pred, block_width * 2); + GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_, + block_width_ * 2, pred_, block_width_ * 2); - for (int r = 0; r < block_height; ++r) { - for (int c = 0; c < block_width; ++c) { - EXPECT_EQ( - diff[r * block_width * 2 + c], - (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c])) - << "r = " << r << ", c = " << c << ", bs = " << bsize; + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ * 2 + c], + (src_[r * block_width_ * 2 + c] - + pred_[r * block_width_ * 2 + c])) + << "r = " << r << ", c = " << c + << ", bs = " << static_cast(bsize); } } } - vpx_free(diff); - vpx_free(pred); - vpx_free(src); + vpx_free(diff_); + vpx_free(pred_); + vpx_free(src_); } } @@ -106,4 +147,9 @@ INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_mmi)); #endif +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_vsx)); +#endif + } // 
namespace vp9 diff --git a/libs/libvpx/test/vp9_thread_test.cc b/libs/libvpx/test/vp9_thread_test.cc index 576f5e906b..31b6fe57b4 100644 --- a/libs/libvpx/test/vp9_thread_test.cc +++ b/libs/libvpx/test/vp9_thread_test.cc @@ -147,7 +147,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) { // ----------------------------------------------------------------------------- // Multi-threaded decode tests - #if CONFIG_WEBM_IO struct FileList { const char *name; @@ -197,6 +196,7 @@ void DecodeFiles(const FileList files[]) { // Note any worker that requires synchronization between other workers will // hang. namespace impl { +namespace { void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); } int Reset(VPxWorker *const /*worker*/) { return 1; } @@ -209,6 +209,7 @@ void Execute(VPxWorker *const worker) { void Launch(VPxWorker *const worker) { Execute(worker); } void End(VPxWorker *const /*worker*/) {} +} // namespace } // namespace impl TEST(VPxWorkerThreadTest, TestSerialInterface) { diff --git a/libs/libvpx/test/vpx_scale_test.cc b/libs/libvpx/test/vpx_scale_test.cc index ac75dceb23..4fad3069af 100644 --- a/libs/libvpx/test/vpx_scale_test.cc +++ b/libs/libvpx/test/vpx_scale_test.cc @@ -20,6 +20,15 @@ #include "vpx_scale/yv12config.h" namespace libvpx_test { +namespace { + +#if ARCH_ARM || (ARCH_MIPS && !HAVE_MIPS64) || ARCH_X86 +// Avoid OOM failures on 32-bit platforms. +const int kNumSizesToTest = 7; +#else +const int kNumSizesToTest = 8; +#endif +const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 3840, 16383 }; typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, @@ -37,13 +46,6 @@ class ExtendBorderTest void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } void RunTest() { -#if ARCH_ARM - // Some arm devices OOM when trying to allocate the largest buffers. - static const int kNumSizesToTest = 6; -#else - static const int kNumSizesToTest = 7; -#endif - static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); @@ -76,13 +78,6 @@ class CopyFrameTest : public VpxScaleBase, } void RunTest() { -#if ARCH_ARM - // Some arm devices OOM when trying to allocate the largest buffers. - static const int kNumSizesToTest = 6; -#else - static const int kNumSizesToTest = 7; -#endif - static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); @@ -102,4 +97,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, ::testing::Values(vp8_yv12_copy_frame_c)); +} // namespace } // namespace libvpx_test diff --git a/libs/libvpx/test/vpx_scale_test.h b/libs/libvpx/test/vpx_scale_test.h index dcbd02b91f..11c259ae80 100644 --- a/libs/libvpx/test/vpx_scale_test.h +++ b/libs/libvpx/test/vpx_scale_test.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
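A quick sanity check on the vpx_scale_test size table above: an I420 frame
costs about w * h * 3 / 2 bytes, so the new largest case, 16383 x 16383, is
16383 * 16383 * 3 / 2 = 402,604,033 bytes, roughly 400 MB per buffer before
borders, and the tests hold source, reference, and destination images at
once. Capping kNumSizesToTest at 7 on the 32-bit targets (ARM, x86, MIPS32)
drops that 16383 case there, which is what keeps the suite inside a 32-bit
address space.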
*/ -#ifndef TEST_VPX_SCALE_TEST_H_ -#define TEST_VPX_SCALE_TEST_H_ +#ifndef VPX_TEST_VPX_SCALE_TEST_H_ +#define VPX_TEST_VPX_SCALE_TEST_H_ #include "third_party/googletest/src/include/gtest/gtest.h" @@ -33,7 +33,8 @@ class VpxScaleBase { const int height) { memset(img, 0, sizeof(*img)); ASSERT_EQ( - 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)); + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)) + << "for width: " << width << " height: " << height; memset(img->buffer_alloc, kBufFiller, img->frame_size); } @@ -197,4 +198,4 @@ class VpxScaleBase { } // namespace libvpx_test -#endif // TEST_VPX_SCALE_TEST_H_ +#endif // VPX_TEST_VPX_SCALE_TEST_H_ diff --git a/libs/libvpx/test/vpx_temporal_svc_encoder.sh b/libs/libvpx/test/vpx_temporal_svc_encoder.sh index 56a7902f4f..5e5bac8fa6 100755 --- a/libs/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/libs/libvpx/test/vpx_temporal_svc_encoder.sh @@ -38,6 +38,7 @@ vpx_tsvc_encoder() { local output_file="${VPX_TEST_OUTPUT_DIR}/${output_file_base}" local timebase_num="1" local timebase_den="1000" + local timebase_den_y4m="30" local speed="6" local frame_drop_thresh="30" local max_threads="4" @@ -58,6 +59,12 @@ vpx_tsvc_encoder() { "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ "$@" ${devnull} + # Test for y4m input. + eval "${VPX_TEST_PREFIX}" "${encoder}" "${Y4M_720P_INPUT}" \ + "${output_file}" "${codec}" "${Y4M_720P_INPUT_WIDTH}" \ + "${Y4M_720P_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den_y4m}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} else eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ @@ -85,7 +92,7 @@ files_exist() { vpx_tsvc_encoder_vp8_mode_0() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_0" + local output_basename="vpx_tsvc_encoder_vp8_mode_0" vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream files_exist "${output_basename}" 1 || return 1 @@ -94,7 +101,7 @@ vpx_tsvc_encoder_vp8_mode_0() { vpx_tsvc_encoder_vp8_mode_1() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_1" + local output_basename="vpx_tsvc_encoder_vp8_mode_1" vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -103,7 +110,7 @@ vpx_tsvc_encoder_vp8_mode_1() { vpx_tsvc_encoder_vp8_mode_2() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_2" + local output_basename="vpx_tsvc_encoder_vp8_mode_2" vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -112,7 +119,7 @@ vpx_tsvc_encoder_vp8_mode_2() { vpx_tsvc_encoder_vp8_mode_3() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_3" + local output_basename="vpx_tsvc_encoder_vp8_mode_3" vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -121,7 +128,7 @@ vpx_tsvc_encoder_vp8_mode_3() { vpx_tsvc_encoder_vp8_mode_4() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_4" + local 
output_basename="vpx_tsvc_encoder_vp8_mode_4" vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -130,7 +137,7 @@ vpx_tsvc_encoder_vp8_mode_4() { vpx_tsvc_encoder_vp8_mode_5() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_5" + local output_basename="vpx_tsvc_encoder_vp8_mode_5" vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -139,7 +146,7 @@ vpx_tsvc_encoder_vp8_mode_5() { vpx_tsvc_encoder_vp8_mode_6() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_6" + local output_basename="vpx_tsvc_encoder_vp8_mode_6" vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -148,7 +155,7 @@ vpx_tsvc_encoder_vp8_mode_6() { vpx_tsvc_encoder_vp8_mode_7() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_7" + local output_basename="vpx_tsvc_encoder_vp8_mode_7" vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams files_exist "${output_basename}" 5 || return 1 @@ -157,7 +164,7 @@ vpx_tsvc_encoder_vp8_mode_7() { vpx_tsvc_encoder_vp8_mode_8() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_8" + local output_basename="vpx_tsvc_encoder_vp8_mode_8" vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -166,7 +173,7 @@ vpx_tsvc_encoder_vp8_mode_8() { vpx_tsvc_encoder_vp8_mode_9() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_9" + local output_basename="vpx_tsvc_encoder_vp8_mode_9" vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -175,7 +182,7 @@ vpx_tsvc_encoder_vp8_mode_9() { vpx_tsvc_encoder_vp8_mode_10() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_10" + local output_basename="vpx_tsvc_encoder_vp8_mode_10" vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -184,7 +191,7 @@ vpx_tsvc_encoder_vp8_mode_10() { vpx_tsvc_encoder_vp8_mode_11() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_11" + local output_basename="vpx_tsvc_encoder_vp8_mode_11" vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -193,7 +200,7 @@ vpx_tsvc_encoder_vp8_mode_11() { vpx_tsvc_encoder_vp9_mode_0() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_0" + local output_basename="vpx_tsvc_encoder_vp9_mode_0" vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream files_exist "${output_basename}" 1 || return 1 @@ -202,7 +209,7 @@ vpx_tsvc_encoder_vp9_mode_0() { vpx_tsvc_encoder_vp9_mode_1() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_1" + local 
output_basename="vpx_tsvc_encoder_vp9_mode_1" vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -211,7 +218,7 @@ vpx_tsvc_encoder_vp9_mode_1() { vpx_tsvc_encoder_vp9_mode_2() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_2" + local output_basename="vpx_tsvc_encoder_vp9_mode_2" vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -220,7 +227,7 @@ vpx_tsvc_encoder_vp9_mode_2() { vpx_tsvc_encoder_vp9_mode_3() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_3" + local output_basename="vpx_tsvc_encoder_vp9_mode_3" vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -229,7 +236,7 @@ vpx_tsvc_encoder_vp9_mode_3() { vpx_tsvc_encoder_vp9_mode_4() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_4" + local output_basename="vpx_tsvc_encoder_vp9_mode_4" vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -238,7 +245,7 @@ vpx_tsvc_encoder_vp9_mode_4() { vpx_tsvc_encoder_vp9_mode_5() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_5" + local output_basename="vpx_tsvc_encoder_vp9_mode_5" vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -247,7 +254,7 @@ vpx_tsvc_encoder_vp9_mode_5() { vpx_tsvc_encoder_vp9_mode_6() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_6" + local output_basename="vpx_tsvc_encoder_vp9_mode_6" vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -256,7 +263,7 @@ vpx_tsvc_encoder_vp9_mode_6() { vpx_tsvc_encoder_vp9_mode_7() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_7" + local output_basename="vpx_tsvc_encoder_vp9_mode_7" vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams files_exist "${output_basename}" 5 || return 1 @@ -265,7 +272,7 @@ vpx_tsvc_encoder_vp9_mode_7() { vpx_tsvc_encoder_vp9_mode_8() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_8" + local output_basename="vpx_tsvc_encoder_vp9_mode_8" vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -274,7 +281,7 @@ vpx_tsvc_encoder_vp9_mode_8() { vpx_tsvc_encoder_vp9_mode_9() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_9" + local output_basename="vpx_tsvc_encoder_vp9_mode_9" vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -283,7 +290,7 @@ vpx_tsvc_encoder_vp9_mode_9() { vpx_tsvc_encoder_vp9_mode_10() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_10" + local 
output_basename="vpx_tsvc_encoder_vp9_mode_10" vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -292,7 +299,7 @@ vpx_tsvc_encoder_vp9_mode_10() { vpx_tsvc_encoder_vp9_mode_11() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_11" + local output_basename="vpx_tsvc_encoder_vp9_mode_11" vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams files_exist "${output_basename}" 3 || return 1 diff --git a/libs/libvpx/test/vpxdec.sh b/libs/libvpx/test/vpxdec.sh index de51c8004e..044aa7e16d 100755 --- a/libs/libvpx/test/vpxdec.sh +++ b/libs/libvpx/test/vpxdec.sh @@ -18,7 +18,8 @@ vpxdec_verify_environment() { if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \ [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \ - [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then + [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] || \ + [ ! -e "${VP9_RAW_FILE}" ]; then elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 fi @@ -33,8 +34,8 @@ vpxdec_verify_environment() { # input file path and shifted away. All remaining parameters are passed through # to vpxdec. vpxdec_pipe() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly input="$1" + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" shift cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull} } @@ -43,8 +44,8 @@ vpxdec_pipe() { # the directory containing vpxdec. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to vpxdec. vpxdec() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly input="$1" + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" shift eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull} } @@ -95,9 +96,9 @@ vpxdec_vp9_webm_less_than_50_frames() { # frames in actual webm_read_frame calls. if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly expected=10 - local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ + local decoder="$(vpx_tool_path vpxdec)" + local expected=10 + local num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \ | awk '/^[0-9]+ decoded frames/ { print $1 }') if [ "$num_frames" -ne "$expected" ]; then @@ -107,10 +108,28 @@ vpxdec_vp9_webm_less_than_50_frames() { fi } +# Ensures VP9_RAW_FILE correctly produces 1 frame instead of causing a hang. +vpxdec_vp9_raw_file() { + # Ensure a raw file properly reports eof and doesn't cause a hang. 
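  # (The timeout guard below is the point of this test: a decoder that never
  # detects EOF on a raw VP9 stream spins forever, so wrapping the run in
  # /usr/bin/timeout 30s converts a hang into a failure. When timeout is not
  # installed the test still runs, just without the safety net, and the -z
  # check on num_frames catches a pipeline that produced no output at all.)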
+ if [ "$(vpxdec_can_decode_vp9)" = "yes" ]; then + local decoder="$(vpx_tool_path vpxdec)" + local expected=1 + [ -x /usr/bin/timeout ] && local TIMEOUT="/usr/bin/timeout 30s" + local num_frames=$(${TIMEOUT} ${VPX_TEST_PREFIX} "${decoder}" \ + "${VP9_RAW_FILE}" --summary --noblit 2>&1 \ + | awk '/^[0-9]+ decoded frames/ { print $1 }') + if [ -z "$num_frames" ] || [ "$num_frames" -ne "$expected" ]; then + elog "Output frames ($num_frames) != expected ($expected)" + return 1 + fi + fi +} + vpxdec_tests="vpxdec_vp8_ivf vpxdec_vp8_ivf_pipe_input vpxdec_vp9_webm vpxdec_vp9_webm_frame_parallel - vpxdec_vp9_webm_less_than_50_frames" + vpxdec_vp9_webm_less_than_50_frames + vpxdec_vp9_raw_file" run_tests vpxdec_verify_environment "${vpxdec_tests}" diff --git a/libs/libvpx/test/vpxenc.sh b/libs/libvpx/test/vpxenc.sh index 0c160dafc0..f94e2e094a 100755 --- a/libs/libvpx/test/vpxenc.sh +++ b/libs/libvpx/test/vpxenc.sh @@ -67,7 +67,7 @@ y4m_input_720p() { # Echo default vpxenc real time encoding params. $1 is the codec, which defaults # to vp8 if unspecified. vpxenc_rt_params() { - local readonly codec="${1:-vp8}" + local codec="${1:-vp8}" echo "--codec=${codec} --buf-initial-sz=500 --buf-optimal-sz=600 @@ -104,8 +104,8 @@ vpxenc_passes_param() { # input file path and shifted away. All remaining parameters are passed through # to vpxenc. vpxenc_pipe() { - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="$1" + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" shift cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \ --test-decode=fatal \ @@ -116,8 +116,8 @@ vpxenc_pipe() { # the directory containing vpxenc. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to vpxenc. vpxenc() { - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="$1" + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" shift eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \ --test-decode=fatal \ @@ -126,7 +126,7 @@ vpxenc() { vpxenc_vp8_ivf() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -143,7 +143,7 @@ vpxenc_vp8_ivf() { vpxenc_vp8_webm() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -159,7 +159,7 @@ vpxenc_vp8_webm() { vpxenc_vp8_webm_rt() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" vpxenc $(yuv_input_hantro_collage) \ $(vpxenc_rt_params vp8) \ --output="${output}" @@ -173,7 +173,7 @@ vpxenc_vp8_webm_rt() { vpxenc_vp8_webm_2pass() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -190,9 +190,9 @@ vpxenc_vp8_webm_2pass() { vpxenc_vp8_webm_lag10_frames20() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=20 - local readonly lag_frames=10 - local 
readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${lag_total_frames}" \ @@ -210,7 +210,7 @@ vpxenc_vp8_webm_lag10_frames20() { vpxenc_vp8_ivf_piped_input() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" vpxenc_pipe $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -226,8 +226,8 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -245,8 +245,8 @@ vpxenc_vp9_ivf() { vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -263,7 +263,7 @@ vpxenc_vp9_webm() { vpxenc_vp9_webm_rt() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" vpxenc $(yuv_input_hantro_collage) \ $(vpxenc_rt_params vp9) \ --output="${output}" @@ -278,11 +278,11 @@ vpxenc_vp9_webm_rt() { vpxenc_vp9_webm_rt_multithread_tiled() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" - local readonly tilethread_min=2 - local readonly tilethread_max=4 - local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})" - local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" for threads in ${num_threads}; do for tile_cols in ${num_tile_cols}; do @@ -291,26 +291,25 @@ vpxenc_vp9_webm_rt_multithread_tiled() { --threads=${threads} \ --tile-columns=${tile_cols} \ --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" done done - - if [ ! -e "${output}" ]; then - elog "Output file does not exist." 
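# (Two recurring changes in these scripts meet here. Moving the existence
# check and the rm inside the threads x tile-columns loops means every
# combination is verified and cleaned up, where previously only the file left
# by the final iteration was checked. And the blanket "local readonly" ->
# "local" rewrite throughout: POSIX sh has no combined form, so "readonly"
# there was presumably just treated as another name handed to local and the
# attribute never actually applied; dropping it makes the declarations honest.)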
- return 1 - fi - - rm "${output}" fi } vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" - local readonly tilethread_min=2 - local readonly tilethread_max=4 - local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})" - local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" for threads in ${num_threads}; do for tile_cols in ${num_tile_cols}; do @@ -320,22 +319,20 @@ vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { --tile-columns=${tile_cols} \ --frame-parallel=1 \ --output="${output}" + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" done done - - if [ ! -e "${output}" ]; then - elog "Output file does not exist." - return 1 - fi - - rm "${output}" fi } vpxenc_vp9_webm_2pass() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -351,8 +348,8 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -370,8 +367,8 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -391,10 +388,10 @@ vpxenc_vp9_ivf_minq0_maxq0() { vpxenc_vp9_webm_lag10_frames20() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=20 - local readonly lag_frames=10 - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" - local readonly passes=$(vpxenc_passes_param) + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ @@ -414,8 +411,8 @@ vpxenc_vp9_webm_lag10_frames20() { vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -429,6 +426,42 @@ vpxenc_vp9_webm_non_square_par() { fi } +vpxenc_vp9_webm_sharpness() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local sharpnesses="0 1 2 3 4 5 6 
7" + local output="${VPX_TEST_OUTPUT_DIR}/vpxenc_vp9_webm_sharpness.ivf" + local last_size=0 + local this_size=0 + + for sharpness in ${sharpnesses}; do + + vpxenc $(yuv_input_hantro_collage) \ + --sharpness="${sharpness}" \ + --codec=vp9 \ + --limit=1 \ + --cpu-used=2 \ + --end-usage=q \ + --cq-level=40 \ + --output="${output}" \ + "${passes}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + + this_size=$(stat -c '%s' "${output}") + if [ "${this_size}" -lt "${last_size}" ]; then + elog "Higher sharpness value yielded lower file size." + echo "${this_size}" " < " "${last_size}" + return 1 + fi + last_size="${this_size}" + + done + fi +} + vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt @@ -441,7 +474,9 @@ vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 - vpxenc_vp9_webm_non_square_par" + vpxenc_vp9_webm_non_square_par + vpxenc_vp9_webm_sharpness" + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then vpxenc_tests="$vpxenc_tests vpxenc_vp8_webm_2pass diff --git a/libs/libvpx/test/webm_video_source.h b/libs/libvpx/test/webm_video_source.h index 09c007a3f3..6f55f7db7c 100644 --- a/libs/libvpx/test/webm_video_source.h +++ b/libs/libvpx/test/webm_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_WEBM_VIDEO_SOURCE_H_ -#define TEST_WEBM_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_WEBM_VIDEO_SOURCE_H_ +#define VPX_TEST_WEBM_VIDEO_SOURCE_H_ #include #include #include @@ -90,4 +90,4 @@ class WebMVideoSource : public CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_WEBM_VIDEO_SOURCE_H_ +#endif // VPX_TEST_WEBM_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/test/y4m_test.cc b/libs/libvpx/test/y4m_test.cc index ced717a7c1..76d033d52a 100644 --- a/libs/libvpx/test/y4m_test.cc +++ b/libs/libvpx/test/y4m_test.cc @@ -40,18 +40,18 @@ const Y4mTestParam kY4mTestVectors[] = { "284a47a47133b12884ec3a14e959a0b6" }, { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, "90517ff33843d85de712fd4fe60dbed0" }, - { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, - "63f21f9f717d8b8631bd2288ee87137b" }, - { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, - "48ab51fb540aed07f7ff5af130c9b605" }, - { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, - "067bfd75aa85ff9bae91fa3e0edd1e3e" }, - { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, - "9e6d8f6508c6e55625f6b697bc461cef" }, - { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, - "b239c6b301c0b835485be349ca83a7e3" }, - { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, - "5a6481a550821dab6d0192f5c63845e9" }, + { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, + "2f56ab9809269f074df7e3daf1ce0be6" }, + { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, + "1b5c73d2e8e8c4e02dc4889ecac41c83" }, + { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, + "ec4ab5be53195c5b838d1d19e1bc2674" }, + { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, + "3370856c8ddebbd1f9bb2e66f97677f4" }, + { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, + "4eab364318dd8201acbb182e43bd4966" }, + { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, + "f189dfbbd92119fc8e5f211a550166be" }, }; static void write_image_file(const vpx_image_t *img, FILE *file) { diff --git a/libs/libvpx/test/y4m_video_source.h b/libs/libvpx/test/y4m_video_source.h index 1301f69703..89aa2a44fc 100644 --- 
+    for sharpness in ${sharpnesses}; do
+
+      vpxenc $(yuv_input_hantro_collage) \
+        --sharpness="${sharpness}" \
+        --codec=vp9 \
+        --limit=1 \
+        --cpu-used=2 \
+        --end-usage=q \
+        --cq-level=40 \
+        --output="${output}" \
+        "${passes}"
+
+      if [ ! -e "${output}" ]; then
+        elog "Output file does not exist."
+        return 1
+      fi
+
+      this_size=$(stat -c '%s' "${output}")
+      if [ "${this_size}" -lt "${last_size}" ]; then
+        elog "Higher sharpness value yielded lower file size."
+        echo "${this_size}" " < " "${last_size}"
+        return 1
+      fi
+      last_size="${this_size}"
+
+    done
+  fi
+}
+
 vpxenc_tests="vpxenc_vp8_ivf
               vpxenc_vp8_webm
               vpxenc_vp8_webm_rt
@@ -441,7 +474,9 @@ vpxenc_tests="vpxenc_vp8_ivf
               vpxenc_vp9_ivf_lossless
               vpxenc_vp9_ivf_minq0_maxq0
               vpxenc_vp9_webm_lag10_frames20
-              vpxenc_vp9_webm_non_square_par"
+              vpxenc_vp9_webm_non_square_par
+              vpxenc_vp9_webm_sharpness"
+
 if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then
   vpxenc_tests="$vpxenc_tests
                 vpxenc_vp8_webm_2pass
diff --git a/libs/libvpx/test/webm_video_source.h b/libs/libvpx/test/webm_video_source.h
index 09c007a3f3..6f55f7db7c 100644
--- a/libs/libvpx/test/webm_video_source.h
+++ b/libs/libvpx/test/webm_video_source.h
@@ -7,8 +7,8 @@
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_WEBM_VIDEO_SOURCE_H_
-#define TEST_WEBM_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_WEBM_VIDEO_SOURCE_H_
+#define VPX_TEST_WEBM_VIDEO_SOURCE_H_
 #include
 #include
 #include
@@ -90,4 +90,4 @@ class WebMVideoSource : public CompressedVideoSource {
 }  // namespace libvpx_test
-#endif  // TEST_WEBM_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/y4m_test.cc b/libs/libvpx/test/y4m_test.cc
index ced717a7c1..76d033d52a 100644
--- a/libs/libvpx/test/y4m_test.cc
+++ b/libs/libvpx/test/y4m_test.cc
@@ -40,18 +40,18 @@ const Y4mTestParam kY4mTestVectors[] = {
     "284a47a47133b12884ec3a14e959a0b6" },
   { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
     "90517ff33843d85de712fd4fe60dbed0" },
-  { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
-    "63f21f9f717d8b8631bd2288ee87137b" },
-  { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
-    "48ab51fb540aed07f7ff5af130c9b605" },
-  { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
-    "067bfd75aa85ff9bae91fa3e0edd1e3e" },
-  { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
-    "9e6d8f6508c6e55625f6b697bc461cef" },
-  { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
-    "b239c6b301c0b835485be349ca83a7e3" },
-  { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
-    "5a6481a550821dab6d0192f5c63845e9" },
+  { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016,
+    "2f56ab9809269f074df7e3daf1ce0be6" },
+  { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216,
+    "1b5c73d2e8e8c4e02dc4889ecac41c83" },
+  { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416,
+    "ec4ab5be53195c5b838d1d19e1bc2674" },
+  { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016,
+    "3370856c8ddebbd1f9bb2e66f97677f4" },
+  { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216,
+    "4eab364318dd8201acbb182e43bd4966" },
+  { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416,
+    "f189dfbbd92119fc8e5f211a550166be" },
 };
 static void write_image_file(const vpx_image_t *img, FILE *file) {
diff --git a/libs/libvpx/test/y4m_video_source.h b/libs/libvpx/test/y4m_video_source.h
index 1301f69703..89aa2a44fc 100644
--- a/libs/libvpx/test/y4m_video_source.h
+++ b/libs/libvpx/test/y4m_video_source.h
@@ -7,9 +7,10 @@
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_Y4M_VIDEO_SOURCE_H_
-#define TEST_Y4M_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_Y4M_VIDEO_SOURCE_H_
+#define VPX_TEST_Y4M_VIDEO_SOURCE_H_
 #include <algorithm>
+#include <memory>
 #include <string>
 #include "test/video_source.h"
@@ -108,7 +109,7 @@ class Y4mVideoSource : public VideoSource {
   std::string file_name_;
   FILE *input_file_;
-  testing::internal::scoped_ptr<vpx_image_t> img_;
+  std::unique_ptr<vpx_image_t> img_;
   unsigned int start_;
   unsigned int limit_;
   unsigned int frame_;
@@ -119,4 +120,4 @@
 }  // namespace libvpx_test
-#endif  // TEST_Y4M_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/yuv_temporal_filter_test.cc b/libs/libvpx/test/yuv_temporal_filter_test.cc
new file mode 100644
index 0000000000..8f3c58b038
--- /dev/null
+++ b/libs/libvpx/test/yuv_temporal_filter_test.cc
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*YUVTemporalFilterFunc)(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  YUVTemporalFilterFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
+int GetFilterWeight(unsigned int row, unsigned int col,
+                    unsigned int block_height, unsigned int block_width,
+                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+template <typename PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+                int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
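+// The specializations below replace the exact division sum_dist * 3 / index
+// with a fixed-point multiply: index_mult[i] is roughly (3 << 16) / i for
+// 8-bit pixels and (3 << 32) / i for 16-bit pixels. For example, for
+// index == 6, (3 << 16) / 6 == 32768, so
+// (sum_dist * 32768) >> 16 == sum_dist / 2 == sum_dist * 3 / 6.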
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+                          int filter_weight) {
+  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
+                             3221225472U, 2576980378U, 2147483648U,
+                             1840700270U, 1610612736U, 1431655766U,
+                             1288490189U, 1171354718U, 0U,          991146300U };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <typename PixelType>
+void ApplyReferenceFilter(
+    const Buffer<PixelType> &y_src, const Buffer<PixelType> &y_pre,
+    const Buffer<PixelType> &u_src, const Buffer<PixelType> &v_src,
+    const Buffer<PixelType> &u_pre, const Buffer<PixelType> &v_pre,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *const blk_fw, int use_32x32,
+    Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_counter,
+    Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_counter,
+    Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_counter) {
+  const PixelType *y_src_ptr = y_src.TopLeftPixel();
+  const PixelType *y_pre_ptr = y_pre.TopLeftPixel();
+  const PixelType *u_src_ptr = u_src.TopLeftPixel();
+  const PixelType *u_pre_ptr = u_pre.TopLeftPixel();
+  const PixelType *v_src_ptr = v_src.TopLeftPixel();
+  const PixelType *v_pre_ptr = v_pre.TopLeftPixel();
+
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride();
+  const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride();
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+
+  Buffer<int> y_dif = Buffer<int>(block_width, block_height, 0);
+  Buffer<int> u_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+  Buffer<int> v_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+
+  ASSERT_TRUE(y_dif.Init());
+  ASSERT_TRUE(u_dif.Init());
+  ASSERT_TRUE(v_dif.Init());
+  y_dif.Set(0);
+  u_dif.Set(0);
+  v_dif.Set(0);
+
+  int *y_diff_ptr = y_dif.TopLeftPixel();
+  int *u_diff_ptr = u_dif.TopLeftPixel();
+  int *v_diff_ptr = v_dif.TopLeftPixel();
+
+  uint32_t *y_accum = y_accumulator->TopLeftPixel();
+  uint32_t *u_accum = u_accumulator->TopLeftPixel();
+  uint32_t *v_accum = v_accumulator->TopLeftPixel();
+  uint16_t *y_count = y_counter->TopLeftPixel();
+  uint16_t *u_count = u_counter->TopLeftPixel();
+  uint16_t *v_count = v_counter->TopLeftPixel();
+
+  const int y_accum_stride = y_accumulator->stride();
+  const int u_accum_stride = u_accumulator->stride();
+  const int v_accum_stride = v_accumulator->stride();
+  const int y_count_stride = y_counter->stride();
+  const int u_count_stride = u_counter->stride();
+  const int v_count_stride = v_counter->stride();
+
+  const int rounding = (1 << strength) >> 1;
+
+  // Get the square diffs
+  for (int row = 0; row < static_cast<int>(block_height); row++) {
+    for (int col = 0; col < static_cast<int>(block_width); col++) {
+      const int diff = y_src_ptr[row * y_src_stride + col] -
+                       y_pre_ptr[row * y_pre_stride + col];
+      y_diff_ptr[row * y_diff_stride + col] = diff * diff;
+    }
+  }
+
+  for (int row = 0; row < uv_block_height; row++) {
+    for (int col = 0; col < uv_block_width; col++) {
+      const int u_diff = u_src_ptr[row * uv_src_stride + col] -
+                         u_pre_ptr[row * uv_pre_stride + col];
+      const int v_diff = v_src_ptr[row * uv_src_stride + col] -
+                         v_pre_ptr[row * uv_pre_stride + col];
+      u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff;
+    }
+  }
+
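+  // Each pixel's filter weight is modulated by the local sum of squared
+  // differences computed above: the modifier is
+  //   filter_weight * (16 - min(16, (sum_sq * 3 / num_used + rounding) >>
+  //   strength)),
+  // so pixels that match the prediction well keep close to the full weight
+  // while pixels in poorly predicted regions contribute little or nothing.
+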
+  // Apply the filter to luma
+  for (int row = 0; row < static_cast<int>(block_height); row++) {
+    for (int col = 0; col < static_cast<int>(block_width); col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
+      const int filter_weight = GetFilterWeight(row, col, block_height,
+                                                block_width, blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre_ptr[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < static_cast<int>(block_height) &&
+              sub_col >= 0 && sub_col < static_cast<int>(block_width)) {
+            y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
+          }
+        }
+      }
+
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col];
+
+      y_num_used += 2;
+
+      // Set the modifier
+      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      y_count[row * y_count_stride + col] += y_mod;
+      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
+    }
+  }
+
+  // Apply the filter to chroma
+  for (int uv_row = 0; uv_row < uv_block_height; uv_row++) {
+    for (int uv_col = 0; uv_col < uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = GetFilterWeight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
+
+      // Sum all the luma pixels associated with the current chroma pixel
+      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * u_count_stride + uv_col] += u_mod;
+      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * v_count_stride + uv_col] += v_mod;
+      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
+    }
+  }
+}
+
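+// Test harness: CompareTestWithParam() runs both the C reference
+// implementation above and the function under test on identical inputs and
+// compares the resulting accumulators and counts; RunTestFilterWithParam()
+// only invokes the function under test, which is all the speed test needs.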
+class YUVTemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithBd> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam().temporal_filter;
+    bd_ = GetParam().bd;
+    use_highbd_ = (bd_ != 8);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    saturate_test_ = 0;
+    num_repeats_ = 10;
+
+    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
+  }
+
+ protected:
+  template <typename PixelType>
+  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
+                            int filter_strength, int use_32x32,
+                            const int *filter_weight);
+  template <typename PixelType>
+  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
+                              int filter_strength, int use_32x32,
+                              const int *filter_weight);
+  YUVTemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+  int saturate_test_;
+  int num_repeats_;
+  int use_highbd_;
+  int bd_;
+};
+
+template <typename PixelType>
+void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+                                                 int ss_x, int ss_y,
+                                                 int filter_strength,
+                                                 int use_32x32,
+                                                 const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
+  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count_ref.Init());
+  ASSERT_TRUE(y_accum_ref.Init());
+  ASSERT_TRUE(y_count_tst.Init());
+  ASSERT_TRUE(y_accum_tst.Init());
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count_ref.Init());
+  ASSERT_TRUE(u_accum_ref.Init());
+  ASSERT_TRUE(u_count_tst.Init());
+  ASSERT_TRUE(u_accum_tst.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count_ref.Init());
+  ASSERT_TRUE(v_accum_ref.Init());
+  ASSERT_TRUE(v_count_tst.Init());
+  ASSERT_TRUE(v_accum_tst.Init());
+
+  y_accum_ref.Set(0);
+  y_accum_tst.Set(0);
+  y_count_ref.Set(0);
+  y_count_tst.Set(0);
+  u_accum_ref.Set(0);
+  u_accum_tst.Set(0);
+  u_count_ref.Set(0);
+  u_count_tst.Set(0);
+  v_accum_ref.Set(0);
+  v_accum_tst.Set(0);
+  v_count_ref.Set(0);
+  v_count_tst.Set(0);
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    if (saturate_test_) {
+      const int max_val = (1 << bd_) - 1;
+      y_src.Set(max_val);
+      y_pre.Set(0);
+      u_src.Set(max_val);
+      u_pre.Set(0);
+      v_src.Set(max_val);
+      v_pre.Set(0);
+    } else {
+      y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+    }
+
+    ApplyReferenceFilter(
+        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
+        filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref,
+        &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref);
+
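+    // ASM_REGISTER_STATE_CHECK also verifies, on platforms that support the
+    // check, that the tested function leaves callee-saved registers in the
+    // state the ABI requires.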
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(),
+        u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(),
+        v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel()));
+
+    EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+    EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+    EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+    EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+    EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+    EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+    if (HasFailure()) {
+      if (use_32x32) {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
+               filter_strength, *filter_weight);
+      } else {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
+               ss_y, filter_strength, filter_weight[0], filter_weight[1],
+               filter_weight[2], filter_weight[3]);
+      }
+      y_accum_tst.PrintDifference(y_accum_ref);
+      y_count_tst.PrintDifference(y_count_ref);
+      u_accum_tst.PrintDifference(u_accum_ref);
+      u_count_tst.PrintDifference(u_count_ref);
+      v_accum_tst.PrintDifference(v_accum_ref);
+      v_count_tst.PrintDifference(v_count_ref);
+
+      return;
+    }
+  }
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+                                                   int ss_x, int ss_y,
+                                                   int filter_strength,
+                                                   int use_32x32,
+                                                   const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count.Init());
+  ASSERT_TRUE(y_accum.Init());
+
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count.Init());
+  ASSERT_TRUE(u_accum.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count.Init());
+  ASSERT_TRUE(v_accum.Init());
+
+  y_accum.Set(0);
+  y_count.Set(0);
+
+  u_accum.Set(0);
+  u_count.Set(0);
+
+  v_accum.Set(0);
+  v_count.Set(0);
+
+  y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(),
+        u_count.TopLeftPixel(), v_accum.TopLeftPixel(),
+        v_count.TopLeftPixel()));
+  }
+}
+
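+// For high bitdepth input the filter strength is raised by 2 * (bd - 8):
+// squared pixel differences grow by two bits for every extra bit of depth,
+// so the strength (a right shift) must grow by the same amount to keep the
+// modifier in the same range as in the 8-bit case.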
+TEST_P(YUVTemporalFilterTest, Use32x32) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           &filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          &filter_weight);
+          }
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use16x16) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 0;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
+        // Set up the filter
+        int filter_weight[4];
+        int filter_idx_cp = filter_idx;
+        for (int idx = 0; idx < 4; idx++) {
+          filter_weight[idx] = filter_idx_cp % 3;
+          filter_idx_cp /= 3;
+        }
+
+        // Test each parameter
+        for (int filter_strength = 0; filter_strength <= 6;
+             filter_strength += 2) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          filter_weight);
+          }
+
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, SaturationTest) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+  const int filter_weight = 1;
+  saturate_test_ = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        if (use_highbd_) {
+          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                         adjusted_strength, use_32x32,
+                                         &filter_weight);
+        } else {
+          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                        filter_strength, use_32x32,
+                                        &filter_weight);
+        }
+
+        ASSERT_FALSE(HasFailure());
+      }
+    }
+  }
+}
+
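+// The speed test is disabled by default; run it by passing
+// --gtest_also_run_disabled_tests on the command line.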
+TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
+  const int width = 32, height = 32;
+  num_repeats_ = 1000;
+
+  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
+    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
+    for (int ss_x = 0; ss_x <= 1; ss_x++) {
+      for (int ss_y = 0; ss_y <= 1; ss_y++) {
+        for (int filter_idx = 0; filter_idx < num_filter_weights;
+             filter_idx++) {
+          // Set up the filter
+          int filter_weight[4];
+          int filter_idx_cp = filter_idx;
+          for (int idx = 0; idx < 4; idx++) {
+            filter_weight[idx] = filter_idx_cp % 3;
+            filter_idx_cp /= 3;
+          }
+
+          // Test each parameter
+          for (int filter_strength = 0; filter_strength <= 6;
+               filter_strength += 2) {
+            vpx_usec_timer timer;
+            vpx_usec_timer_start(&timer);
+
+            if (use_highbd_) {
+              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                               filter_strength, use_32x32,
+                                               filter_weight);
+            } else {
+              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                              filter_strength, use_32x32,
+                                              filter_weight);
+            }
+
+            vpx_usec_timer_mark(&timer);
+            const int elapsed_time =
+                static_cast<int>(vpx_usec_timer_elapsed(&timer));
+
+            printf(
+                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
+                "%d, Strength: %d, Time: %5d\n",
+                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
+                elapsed_time);
+          }
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define WRAP_HIGHBD_FUNC(func, bd)                                            \
+  void wrap_##func##_##bd(                                                    \
+      const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,           \
+      int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,           \
+      int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,          \
+      int uv_pre_stride, unsigned int block_width, unsigned int block_height, \
+      int ss_x, int ss_y, int strength, const int *const blk_fw,              \
+      int use_32x32, uint32_t *y_accumulator, uint16_t *y_count,              \
+      uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator,    \
+      uint16_t *v_count) {                                                    \
+    func(reinterpret_cast<const uint16_t *>(y_src), y_src_stride,             \
+         reinterpret_cast<const uint16_t *>(y_pre), y_pre_stride,             \
+         reinterpret_cast<const uint16_t *>(u_src),                           \
+         reinterpret_cast<const uint16_t *>(v_src), uv_src_stride,            \
+         reinterpret_cast<const uint16_t *>(u_pre),                           \
+         reinterpret_cast<const uint16_t *>(v_pre), uv_pre_stride,            \
+         block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32,  \
+         y_accumulator, y_count, u_accumulator, u_count, v_accumulator,       \
+         v_count);                                                            \
+  }
+
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12);
+
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12)));
+#if HAVE_SSE4_1
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12);
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_10,
+                             10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12,
+                             12)));
+#endif  // HAVE_SSE4_1
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, YUVTemporalFilterTest,
+                        ::testing::Values(TemporalFilterWithBd(
+                            &vp9_apply_temporal_filter_sse4_1, 8)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libvpx/test/yuv_video_source.h b/libs/libvpx/test/yuv_video_source.h
index aee6b2ffbb..020ce801d9 100644
--- a/libs/libvpx/test/yuv_video_source.h
+++ b/libs/libvpx/test/yuv_video_source.h
@@ -7,8 +7,8 @@
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef TEST_YUV_VIDEO_SOURCE_H_ -#define TEST_YUV_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_YUV_VIDEO_SOURCE_H_ +#define VPX_TEST_YUV_VIDEO_SOURCE_H_ #include #include @@ -122,4 +122,4 @@ class YUVVideoSource : public VideoSource { } // namespace libvpx_test -#endif // TEST_YUV_VIDEO_SOURCE_H_ +#endif // VPX_TEST_YUV_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/third_party/googletest/README.libvpx b/libs/libvpx/third_party/googletest/README.libvpx index 2cd6910b41..49005ddac9 100644 --- a/libs/libvpx/third_party/googletest/README.libvpx +++ b/libs/libvpx/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ -URL: https://github.com/google/googletest -Version: 1.8.0 +URL: https://github.com/google/googletest.git +Version: release-1.8.1 License: BSD License File: LICENSE @@ -13,12 +13,16 @@ generation. Local Modifications: - Remove everything but: - googletest-release-1.8.0/googletest/ + googletest-release-1.8.1/googletest/ CHANGES CONTRIBUTORS include LICENSE README.md src -- Suppress unsigned overflow instrumentation in the LCG - https://github.com/google/googletest/pull/1066 + +- Make WithParamInterface::GetParam static in order to avoid + initialization issues + https://github.com/google/googletest/pull/1830 +- Use wcslen() instead of std::wcslen() + https://github.com/google/googletest/pull/1899 diff --git a/libs/libvpx/third_party/googletest/src/README.md b/libs/libvpx/third_party/googletest/src/README.md index edd4408054..e30fe80471 100644 --- a/libs/libvpx/third_party/googletest/src/README.md +++ b/libs/libvpx/third_party/googletest/src/README.md @@ -1,23 +1,21 @@ +### Generic Build Instructions -### Generic Build Instructions ### +#### Setup -#### Setup #### +To build Google Test and your tests that use it, you need to tell your build +system where to find its headers and source files. The exact way to do it +depends on which build system you use, and is usually straightforward. -To build Google Test and your tests that use it, you need to tell your -build system where to find its headers and source files. The exact -way to do it depends on which build system you use, and is usually -straightforward. +#### Build -#### Build #### - -Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, -create a library build target (or a project as called by Visual Studio -and Xcode) to compile +Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a +library build target (or a project as called by Visual Studio and Xcode) to +compile ${GTEST_DIR}/src/gtest-all.cc with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}` -in the normal header search path. Assuming a Linux-like system and gcc, +in the normal header search path. Assuming a Linux-like system and gcc, something like the following will do: g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ @@ -26,136 +24,239 @@ something like the following will do: (We need `-pthread` as Google Test uses threads.) -Next, you should compile your test source file with -`${GTEST_DIR}/include` in the system header search path, and link it -with gtest and any other necessary libraries: +Next, you should compile your test source file with `${GTEST_DIR}/include` in +the system header search path, and link it with gtest and any other necessary +libraries: g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ -o your_test -As an example, the make/ directory contains a Makefile that you can -use to build Google Test on systems where GNU make is available -(e.g. 
Linux, Mac OS X, and Cygwin). It doesn't try to build Google -Test's own tests. Instead, it just builds the Google Test library and -a sample test. You can use it as a starting point for your own build -script. +As an example, the make/ directory contains a Makefile that you can use to build +Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and +Cygwin). It doesn't try to build Google Test's own tests. Instead, it just +builds the Google Test library and a sample test. You can use it as a starting +point for your own build script. -If the default settings are correct for your environment, the -following commands should succeed: +If the default settings are correct for your environment, the following commands +should succeed: cd ${GTEST_DIR}/make make ./sample1_unittest -If you see errors, try to tweak the contents of `make/Makefile` to make -them go away. There are instructions in `make/Makefile` on how to do -it. +If you see errors, try to tweak the contents of `make/Makefile` to make them go +away. There are instructions in `make/Makefile` on how to do it. -### Using CMake ### +### Using CMake Google Test comes with a CMake build script ( -[CMakeLists.txt](CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for -cross-platform.). If you don't have CMake installed already, you can -download it for free from . +[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) +that can be used on a wide range of platforms ("C" stands for cross-platform.). +If you don't have CMake installed already, you can download it for free from +. -CMake works by generating native makefiles or build projects that can -be used in the compiler environment of your choice. The typical -workflow starts with: +CMake works by generating native makefiles or build projects that can be used in +the compiler environment of your choice. You can either build Google Test as a +standalone project or it can be incorporated into an existing CMake build for +another project. + +#### Standalone CMake Project + +When building Google Test as a standalone project, the typical workflow starts +with: mkdir mybuild # Create a directory to hold the build output. cd mybuild cmake ${GTEST_DIR} # Generate native build scripts. -If you want to build Google Test's samples, you should replace the -last command with +If you want to build Google Test's samples, you should replace the last command +with cmake -Dgtest_build_samples=ON ${GTEST_DIR} -If you are on a \*nix system, you should now see a Makefile in the -current directory. Just type 'make' to build gtest. +If you are on a \*nix system, you should now see a Makefile in the current +directory. Just type 'make' to build gtest. -If you use Windows and have Visual Studio installed, a `gtest.sln` file -and several `.vcproj` files will be created. You can then build them -using Visual Studio. +If you use Windows and have Visual Studio installed, a `gtest.sln` file and +several `.vcproj` files will be created. You can then build them using Visual +Studio. On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. -### Legacy Build Scripts ### +#### Incorporating Into An Existing CMake Project + +If you want to use gtest in a project which already uses CMake, then a more +robust and flexible approach is to build gtest as part of that project directly. +This is done by making the GoogleTest source code available to the main build +and adding it using CMake's `add_subdirectory()` command. 
This has the +significant advantage that the same compiler and linker settings are used +between gtest and the rest of your project, so issues associated with using +incompatible libraries (eg debug/release), etc. are avoided. This is +particularly useful on Windows. Making GoogleTest's source code available to the +main build can be done a few different ways: + +* Download the GoogleTest source code manually and place it at a known + location. This is the least flexible approach and can make it more difficult + to use with continuous integration systems, etc. +* Embed the GoogleTest source code as a direct copy in the main project's + source tree. This is often the simplest approach, but is also the hardest to + keep up to date. Some organizations may not permit this method. +* Add GoogleTest as a git submodule or equivalent. This may not always be + possible or appropriate. Git submodules, for example, have their own set of + advantages and drawbacks. +* Use CMake to download GoogleTest as part of the build's configure step. This + is just a little more complex, but doesn't have the limitations of the other + methods. + +The last of the above methods is implemented with a small piece of CMake code in +a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and +then invoked as a sub-build _during the CMake stage_. That directory is then +pulled into the main build with `add_subdirectory()`. For example: + +New file `CMakeLists.txt.in`: + + cmake_minimum_required(VERSION 2.8.2) + + project(googletest-download NONE) + + include(ExternalProject) + ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + +Existing build's `CMakeLists.txt`: + + # Download and unpack googletest at configure time + configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() + + # Prevent overriding the parent project's compiler/linker + # settings on Windows + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + # Add googletest directly to our build. This defines + # the gtest and gtest_main targets. + add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + + # The gtest/gtest_main targets carry header search path + # dependencies automatically when using CMake 2.8.11 or + # later. Otherwise we have to add them here ourselves. + if (CMAKE_VERSION VERSION_LESS 2.8.11) + include_directories("${gtest_SOURCE_DIR}/include") + endif() + + # Now simply link against gtest or gtest_main as needed. Eg + add_executable(example example.cpp) + target_link_libraries(example gtest_main) + add_test(NAME example_test COMMAND example) + +Note that this approach requires CMake 2.8.2 or later due to its use of the +`ExternalProject_Add()` command. 
The above technique is discussed in more detail +in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which +also contains a link to a fully generalized implementation of the technique. + +##### Visual Studio Dynamic vs Static Runtimes + +By default, new Visual Studio projects link the C runtimes dynamically but +Google Test links them statically. This will generate an error that looks +something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch +detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value +'MDd_DynamicDebug' in main.obj + +Google Test already has a CMake option for this: `gtest_force_shared_crt` + +Enabling this option will make gtest link the runtimes dynamically too, and +match the project in which it is included. + +### Legacy Build Scripts Before settling on CMake, we have been providing hand-maintained build -projects/scripts for Visual Studio, Xcode, and Autotools. While we -continue to provide them for convenience, they are not actively -maintained any more. We highly recommend that you follow the -instructions in the previous two sections to integrate Google Test -with your existing build system. +projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to +provide them for convenience, they are not actively maintained any more. We +highly recommend that you follow the instructions in the above sections to +integrate Google Test with your existing build system. If you still need to use the legacy build scripts, here's how: -The msvc\ folder contains two solutions with Visual C++ projects. -Open the `gtest.sln` or `gtest-md.sln` file using Visual Studio, and you -are ready to build Google Test the same way you build any Visual -Studio project. Files that have names ending with -md use DLL -versions of Microsoft runtime libraries (the /MD or the /MDd compiler -option). Files without that suffix use static versions of the runtime -libraries (the /MT or the /MTd option). Please note that one must use -the same option to compile both gtest and the test code. If you use -Visual Studio 2005 or above, we recommend the -md version as /MD is -the default for new projects in these versions of Visual Studio. +The msvc\ folder contains two solutions with Visual C++ projects. Open the +`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to +build Google Test the same way you build any Visual Studio project. Files that +have names ending with -md use DLL versions of Microsoft runtime libraries (the +/MD or the /MDd compiler option). Files without that suffix use static versions +of the runtime libraries (the /MT or the /MTd option). Please note that one must +use the same option to compile both gtest and the test code. If you use Visual +Studio 2005 or above, we recommend the -md version as /MD is the default for new +projects in these versions of Visual Studio. -On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using -Xcode. Build the "gtest" target. The universal binary framework will -end up in your selected build directory (selected in the Xcode -"Preferences..." -> "Building" pane and defaults to xcode/build). -Alternatively, at the command line, enter: +On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode. +Build the "gtest" target. The universal binary framework will end up in your +selected build directory (selected in the Xcode "Preferences..." -> "Building" +pane and defaults to xcode/build). 
Alternatively, at the command line, enter: xcodebuild -This will build the "Release" configuration of gtest.framework in your -default build location. See the "xcodebuild" man page for more -information about building different configurations and building in -different locations. +This will build the "Release" configuration of gtest.framework in your default +build location. See the "xcodebuild" man page for more information about +building different configurations and building in different locations. -If you wish to use the Google Test Xcode project with Xcode 4.x and -above, you need to either: +If you wish to use the Google Test Xcode project with Xcode 4.x and above, you +need to either: - * update the SDK configuration options in xcode/Config/General.xconfig. - Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If - you choose this route you lose the ability to target earlier versions - of MacOS X. - * Install an SDK for an earlier version. This doesn't appear to be - supported by Apple, but has been reported to work - (http://stackoverflow.com/questions/5378518). +* update the SDK configuration options in xcode/Config/General.xconfig. + Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If + you choose this route you lose the ability to target earlier versions of + MacOS X. +* Install an SDK for an earlier version. This doesn't appear to be supported + by Apple, but has been reported to work + (http://stackoverflow.com/questions/5378518). -### Tweaking Google Test ### +### Tweaking Google Test -Google Test can be used in diverse environments. The default -configuration may not work (or may not work well) out of the box in -some environments. However, you can easily tweak Google Test by -defining control macros on the compiler command line. Generally, -these macros are named like `GTEST_XYZ` and you define them to either 1 -or 0 to enable or disable a certain feature. +Google Test can be used in diverse environments. The default configuration may +not work (or may not work well) out of the box in some environments. However, +you can easily tweak Google Test by defining control macros on the compiler +command line. Generally, these macros are named like `GTEST_XYZ` and you define +them to either 1 or 0 to enable or disable a certain feature. -We list the most frequently used macros below. For a complete list, -see file [include/gtest/internal/gtest-port.h](include/gtest/internal/gtest-port.h). +We list the most frequently used macros below. For a complete list, see file +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h). -### Choosing a TR1 Tuple Library ### +### Choosing a TR1 Tuple Library -Some Google Test features require the C++ Technical Report 1 (TR1) -tuple library, which is not yet available with all compilers. The -good news is that Google Test implements a subset of TR1 tuple that's -enough for its own need, and will automatically use this when the -compiler doesn't provide TR1 tuple. +Some Google Test features require the C++ Technical Report 1 (TR1) tuple +library, which is not yet available with all compilers. The good news is that +Google Test implements a subset of TR1 tuple that's enough for its own need, and +will automatically use this when the compiler doesn't provide TR1 tuple. -Usually you don't need to care about which tuple library Google Test -uses. 
However, if your project already uses TR1 tuple, you need to -tell Google Test to use the same TR1 tuple library the rest of your -project uses, or the two tuple implementations will clash. To do -that, add +Usually you don't need to care about which tuple library Google Test uses. +However, if your project already uses TR1 tuple, you need to tell Google Test to +use the same TR1 tuple library the rest of your project uses, or the two tuple +implementations will clash. To do that, add -DGTEST_USE_OWN_TR1_TUPLE=0 -to the compiler flags while compiling Google Test and your tests. If -you want to force Google Test to use its own tuple library, just add +to the compiler flags while compiling Google Test and your tests. If you want to +force Google Test to use its own tuple library, just add -DGTEST_USE_OWN_TR1_TUPLE=1 @@ -167,15 +268,15 @@ If you don't want Google Test to use tuple at all, add and all features using tuple will be disabled. -### Multi-threaded Tests ### +### Multi-threaded Tests -Google Test is thread-safe where the pthread library is available. -After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` -macro to see whether this is the case (yes if the macro is `#defined` to -1, no if it's undefined.). +Google Test is thread-safe where the pthread library is available. After +`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see +whether this is the case (yes if the macro is `#defined` to 1, no if it's +undefined.). -If Google Test doesn't correctly detect whether pthread is available -in your environment, you can force it with +If Google Test doesn't correctly detect whether pthread is available in your +environment, you can force it with -DGTEST_HAS_PTHREAD=1 @@ -183,26 +284,24 @@ or -DGTEST_HAS_PTHREAD=0 -When Google Test uses pthread, you may need to add flags to your -compiler and/or linker to select the pthread library, or you'll get -link errors. If you use the CMake script or the deprecated Autotools -script, this is taken care of for you. If you use your own build -script, you'll need to read your compiler and linker's manual to -figure out what flags to add. +When Google Test uses pthread, you may need to add flags to your compiler and/or +linker to select the pthread library, or you'll get link errors. If you use the +CMake script or the deprecated Autotools script, this is taken care of for you. +If you use your own build script, you'll need to read your compiler and linker's +manual to figure out what flags to add. -### As a Shared Library (DLL) ### +### As a Shared Library (DLL) -Google Test is compact, so most users can build and link it as a -static library for the simplicity. You can choose to use Google Test -as a shared library (known as a DLL on Windows) if you prefer. +Google Test is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use Google Test as a shared library (known +as a DLL on Windows) if you prefer. To compile *gtest* as a shared library, add -DGTEST_CREATE_SHARED_LIBRARY=1 -to the compiler flags. You'll also need to tell the linker to produce -a shared library instead - consult your linker's manual for how to do -it. +to the compiler flags. You'll also need to tell the linker to produce a shared +library instead - consult your linker's manual for how to do it. To compile your *tests* that use the gtest shared library, add @@ -210,31 +309,28 @@ To compile your *tests* that use the gtest shared library, add to the compiler flags. 
-Note: while the above steps aren't technically necessary today when -using some compilers (e.g. GCC), they may become necessary in the -future, if we decide to improve the speed of loading the library (see - for details). Therefore you are -recommended to always add the above flags when using Google Test as a -shared library. Otherwise a future release of Google Test may break -your build script. +Note: while the above steps aren't technically necessary today when using some +compilers (e.g. GCC), they may become necessary in the future, if we decide to +improve the speed of loading the library (see + for details). Therefore you are recommended +to always add the above flags when using Google Test as a shared library. +Otherwise a future release of Google Test may break your build script. -### Avoiding Macro Name Clashes ### +### Avoiding Macro Name Clashes -In C++, macros don't obey namespaces. Therefore two libraries that -both define a macro of the same name will clash if you `#include` both -definitions. In case a Google Test macro clashes with another -library, you can force Google Test to rename its macro to avoid the -conflict. +In C++, macros don't obey namespaces. Therefore two libraries that both define a +macro of the same name will clash if you `#include` both definitions. In case a +Google Test macro clashes with another library, you can force Google Test to +rename its macro to avoid the conflict. -Specifically, if both Google Test and some other code define macro -FOO, you can add +Specifically, if both Google Test and some other code define macro FOO, you can +add -DGTEST_DONT_DEFINE_FOO=1 -to the compiler flags to tell Google Test to change the macro's name -from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, -or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll -need to write +to the compiler flags to tell Google Test to change the macro's name from `FOO` +to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For +example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write GTEST_TEST(SomeTest, DoesThis) { ... } @@ -243,38 +339,3 @@ instead of TEST(SomeTest, DoesThis) { ... } in order to define a test. - -## Developing Google Test ## - -This section discusses how to make your own changes to Google Test. - -### Testing Google Test Itself ### - -To make sure your changes work as intended and don't break existing -functionality, you'll want to compile and run Google Test's own tests. -For that you can use CMake: - - mkdir mybuild - cd mybuild - cmake -Dgtest_build_tests=ON ${GTEST_DIR} - -Make sure you have Python installed, as some of Google Test's tests -are written in Python. If the cmake command complains about not being -able to find Python (`Could NOT find PythonInterp (missing: -PYTHON_EXECUTABLE)`), try telling it explicitly where your Python -executable can be found: - - cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} - -Next, you can build Google Test and all of its own tests. On \*nix, -this is usually done by 'make'. To run the tests, do - - make test - -All tests should pass. - -Normally you don't need to worry about regenerating the source files, -unless you need to modify them. In that case, you should modify the -corresponding .pump files instead and run the pump.py Python script to -regenerate them. You can find pump.py in the [scripts/](scripts/) directory. -Read the [Pump manual](docs/PumpManual.md) for how to use it. 
diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h index 957a69c6a9..20c54d8695 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -26,14 +26,14 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for death tests. It is // #included by gtest.h so a user doesn't need to include this // directly. +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ @@ -99,10 +99,11 @@ GTEST_API_ bool InDeathTestChild(); // // On the regular expressions used in death tests: // +// GOOGLETEST_CM0005 DO NOT DELETE // On POSIX-compliant systems (*nix), we use the library, // which uses the POSIX extended regex syntax. // -// On other platforms (e.g. Windows), we only support a simple regex +// On other platforms (e.g. Windows or Mac), we only support a simple regex // syntax implemented as part of Google Test. This limited // implementation should be enough most of the time when writing // death tests; though it lacks many features you can find in PCRE @@ -160,7 +161,7 @@ GTEST_API_ bool InDeathTestChild(); // is rarely a problem as people usually don't put the test binary // directory in PATH. // -// TODO(wan@google.com): make thread-safe death tests search the PATH. +// FIXME: make thread-safe death tests search the PATH. // Asserts that a given statement causes the program to exit, with an // integer exit status that satisfies predicate, and emitting error output @@ -198,9 +199,10 @@ class GTEST_API_ ExitedWithCode { const int exit_code_; }; -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Tests that an exit code describes an exit due to termination by a // given signal. +// GOOGLETEST_CM0006 DO NOT DELETE class GTEST_API_ KilledBySignal { public: explicit KilledBySignal(int signum); @@ -272,6 +274,54 @@ class GTEST_API_ KilledBySignal { # endif // NDEBUG for EXPECT_DEBUG_DEATH #endif // GTEST_HAS_DEATH_TEST +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. It is exposed publicly so that +// systems that have death-tests with stricter requirements than +// GTEST_HAS_DEATH_TEST can write their own equivalent of +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. 
+// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if // death tests are supported; otherwise they just issue a warning. This is @@ -284,9 +334,9 @@ class GTEST_API_ KilledBySignal { ASSERT_DEATH(statement, regex) #else # define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) # define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) #endif } // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h index fe879bca79..5ca041614c 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the Message class. // @@ -43,6 +42,8 @@ // to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user // program! +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ @@ -50,6 +51,9 @@ #include "gtest/internal/gtest-port.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // Ensures that there is at least one operator<< in the global namespace. // See Message& operator<<(...) below for why. 
void operator<<(const testing::internal::Secret&, int); @@ -196,7 +200,6 @@ class GTEST_API_ Message { std::string GetString() const; private: - #if GTEST_OS_SYMBIAN // These are needed as the Nokia Symbian Compiler cannot decide between // const T& and const T* in a function template. The Nokia compiler _can_ @@ -247,4 +250,6 @@ std::string StreamableToString(const T& streamable) { } // namespace internal } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h index 038f9ba79e..3e95e4390e 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -31,13 +31,12 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: vladl@google.com (Vlad Losev) -// // Macros and functions for implementing parameterized tests -// in Google C++ Testing Framework (Google Test) +// in Google C++ Testing and Mocking Framework (Google Test) // // This file is generated by a SCRIPT. DO NOT EDIT BY HAND! // +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ @@ -79,7 +78,7 @@ TEST_P(FooTest, HasBlahBlah) { // Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test // case with any set of parameters you want. Google Test defines a number // of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. Here is a summary of them, which +// (surprise!) parameter generators. Here is a summary of them, which // are all in the testing namespace: // // @@ -185,15 +184,10 @@ TEST_P(DerivedTest, DoesBlah) { # include #endif -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-param-util-generated.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Functions producing parameter generators. @@ -273,7 +267,7 @@ internal::ParamGenerator Range(T start, T end) { // each with C-string values of "foo", "bar", and "baz": // // const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings)); // // This instantiates tests from test case StlStringTest // each with STL strings with values "a" and "b": @@ -1375,8 +1369,6 @@ internal::CartesianProductHolder10AddTestPattern(\ - #test_case_name, \ - #test_name, \ + GTEST_STRINGIFY_(test_case_name), \ + GTEST_STRINGIFY_(test_name), \ new ::testing::internal::TestMetaFactory< \ GTEST_TEST_CLASS_NAME_(\ test_case_name, test_name)>()); \ @@ -1412,21 +1404,21 @@ internal::CartesianProductHolder10, and return std::string. // // testing::PrintToStringParamName is a builtin test suffix generator that -// returns the value of testing::PrintToString(GetParam()). It does not work -// for std::string or C strings. +// returns the value of testing::PrintToString(GetParam()). 
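A compilable sketch of the value-parameterized pattern documented above, built on the header's own StringSequence/StringTest/ValuesIn example (the test body is illustrative):

```cpp
#include <string>
#include "gtest/gtest.h"

class StringTest : public ::testing::TestWithParam<const char*> {};

TEST_P(StringTest, IsNonEmpty) {
  // GetParam() yields the current C-string parameter.
  EXPECT_FALSE(std::string(GetParam()).empty());
}

static const char* strings[] = {"foo", "bar", "baz"};
INSTANTIATE_TEST_CASE_P(StringSequence, StringTest,
                        ::testing::ValuesIn(strings));
```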
// // Note: test names must be non-empty, unique, and may only contain ASCII -// alphanumeric characters or underscore. +// alphanumeric characters or underscore. Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. # define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ - ::testing::internal::ParamGenerator \ + static ::testing::internal::ParamGenerator \ gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ const ::testing::TestParamInfo& info) { \ return ::testing::internal::GetParamNameGen \ (__VA_ARGS__)(info); \ } \ - int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ GetTestCasePatternHolder(\ #test_case_name, \ @@ -1439,6 +1431,4 @@ internal::CartesianProductHolder10 #endif -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-param-util-generated.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Functions producing parameter generators. @@ -272,7 +266,7 @@ internal::ParamGenerator Range(T start, T end) { // each with C-string values of "foo", "bar", and "baz": // // const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings)); // // This instantiates tests from test case StlStringTest // each with STL strings with values "a" and "b": @@ -441,8 +435,6 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( ]] # endif // GTEST_HAS_COMBINE - - # define TEST_P(test_case_name, test_name) \ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ : public test_case_name { \ @@ -456,8 +448,8 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( #test_case_name, \ ::testing::internal::CodeLocation(\ __FILE__, __LINE__))->AddTestPattern(\ - #test_case_name, \ - #test_name, \ + GTEST_STRINGIFY_(test_case_name), \ + GTEST_STRINGIFY_(test_name), \ new ::testing::internal::TestMetaFactory< \ GTEST_TEST_CLASS_NAME_(\ test_case_name, test_name)>()); \ @@ -485,14 +477,14 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( // to std::string and C strings, it won't work for these types. # define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ - ::testing::internal::ParamGenerator \ + static ::testing::internal::ParamGenerator \ gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ const ::testing::TestParamInfo& info) { \ return ::testing::internal::GetParamNameGen \ (__VA_ARGS__)(info); \ } \ - int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\ GetTestCasePatternHolder(\ #test_case_name, \ @@ -505,6 +497,4 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h index 8a33164cb3..51865f84e6 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// Google Test - The Google C++ Testing Framework + +// Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a // value of any type T: @@ -46,6 +45,10 @@ // 2. operator<<(ostream&, const T&) defined in either foo or the // global namespace. // +// However if T is an STL-style container then it is printed element-wise +// unless foo::PrintTo(const T&, ostream*) is defined. Note that +// operator<<() is ignored for container types. +// // If none of the above is defined, it will print the debug string of // the value if it is a protocol buffer, or print the raw bytes in the // value otherwise. @@ -92,6 +95,8 @@ // being defined as many user-defined container types don't have // value_type. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ @@ -107,6 +112,12 @@ # include #endif +#if GTEST_HAS_ABSL +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#endif // GTEST_HAS_ABSL + namespace testing { // Definitions in the 'internal' and 'internal2' name spaces are @@ -125,7 +136,11 @@ enum TypeKind { kProtobuf, // a protobuf type kConvertibleToInteger, // a type implicitly convertible to BiggestInt // (e.g. a named or unnamed enum type) - kOtherType // anything else +#if GTEST_HAS_ABSL + kConvertibleToStringView, // a type implicitly convertible to + // absl::string_view +#endif + kOtherType // anything else }; // TypeWithoutFormatter::PrintValue(value, os) is called @@ -137,7 +152,8 @@ class TypeWithoutFormatter { public: // This default version is called when kTypeKind is kOtherType. static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(reinterpret_cast(&value), + PrintBytesInObjectTo(static_cast( + reinterpret_cast(&value)), sizeof(value), os); } }; @@ -151,10 +167,10 @@ template class TypeWithoutFormatter { public: static void PrintValue(const T& value, ::std::ostream* os) { - const ::testing::internal::string short_str = value.ShortDebugString(); - const ::testing::internal::string pretty_str = - short_str.length() <= kProtobufOneLinerMaxLength ? 
- short_str : ("\n" + value.DebugString()); + std::string pretty_str = value.ShortDebugString(); + if (pretty_str.length() > kProtobufOneLinerMaxLength) { + pretty_str = "\n" + value.DebugString(); + } *os << ("<" + pretty_str + ">"); } }; @@ -175,6 +191,19 @@ class TypeWithoutFormatter { } }; +#if GTEST_HAS_ABSL +template +class TypeWithoutFormatter { + public: + // Since T has neither operator<< nor PrintTo() but can be implicitly + // converted to absl::string_view, we print it as a absl::string_view. + // + // Note: the implementation is further below, as it depends on + // internal::PrintTo symbol which is defined later in the file. + static void PrintValue(const T& value, ::std::ostream* os); +}; +#endif + // Prints the given value to the given ostream. If the value is a // protocol message, its debug string is printed; if it's an enum or // of a type implicitly convertible to BiggestInt, it's printed as an @@ -202,10 +231,19 @@ class TypeWithoutFormatter { template ::std::basic_ostream& operator<<( ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value ? kProtobuf : - internal::ImplicitlyConvertible::value ? - kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); + TypeWithoutFormatter::value + ? kProtobuf + : internal::ImplicitlyConvertible< + const T&, internal::BiggestInt>::value + ? kConvertibleToInteger + : +#if GTEST_HAS_ABSL + internal::ImplicitlyConvertible< + const T&, absl::string_view>::value + ? kConvertibleToStringView + : +#endif + kOtherType)>::PrintValue(x, &os); return os; } @@ -364,11 +402,18 @@ class UniversalPrinter; template void UniversalPrint(const T& value, ::std::ostream* os); +enum DefaultPrinterType { + kPrintContainer, + kPrintPointer, + kPrintFunctionPointer, + kPrintOther, +}; +template struct WrapPrinterType {}; + // Used to print an STL-style container when the user doesn't define // a PrintTo() for it. template -void DefaultPrintTo(IsContainer /* dummy */, - false_type /* is not a pointer */, +void DefaultPrintTo(WrapPrinterType /* dummy */, const C& container, ::std::ostream* os) { const size_t kMaxCount = 32; // The maximum number of elements to print. *os << '{'; @@ -401,40 +446,34 @@ void DefaultPrintTo(IsContainer /* dummy */, // implementation-defined. Therefore they will be printed as raw // bytes.) template -void DefaultPrintTo(IsNotContainer /* dummy */, - true_type /* is a pointer */, +void DefaultPrintTo(WrapPrinterType /* dummy */, T* p, ::std::ostream* os) { if (p == NULL) { *os << "NULL"; } else { - // C++ doesn't allow casting from a function pointer to any object - // pointer. - // - // IsTrue() silences warnings: "Condition is always true", - // "unreachable code". - if (IsTrue(ImplicitlyConvertible::value)) { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. However, we cannot cast it to const void* directly, - // even using reinterpret_cast, as earlier versions of gcc - // (e.g. 3.4.5) cannot compile the cast when p is a function - // pointer. Casting to UInt64 first solves the problem. - *os << reinterpret_cast( - reinterpret_cast(p)); - } + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. 
+ *os << p; + } +} +template +void DefaultPrintTo(WrapPrinterType /* dummy */, + T* p, ::std::ostream* os) { + if (p == NULL) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); } } // Used to print a non-container, non-pointer value when the user // doesn't define PrintTo() for it. template -void DefaultPrintTo(IsNotContainer /* dummy */, - false_type /* is not a pointer */, +void DefaultPrintTo(WrapPrinterType /* dummy */, const T& value, ::std::ostream* os) { ::testing_internal::DefaultPrintNonContainerTo(value, os); } @@ -452,11 +491,8 @@ void DefaultPrintTo(IsNotContainer /* dummy */, // wants). template void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first two - // arguments determine which version will be picked. If T is an - // STL-style container, the version for container will be called; if - // T is a pointer, the pointer version will be called; otherwise the - // generic version will be called. + // DefaultPrintTo() is overloaded. The type of its first argument + // determines which version will be picked. // // Note that we check for container types here, prior to we check // for protocol message types in our operator<<. The rationale is: @@ -468,13 +504,27 @@ void PrintTo(const T& value, ::std::ostream* os) { // elements; therefore we check for container types here to ensure // that our format is used. // - // The second argument of DefaultPrintTo() is needed to bypass a bug - // in Symbian's C++ compiler that prevents it from picking the right - // overload between: - // - // PrintTo(const T& x, ...); - // PrintTo(T* x, ...); - DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); + // Note that MSVC and clang-cl do allow an implicit conversion from + // pointer-to-function to pointer-to-object, but clang-cl warns on it. + // So don't use ImplicitlyConvertible if it can be helped since it will + // cause this warning, and use a separate overload of DefaultPrintTo for + // function pointers so that the `*os << p` in the object pointer overload + // doesn't cause that warning either. + DefaultPrintTo( + WrapPrinterType < + (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value + ? kPrintContainer + : !is_pointer::value + ? kPrintOther +#if GTEST_LANG_CXX11 + : std::is_function::type>::value +#else + : !internal::ImplicitlyConvertible::value +#endif + ? kPrintFunctionPointer + : kPrintPointer > (), + value, os); } // The following list of PrintTo() overloads tells @@ -581,6 +631,17 @@ inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { } #endif // GTEST_HAS_STD_WSTRING +#if GTEST_HAS_ABSL +// Overload for absl::string_view. +inline void PrintTo(absl::string_view sp, ::std::ostream* os) { + PrintTo(::std::string(sp), os); +} +#endif // GTEST_HAS_ABSL + +#if GTEST_LANG_CXX11 +inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } +#endif // GTEST_LANG_CXX11 + #if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ // Helper function for printing a tuple. T must be instantiated with // a tuple type. 
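To ground the dispatch logic above: the supported user-facing extension point is a PrintTo overload in the value's own namespace, found by argument-dependent lookup ahead of the fallback printers that DefaultPrintTo selects. A sketch with a hypothetical type Bar:

```cpp
#include <ostream>

namespace foo {
struct Bar { int x; };

// Found via argument-dependent lookup; takes precedence over the
// container/pointer/other fallbacks chosen by DefaultPrintTo.
void PrintTo(const Bar& bar, ::std::ostream* os) {
  *os << "Bar(" << bar.x << ")";
}
}  // namespace foo
```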
@@ -710,6 +771,48 @@ class UniversalPrinter { GTEST_DISABLE_MSC_WARNINGS_POP_() }; +#if GTEST_HAS_ABSL + +// Printer for absl::optional + +template +class UniversalPrinter<::absl::optional> { + public: + static void Print(const ::absl::optional& value, ::std::ostream* os) { + *os << '('; + if (!value) { + *os << "nullopt"; + } else { + UniversalPrint(*value, os); + } + *os << ')'; + } +}; + +// Printer for absl::variant + +template +class UniversalPrinter<::absl::variant> { + public: + static void Print(const ::absl::variant& value, ::std::ostream* os) { + *os << '('; + absl::visit(Visitor{os}, value); + *os << ')'; + } + + private: + struct Visitor { + template + void operator()(const U& u) const { + *os << "'" << GetTypeName() << "' with value "; + UniversalPrint(u, os); + } + ::std::ostream* os; + }; +}; + +#endif // GTEST_HAS_ABSL + // UniversalPrintArray(begin, len, os) prints an array of 'len' // elements, starting at address 'begin'. template @@ -723,7 +826,7 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { // If the array has more than kThreshold elements, we'll have to // omit some details by printing only the first and the last // kChunkSize elements. - // TODO(wan@google.com): let the user control the threshold using a flag. + // FIXME: let the user control the threshold using a flag. if (len <= kThreshold) { PrintRawArrayTo(begin, len, os); } else { @@ -805,7 +908,7 @@ class UniversalTersePrinter { if (str == NULL) { *os << "NULL"; } else { - UniversalPrint(string(str), os); + UniversalPrint(std::string(str), os); } } }; @@ -856,7 +959,7 @@ void UniversalPrint(const T& value, ::std::ostream* os) { UniversalPrinter::Print(value, os); } -typedef ::std::vector Strings; +typedef ::std::vector< ::std::string> Strings; // TuplePolicy must provide: // - tuple_size @@ -875,12 +978,13 @@ struct TuplePolicy { static const size_t tuple_size = ::std::tr1::tuple_size::value; template - struct tuple_element : ::std::tr1::tuple_element {}; + struct tuple_element : ::std::tr1::tuple_element(I), Tuple> { + }; template - static typename AddReference< - const typename ::std::tr1::tuple_element::type>::type get( - const Tuple& tuple) { + static typename AddReference(I), Tuple>::type>::type + get(const Tuple& tuple) { return ::std::tr1::get(tuple); } }; @@ -976,6 +1080,16 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { } // namespace internal +#if GTEST_HAS_ABSL +namespace internal2 { +template +void TypeWithoutFormatter::PrintValue( + const T& value, ::std::ostream* os) { + internal::PrintTo(absl::string_view(value), os); +} +} // namespace internal2 +#endif + template ::std::string PrintToString(const T& value) { ::std::stringstream ss; diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h index f63fa9a1b2..1e8983938e 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -26,17 +26,21 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // // Utilities for testing Google Test itself and code that uses Google Test // (e.g. frameworks built on top of Google Test). 
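Before moving on to gtest-spi.h, here is the net effect of the printer machinery above as seen through the public PrintToString helper (values illustrative; the absl branch assumes GTEST_HAS_ABSL):

```cpp
#include <vector>
#include "gtest/gtest-printers.h"

void PrinterDemo() {
  std::vector<int> v;
  v.push_back(1);
  v.push_back(2);
  ::testing::PrintToString(v);  // containers print element-wise: "{ 1, 2 }"
#if GTEST_HAS_ABSL
  // The new UniversalPrinter specialization renders optionals as
  // "(nullopt)" or "(<value>)".
  ::testing::PrintToString(absl::optional<int>(7));  // "(7)"
#endif
}
```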
+// GOOGLETEST_CM0004 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ #define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ #include "gtest/gtest.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { // This helper class can be used to mock out Google Test failure reporting @@ -97,13 +101,12 @@ class GTEST_API_ SingleFailureChecker { public: // The constructor remembers the arguments. SingleFailureChecker(const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr); + TestPartResult::Type type, const std::string& substr); ~SingleFailureChecker(); private: const TestPartResultArray* const results_; const TestPartResult::Type type_; - const string substr_; + const std::string substr_; GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); }; @@ -112,6 +115,8 @@ class GTEST_API_ SingleFailureChecker { } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + // A set of macros for testing Google Test assertions or code that's expected // to generate Google Test fatal failures. It verifies that the given // statement will cause exactly one fatal Google Test failure with 'substr' diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h index 77eb844839..1c7b89e087 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -27,8 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Author: mheule@google.com (Markus Heule) -// +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ @@ -38,6 +37,9 @@ #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { // A copyable object representing the result of a test part (i.e. an @@ -143,7 +145,7 @@ class GTEST_API_ TestPartResultArray { }; // This interface knows how to report a test part result. -class TestPartResultReporterInterface { +class GTEST_API_ TestPartResultReporterInterface { public: virtual ~TestPartResultReporterInterface() {} @@ -176,4 +178,6 @@ class GTEST_API_ HasNewFatalFailureHelper } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h index 5f69d5678e..74bce46bdc 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -26,8 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + + +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ @@ -82,6 +83,24 @@ TYPED_TEST(FooTest, DoesBlah) { TYPED_TEST(FooTest, HasPropertyA) { ... 
} +// TYPED_TEST_CASE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. +// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_CASE(FooTest, MyTypes, MyTypeNames); + #endif // 0 // Type-parameterized tests are abstract test patterns parameterized @@ -143,6 +162,11 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); // If the type list contains only one type, you can write that type // directly without Types<...>: // INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_CASE above, +// INSTANTIATE_TEST_CASE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes, MyTypeNames); #endif // 0 @@ -159,32 +183,46 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); // given test case. # define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestCaseName) \ + gtest_type_params_##TestCaseName##_NameGenerator + // The 'Types' template argument below must have spaces around it // since some compilers may choke on '>>' when passing a template // instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types) \ - typedef ::testing::internal::TypeList< Types >::type \ - GTEST_TYPE_PARAMS_(CaseName) +# define TYPED_TEST_CASE(CaseName, Types, ...) 
\ + typedef ::testing::internal::TypeList< Types >::type GTEST_TYPE_PARAMS_( \ + CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ + GTEST_NAME_GENERATOR_(CaseName) -# define TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel< \ - GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ - GTEST_TYPE_PARAMS_(CaseName)>::Register(\ - "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - #CaseName, #TestName, 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() +# define TYPED_TEST(CaseName, TestName) \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##CaseName##_##TestName##_registered_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel, \ + GTEST_TYPE_PARAMS_( \ + CaseName)>::Register("", \ + ::testing::internal::CodeLocation( \ + __FILE__, __LINE__), \ + #CaseName, #TestName, 0, \ + ::testing::internal::GenerateNames< \ + GTEST_NAME_GENERATOR_(CaseName), \ + GTEST_TYPE_PARAMS_(CaseName)>()); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, \ + TestName)::TestBody() #endif // GTEST_HAS_TYPED_TEST @@ -241,22 +279,27 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); namespace GTEST_CASE_NAMESPACE_(CaseName) { \ typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ - __FILE__, __LINE__, #__VA_ARGS__) + static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) \ + GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames( \ + __FILE__, __LINE__, #__VA_ARGS__) // The 'Types' template argument below must have spaces around it // since some compilers may choke on '>>' when passing a template // instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ - bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase::type>::Register(\ - #Prefix, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_CASE_P_STATE_(CaseName), \ - #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) +# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types, ...) 
\ + static bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestCase< \ + CaseName, GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \ + ::testing::internal::TypeList< Types >::type>:: \ + Register(#Prefix, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_CASE_P_STATE_(CaseName), #CaseName, \ + GTEST_REGISTERED_TEST_NAMES_(CaseName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ + ::testing::internal::TypeList< Types >::type>()) #endif // GTEST_HAS_TYPED_TEST_P diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h index f846c5bd66..3b4bb1ee90 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for Google Test. It should be // included by any test program that uses Google Test. @@ -48,6 +47,8 @@ // registration from Barthelemy Dagenais' (barthelemy@prologique.com) // easyUnit framework. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_H_ @@ -65,6 +66,9 @@ #include "gtest/gtest-test-part.h" #include "gtest/gtest-typed-test.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // Depending on the platform, different string classes are available. // On Linux, in addition to ::std::string, Google also makes use of // class ::string, which has the same interface as ::std::string, but @@ -82,6 +86,15 @@ namespace testing { +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4805) +# pragma warning(disable:4100) +#endif + + // Declares the flags. // This flag temporary enables the disabled tests. @@ -103,6 +116,10 @@ GTEST_DECLARE_string_(color); // the tests to run. If the filter is not given all tests are executed. GTEST_DECLARE_string_(filter); +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + // This flag causes the Google Test to list tests. None of the tests listed // are actually run if the flag is provided. GTEST_DECLARE_bool_(list_tests); @@ -115,6 +132,9 @@ GTEST_DECLARE_string_(output); // test. GTEST_DECLARE_bool_(print_time); +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + // This flag specifies the random number seed. GTEST_DECLARE_int32_(random_seed); @@ -135,7 +155,7 @@ GTEST_DECLARE_int32_(stack_trace_depth); // When this flag is specified, a failed assertion will throw an // exception if exceptions are enabled, or exit the program with a -// non-zero code otherwise. +// non-zero code otherwise. For use with an external test framework. 
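A hedged sketch of how the flags declared in this header are driven from test code (the chosen values are illustrative; the same switches are also reachable via the corresponding --gtest_* command-line options):

```cpp
#include "gtest/gtest.h"

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  // Two of the newly declared flags:
  ::testing::GTEST_FLAG(print_utf8) = true;
  ::testing::GTEST_FLAG(install_failure_signal_handler) = false;
  return RUN_ALL_TESTS();
}
```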
GTEST_DECLARE_bool_(throw_on_failure); // When this flag is set with a "host:port" string, on supported @@ -143,6 +163,10 @@ GTEST_DECLARE_bool_(throw_on_failure); // the specified host machine. GTEST_DECLARE_string_(stream_result_to); +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + // The upper limit for valid stack trace depths. const int kMaxStackTraceDepth = 100; @@ -160,6 +184,7 @@ class TestEventListenersAccessor; class TestEventRepeater; class UnitTestRecordPropertyTestHelper; class WindowsDeathTest; +class FuchsiaDeathTest; class UnitTestImpl* GetUnitTestImpl(); void ReportFailureInUnknownLocation(TestPartResult::Type result_type, const std::string& message); @@ -259,7 +284,9 @@ class GTEST_API_ AssertionResult { // Used in EXPECT_TRUE/FALSE(assertion_result). AssertionResult(const AssertionResult& other); +#if defined(_MSC_VER) && _MSC_VER < 1910 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif // Used in the EXPECT_TRUE/FALSE(bool_expression). // @@ -276,7 +303,9 @@ class GTEST_API_ AssertionResult { /*enabler*/ = NULL) : success_(success) {} +#if defined(_MSC_VER) && _MSC_VER < 1910 GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif // Assignment operator. AssertionResult& operator=(AssertionResult other) { @@ -297,7 +326,7 @@ class GTEST_API_ AssertionResult { const char* message() const { return message_.get() != NULL ? message_->c_str() : ""; } - // TODO(vladl@google.com): Remove this after making sure no clients use it. + // FIXME: Remove this after making sure no clients use it. // Deprecated; please use message() instead. const char* failure_message() const { return message(); } @@ -345,6 +374,15 @@ GTEST_API_ AssertionResult AssertionFailure(); // Deprecated; use AssertionFailure() << msg. GTEST_API_ AssertionResult AssertionFailure(const Message& msg); +} // namespace testing + +// Includes the auto-generated header that implements a family of generic +// predicate assertion macros. This include comes late because it relies on +// APIs declared above. +#include "gtest/gtest_pred_impl.h" + +namespace testing { + // The abstract class that all tests inherit from. // // In Google Test, a unit test program contains one or many TestCases, and @@ -355,7 +393,7 @@ GTEST_API_ AssertionResult AssertionFailure(const Message& msg); // this for you. // // The only time you derive from Test is when defining a test fixture -// to be used a TEST_F. For example: +// to be used in a TEST_F. For example: // // class FooTest : public testing::Test { // protected: @@ -550,9 +588,8 @@ class GTEST_API_ TestResult { // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } - // Returns the i-th test part result among all the results. i can range - // from 0 to test_property_count() - 1. If i is not in that range, aborts - // the program. + // Returns the i-th test part result among all the results. i can range from 0 + // to total_part_count() - 1. If i is not in that range, aborts the program. const TestPartResult& GetTestPartResult(int i) const; // Returns the i-th test property. i can range from 0 to @@ -569,6 +606,7 @@ class GTEST_API_ TestResult { friend class internal::TestResultAccessor; friend class internal::UnitTestImpl; friend class internal::WindowsDeathTest; + friend class internal::FuchsiaDeathTest; // Gets the vector of TestPartResults. 
const std::vector& test_part_results() const { @@ -594,7 +632,7 @@ class GTEST_API_ TestResult { // Adds a failure if the key is a reserved attribute of Google Test // testcase tags. Returns true if the property is valid. - // TODO(russr): Validate attribute names are legal and human readable. + // FIXME: Validate attribute names are legal and human readable. static bool ValidateTestProperty(const std::string& xml_element, const TestProperty& test_property); @@ -675,6 +713,9 @@ class GTEST_API_ TestInfo { // Returns the line where this test is defined. int line() const { return location_.line; } + // Return true if this test should not be run because it's in another shard. + bool is_in_another_shard() const { return is_in_another_shard_; } + // Returns true if this test should run, that is if the test is not // disabled (or it is disabled but the also_run_disabled_tests flag has // been specified) and its full name matches the user-specified filter. @@ -695,10 +736,9 @@ class GTEST_API_ TestInfo { // Returns true iff this test will appear in the XML report. bool is_reportable() const { - // For now, the XML report includes all tests matching the filter. - // In the future, we may trim tests that are excluded because of - // sharding. - return matches_filter_; + // The XML report includes tests matching the filter, excluding those + // run in other shards. + return matches_filter_ && !is_in_another_shard_; } // Returns the result of the test. @@ -762,6 +802,7 @@ class GTEST_API_ TestInfo { bool is_disabled_; // True iff this test is disabled bool matches_filter_; // True if this test matches the // user-specified filter. + bool is_in_another_shard_; // Will be run in another shard. internal::TestFactoryBase* const factory_; // The factory that creates // the test object @@ -986,6 +1027,18 @@ class Environment { virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } }; +#if GTEST_HAS_EXCEPTIONS + +// Exception which can be thrown from TestEventListener::OnTestPartResult. +class GTEST_API_ AssertionException + : public internal::GoogleTestFailureException { + public: + explicit AssertionException(const TestPartResult& result) + : GoogleTestFailureException(result) {} +}; + +#endif // GTEST_HAS_EXCEPTIONS + // The interface for tracing execution of tests. The methods are organized in // the order the corresponding events are fired. class TestEventListener { @@ -1014,6 +1067,8 @@ class TestEventListener { virtual void OnTestStart(const TestInfo& test_info) = 0; // Fired after a failed assertion or a SUCCEED() invocation. + // If you want to throw an exception from this function to skip to the next + // TEST, it must be AssertionException defined above, or inherited from it. virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; // Fired after the test ends. @@ -1180,14 +1235,12 @@ class GTEST_API_ UnitTest { // Returns the random seed used at the start of the current test run. int random_seed() const; -#if GTEST_HAS_PARAM_TEST // Returns the ParameterizedTestCaseRegistry object used to keep track of // value-parameterized tests and instantiate and register them. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. internal::ParameterizedTestCaseRegistry& parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_); -#endif // GTEST_HAS_PARAM_TEST // Gets the number of successful test cases. 
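A sketch of the escape hatch documented above: a listener that converts fatal failures into the new AssertionException so a wrapping framework can skip to the next TEST (assumes GTEST_HAS_EXCEPTIONS; the class name is hypothetical):

```cpp
#include "gtest/gtest.h"

#if GTEST_HAS_EXCEPTIONS
class ThrowOnFatalFailure : public ::testing::EmptyTestEventListener {
  // Throwing anything other than AssertionException from here is
  // unsupported, per the OnTestPartResult contract above.
  virtual void OnTestPartResult(const ::testing::TestPartResult& result) {
    if (result.fatally_failed())
      throw ::testing::AssertionException(result);
  }
};
#endif  // GTEST_HAS_EXCEPTIONS
```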
int successful_test_case_count() const; @@ -1287,11 +1340,11 @@ class GTEST_API_ UnitTest { internal::UnitTestImpl* impl() { return impl_; } const internal::UnitTestImpl* impl() const { return impl_; } - // These classes and funcions are friends as they need to access private + // These classes and functions are friends as they need to access private // members of UnitTest. + friend class ScopedTrace; friend class Test; friend class internal::AssertHelper; - friend class internal::ScopedTrace; friend class internal::StreamingListenerTest; friend class internal::UnitTestRecordPropertyTestHelper; friend Environment* AddGlobalTestEnvironment(Environment* env); @@ -1388,11 +1441,9 @@ AssertionResult CmpHelperEQ(const char* lhs_expression, const char* rhs_expression, const T1& lhs, const T2& rhs) { -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */) if (lhs == rhs) { return AssertionSuccess(); } -GTEST_DISABLE_MSC_WARNINGS_POP_() return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } @@ -1706,7 +1757,6 @@ class GTEST_API_ AssertHelper { } // namespace internal -#if GTEST_HAS_PARAM_TEST // The pure interface class that all value-parameterized tests inherit from. // A value-parameterized class must inherit from both ::testing::Test and // ::testing::WithParamInterface. In most cases that just means inheriting @@ -1748,11 +1798,8 @@ class WithParamInterface { virtual ~WithParamInterface() {} // The current parameter value. Is also available in the test fixture's - // constructor. This member function is non-static, even though it only - // references static data, to reduce the opportunity for incorrect uses - // like writing 'WithParamInterface::GetParam()' for a test that - // uses a fixture whose parameter type is int. - const ParamType& GetParam() const { + // constructor. + static const ParamType& GetParam() { GTEST_CHECK_(parameter_ != NULL) << "GetParam() can only be called inside a value-parameterized test " << "-- did you intend to write TEST_P instead of TEST_F?"; @@ -1783,8 +1830,6 @@ template class TestWithParam : public Test, public WithParamInterface { }; -#endif // GTEST_HAS_PARAM_TEST - // Macros for indicating success/failure in test code. // ADD_FAILURE unconditionally adds a failure to the current test. @@ -1857,22 +1902,18 @@ class TestWithParam : public Test, public WithParamInterface { // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. #define EXPECT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) #define EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_FATAL_FAILURE_) #define ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) -// Includes the auto-generated header that implements a family of -// generic predicate assertion macros. -#include "gtest/gtest_pred_impl.h" - // Macros for testing equalities and inequalities. 
// // * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 @@ -1914,8 +1955,8 @@ class TestWithParam : public Test, public WithParamInterface { // // Examples: // -// EXPECT_NE(5, Foo()); -// EXPECT_EQ(NULL, a_pointer); +// EXPECT_NE(Foo(), 5); +// EXPECT_EQ(a_pointer, NULL); // ASSERT_LT(i, array_size); // ASSERT_GT(records.size(), 0) << "There is no record left."; @@ -2101,6 +2142,57 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, #define EXPECT_NO_FATAL_FAILURE(statement) \ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) +// Causes a trace (including the given source file path and line number, +// and the given message) to be included in every test failure message generated +// by code in the scope of the lifetime of an instance of this class. The effect +// is undone with the destruction of the instance. +// +// The message argument can be anything streamable to std::ostream. +// +// Example: +// testing::ScopedTrace trace("file.cc", 123, "message"); +// +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + + // Template version. Uses Message() to convert the values into strings. + // Slow, but flexible. + template + ScopedTrace(const char* file, int line, const T& message) { + PushTrace(file, line, (Message() << message).GetString()); + } + + // Optimize for some known types. + ScopedTrace(const char* file, int line, const char* message) { + PushTrace(file, line, message ? message : "(null)"); + } + +#if GTEST_HAS_GLOBAL_STRING + ScopedTrace(const char* file, int line, const ::string& message) { + PushTrace(file, line, message); + } +#endif + + ScopedTrace(const char* file, int line, const std::string& message) { + PushTrace(file, line, message); + } + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + void PushTrace(const char* file, int line, std::string message); + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + // Causes a trace (including the source file path, the current line // number, and the given message) to be included in every test failure // message generated by code in the current scope. The effect is @@ -2112,9 +2204,14 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // of the dummy variable name, thus allowing multiple SCOPED_TRACE()s // to appear in the same block - as long as they are on different // lines. +// +// Assuming that each thread maintains its own stack of traces. +// Therefore, a SCOPED_TRACE() would (correctly) only affect the +// assertions in its own thread. #define SCOPED_TRACE(message) \ - ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, ::testing::Message() << (message)) + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ + __FILE__, __LINE__, (message)) + // Compile-time assertion for type equality. // StaticAssertTypeEq() compiles iff type1 and type2 are @@ -2194,7 +2291,7 @@ bool StaticAssertTypeEq() { // name of the test within the test case. // // A test fixture class must be declared earlier. The user should put -// his test code between braces after using this macro. 
Example: +// the test code between braces after using this macro. Example: // // class FooTest : public testing::Test { // protected: @@ -2209,14 +2306,22 @@ bool StaticAssertTypeEq() { // } // // TEST_F(FooTest, ReturnsElementCountCorrectly) { -// EXPECT_EQ(0, a_.size()); -// EXPECT_EQ(1, b_.size()); +// EXPECT_EQ(a_.size(), 0); +// EXPECT_EQ(b_.size(), 1); // } #define TEST_F(test_fixture, test_name)\ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) +// Returns a path to temporary directory. +// Tries to determine an appropriate directory for the platform. +GTEST_API_ std::string TempDir(); + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + } // namespace testing // Use this function in main() to run all tests. It returns 0 if all @@ -2233,4 +2338,6 @@ inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_GTEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h index 30ae712f50..0c1105cb8e 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -27,18 +27,19 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command +// This file is AUTOMATICALLY GENERATED on 01/02/2018 by command // 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! // // Implements a family of generic predicate assertion macros. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ #define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -// Makes sure this header is not included before gtest.h. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ +#include "gtest/gtest.h" + +namespace testing { // This header implements a family of generic predicate assertion // macros: @@ -66,8 +67,6 @@ // We also define the EXPECT_* variations. // // For now we only support predicates whose arity is at most 5. -// Please email googletestframework@googlegroups.com if you need -// support for higher arities. // GTEST_ASSERT_ is the basic statement to which all of the assertions // in this file reduce. Don't use this in your code. @@ -355,4 +354,6 @@ AssertionResult AssertPred5Helper(const char* pred_text, +} // namespace testing + #endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h index da80ddc6c7..e651671ebd 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -26,10 +26,10 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// Google C++ Testing Framework definitions useful in production code. +// Google C++ Testing and Mocking Framework definitions useful in production code. 
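Pulling the gtest.h changes together: a short fixture test using the now-public ScopedTrace via SCOPED_TRACE and the new TempDir() accessor (fixture and assertion are illustrative):

```cpp
#include <string>
#include "gtest/gtest.h"

class FooTest : public ::testing::Test {};

TEST_F(FooTest, HasUsableTempDir) {
  SCOPED_TRACE("checking TempDir");  // annotates any failure below
  const std::string dir = ::testing::TempDir();
  EXPECT_FALSE(dir.empty());
}
```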
+// GOOGLETEST_CM0003 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ #define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ @@ -40,17 +40,20 @@ // // class MyClass { // private: -// void MyMethod(); -// FRIEND_TEST(MyClassTest, MyMethod); +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); // }; // // class MyClassTest : public testing::Test { // // ... // }; // -// TEST_F(MyClassTest, MyMethod) { -// // Can call MyClass::MyMethod() here. +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. // } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. #define FRIEND_TEST(test_case_name, test_name)\ friend class test_case_name##_##test_name##_Test diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md new file mode 100644 index 0000000000..ff391fb4e2 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -0,0 +1,56 @@ +# Customization Points + +The custom directory is an injection point for custom user configurations. + +## Header `gtest.h` + +### The following macros can be defined: + +* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of + `OsStackTraceGetterInterface`. +* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See + `testing::TempDir` for semantics and signature. + +## Header `gtest-port.h` + +The following macros can be defined: + +### Flag related macros: + +* `GTEST_FLAG(flag_name)` +* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its + own flagfile flag parsing. +* `GTEST_DECLARE_bool_(name)` +* `GTEST_DECLARE_int32_(name)` +* `GTEST_DECLARE_string_(name)` +* `GTEST_DEFINE_bool_(name, default_val, doc)` +* `GTEST_DEFINE_int32_(name, default_val, doc)` +* `GTEST_DEFINE_string_(name, default_val, doc)` + +### Logging: + +* `GTEST_LOG_(severity)` +* `GTEST_CHECK_(condition)` +* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too. + +### Threading: + +* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided. +* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal` + are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)` + and `GTEST_DEFINE_STATIC_MUTEX_(mutex)` +* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)` +* `GTEST_LOCK_EXCLUDED_(locks)` + +### Underlying library support features + +* `GTEST_HAS_CXXABI_H_` + +### Exporting API symbols: + +* `GTEST_API_` - Specifier for exported symbols. + +## Header `gtest-printers.h` + +* See documentation at `gtest/gtest-printers.h` for details on how to define a + custom printer. diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index 7e744bd3bb..cd85d956d2 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -27,39 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Injection point for custom user configurations. 
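As one concrete instance of the customization points the new README lists: a project-local custom/gtest.h could redirect testing::TempDir(). All names below are assumptions for illustration, not part of the patch:

```cpp
// custom/gtest.h -- project-local override (names hypothetical)
#include <string>

inline std::string MyProjectTempDir() { return "/my/project/tmp/"; }

// Consumed by testing::TempDir(), per the README's gtest.h section.
#define GTEST_CUSTOM_TEMPDIR_FUNCTION_ MyProjectTempDir
```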
-// The following macros can be defined: -// -// Flag related macros: -// GTEST_FLAG(flag_name) -// GTEST_USE_OWN_FLAGFILE_FLAG_ - Define to 0 when the system provides its -// own flagfile flag parsing. -// GTEST_DECLARE_bool_(name) -// GTEST_DECLARE_int32_(name) -// GTEST_DECLARE_string_(name) -// GTEST_DEFINE_bool_(name, default_val, doc) -// GTEST_DEFINE_int32_(name, default_val, doc) -// GTEST_DEFINE_string_(name, default_val, doc) -// -// Test filtering: -// GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that -// will be used if --GTEST_FLAG(test_filter) -// is not provided. -// -// Logging: -// GTEST_LOG_(severity) -// GTEST_CHECK_(condition) -// Functions LogToStderr() and FlushInfoLog() have to be provided too. -// -// Threading: -// GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided. -// GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are -// already provided. -// Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and -// GTEST_DEFINE_STATIC_MUTEX_(mutex) -// -// GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -// GTEST_LOCK_EXCLUDED_(locks) +// Injection point for custom user configurations. See README for details // // ** Custom implementation starts here ** diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h index 60c1ea050b..eb4467abca 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -31,8 +31,8 @@ // installation of gTest. // It will be included from gtest-printers.h and the overrides in this file // will be visible to everyone. -// See documentation at gtest/gtest-printers.h for details on how to define a -// custom printer. +// +// Injection point for custom user configurations. See README for details // // ** Custom implementation starts here ** diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h index c27412a898..4c8e07be23 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -27,11 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Injection point for custom user configurations. -// The following macros can be defined: -// -// GTEST_OS_STACK_TRACE_GETTER_ - The name of an implementation of -// OsStackTraceGetterInterface. +// Injection point for custom user configurations. See README for details // // ** Custom implementation starts here ** diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 2b3a78f5bf..0a9b42c8a5 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -27,12 +27,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines internal utilities needed for implementing // death tests. They are subject to change without notice. +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ @@ -53,6 +52,9 @@ const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; #if GTEST_HAS_DEATH_TEST +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // DeathTest is a class that hides much of the complexity of the // GTEST_DEATH_TEST_ macro. It is abstract; its static Create method // returns a concrete class that depends on the prevailing death test @@ -136,6 +138,8 @@ class GTEST_API_ DeathTest { GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); }; +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + // Factory interface for death tests. May be mocked out for testing. class DeathTestFactory { public: @@ -218,14 +222,18 @@ GTEST_API_ bool ExitedUnsuccessfully(int exit_status); // can be streamed. // This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in -// NDEBUG mode. In this case we need the statements to be executed, the regex is -// ignored, and the macro must accept a streamed message even though the message -// is never printed. -# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } else \ +// NDEBUG mode. In this case we need the statements to be executed and the macro +// must accept a streamed message even though the message is never printed. +// The regex object is not evaluated, but it is used to prevent "unused" +// warnings and to avoid an expression that doesn't compile in debug mode. +#define GTEST_EXECUTE_STATEMENT_(statement, regex) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else if (!::testing::internal::AlwaysTrue()) { \ + const ::testing::internal::RE& gtest_regex = (regex); \ + static_cast(gtest_regex); \ + } else \ ::testing::Message() // A class representing the parsed contents of the @@ -264,53 +272,6 @@ class InternalRunDeathTestFlag { // the flag is specified; otherwise returns NULL. InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); -#else // GTEST_HAS_DEATH_TEST - -// This macro is used for implementing macros such as -// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where -// death tests are not supported. Those macros must compile on such systems -// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on -// systems that support death tests. This allows one to write such a macro -// on a system that does not support death tests and be sure that it will -// compile on a death-test supporting system. -// -// Parameters: -// statement - A statement that a macro such as EXPECT_DEATH would test -// for program termination. This macro has to make sure this -// statement is compiled but not executed, to ensure that -// EXPECT_DEATH_IF_SUPPORTED compiles with a certain -// parameter iff EXPECT_DEATH compiles with it. 
-#else  // GTEST_HAS_DEATH_TEST
-
-// This macro is used for implementing macros such as
-// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
-// death tests are not supported. Those macros must compile on such systems
-// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
-// systems that support death tests. This allows one to write such a macro
-// on a system that does not support death tests and be sure that it will
-// compile on a death-test supporting system.
-//
-// Parameters:
-//   statement -  A statement that a macro such as EXPECT_DEATH would test
-//                for program termination. This macro has to make sure this
-//                statement is compiled but not executed, to ensure that
-//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
-//                parameter iff EXPECT_DEATH compiles with it.
-//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
-//                the output of statement.  This parameter has to be
-//                compiled but not evaluated by this macro, to ensure that
-//                this macro only accepts expressions that a macro such as
-//                EXPECT_DEATH would accept.
-//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
-//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
-//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
-//                compile inside functions where ASSERT_DEATH doesn't
-//                compile.
-//
-// The branch that has an always false condition is used to ensure that
-// statement and regex are compiled (and thus syntactically correct) but
-// never executed. The unreachable code macro protects the terminator
-// statement from generating an 'unreachable code' warning in case
-// statement unconditionally returns or throws. The Message constructor at
-// the end allows the syntax of streaming additional messages into the
-// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::AlwaysTrue()) { \
-      GTEST_LOG_(WARNING) \
-          << "Death tests are not supported on this platform.\n" \
-          << "Statement '" #statement "' cannot be verified."; \
-    } else if (::testing::internal::AlwaysFalse()) { \
-      ::testing::internal::RE::PartialMatch(".*", (regex)); \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-      terminator; \
-    } else \
-      ::testing::Message()
-
 #endif  // GTEST_HAS_DEATH_TEST

 }  // namespace internal
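The block removed here backed EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on platforms without death tests (in the 1.8.1 tree the equivalent fallback lives elsewhere rather than being dropped). For context, typical use of that public API, assuming a binary linked against gtest_main:

    #include <cstdlib>
    #include "gtest/gtest.h"

    static void Crash() { std::abort(); }

    // Compiles on every platform; where death tests are unsupported the
    // framework only logs "Death tests are not supported on this platform."
    // and the statement is compiled but never executed.
    TEST(DeathSketch, AbortsOnCrash) {
      EXPECT_DEATH_IF_SUPPORTED(Crash(), "");
    }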
diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
index 7a13b4b0de..ae38d95bf8 100644
--- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
+++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
@@ -27,21 +27,24 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Author: keith.ray@gmail.com (Keith Ray)
-//
 // Google Test filepath utilities
 //
 // This header file declares classes and functions used internally by
 // Google Test.  They are subject to change without notice.
 //
-// This file is #included in <gtest/internal/gtest-internal.h>.
+// This file is #included in gtest/internal/gtest-internal.h.
 // Do not include this header file separately!

+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_

 #include "gtest/internal/gtest-string.h"

+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 namespace testing {
 namespace internal {

@@ -203,4 +206,6 @@ class GTEST_API_ FilePath {
 }  // namespace internal
 }  // namespace testing

+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_

diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
index ebd1cf615d..b762f61fc5 100644
--- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
+++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
@@ -27,13 +27,13 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file declares functions and macros used internally by
 // Google Test.  They are subject to change without notice.

+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_

@@ -61,8 +61,8 @@
 #include <vector>

 #include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-string.h"
 #include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-string.h"
 #include "gtest/internal/gtest-type-util.h"

 // Due to C++ preprocessor weirdness, we need double indirection to
@@ -76,6 +76,9 @@
 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
 #define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar

+// Stringifies its argument.
+#define GTEST_STRINGIFY_(name) #name
+
 class ProtocolMessage;
 namespace proto2 { class Message; }
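GTEST_STRINGIFY_ and the double-indirection GTEST_CONCAT_TOKEN_ shown in context above are standard preprocessor idioms; a self-contained illustration of why the extra expansion step matters (macro names here are mine, not gtest's):

    #include <iostream>

    #define CONCAT_IMPL(a, b) a##b
    #define CONCAT(a, b) CONCAT_IMPL(a, b)  // extra step, as in GTEST_CONCAT_TOKEN_
    #define STRINGIFY(x) #x                 // same shape as GTEST_STRINGIFY_(name)

    // Without the indirection, CONCAT_IMPL(var_, __LINE__) pastes the literal
    // token __LINE__ (giving var___LINE__); with it, __LINE__ expands first.
    int CONCAT(var_, __LINE__) = 0;  // declares e.g. var_9

    int main() {
      std::cout << STRINGIFY(1 + 2) << "\n";  // prints "1 + 2", not "3"
      return 0;
    }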
@@ -96,7 +99,6 @@ template <typename T>
 namespace internal {

 struct TraceInfo;                      // Information about a trace point.
-class ScopedTrace;                     // Implements scoped trace.
 class TestInfoImpl;                    // Opaque implementation of TestInfo
 class UnitTestImpl;                    // Opaque implementation of UnitTest

@@ -139,6 +141,9 @@ GTEST_API_ std::string AppendUserMessage(

 #if GTEST_HAS_EXCEPTIONS

+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
+/* an exported class was derived from a class that was not exported */)
+
 // This exception is thrown by (and only by) a failed Google Test
 // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
 // are enabled).  We derive it from std::runtime_error, which is for
@@ -150,32 +155,15 @@ class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
   explicit GoogleTestFailureException(const TestPartResult& failure);
 };

+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
+
 #endif  // GTEST_HAS_EXCEPTIONS

-// A helper class for creating scoped traces in user programs.
-class GTEST_API_ ScopedTrace {
- public:
-  // The c'tor pushes the given source file location and message onto
-  // a trace stack maintained by Google Test.
-  ScopedTrace(const char* file, int line, const Message& message);
-
-  // The d'tor pops the info pushed by the c'tor.
-  //
-  // Note that the d'tor is not virtual in order to be efficient.
-  // Don't inherit from ScopedTrace!
-  ~ScopedTrace();
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
-} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
-                            // c'tor and d'tor.  Therefore it doesn't
-                            // need to be used otherwise.
-
 namespace edit_distance {
 // Returns the optimal edits to go from 'left' to 'right'.
 // All edits cost the same, with replace having lower priority than
 // add/remove.
-// Simple implementation of the Wagner–Fischer algorithm.
+// Simple implementation of the Wagner-Fischer algorithm.
 // See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
 enum EditType { kMatch, kAdd, kRemove, kReplace };
 GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
@@ -502,9 +490,10 @@ typedef void (*SetUpTestCaseFunc)();
 typedef void (*TearDownTestCaseFunc)();

 struct CodeLocation {
-  CodeLocation(const string& a_file, int a_line) : file(a_file), line(a_line) {}
+  CodeLocation(const std::string& a_file, int a_line)
+      : file(a_file), line(a_line) {}

-  string file;
+  std::string file;
   int line;
 };

@@ -544,6 +533,9 @@ GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);

 #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P

+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 // State of the definition of a type-parameterized test case.
 class GTEST_API_ TypedTestCasePState {
  public:
@@ -589,6 +581,8 @@ class GTEST_API_ TypedTestCasePState {
   RegisteredTestsMap registered_tests_;
 };

+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 // Skips to the first non-space char after the first comma in 'str';
 // returns NULL if no comma is found in 'str'.
 inline const char* SkipComma(const char* str) {
@@ -612,6 +606,37 @@ inline std::string GetPrefixUntilComma(const char* str) {
 void SplitString(const ::std::string& str, char delimiter,
                  ::std::vector< ::std::string>* dest);

+// The default argument to the template below for the case when the user does
+// not provide a name generator.
+struct DefaultNameGenerator {
+  template <typename T>
+  static std::string GetName(int i) {
+    return StreamableToString(i);
+  }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+  typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(Types0, std::vector<std::string>*, int) {}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
+  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+                                          i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+  std::vector<std::string> result;
+  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+  return result;
+}
+
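The GenerateNames machinery added above gives each type in a typed-test list a name, by default its position in the list rendered as a string. A rough runtime analogue of what DefaultNameGenerator produces (hypothetical helper, not part of gtest):

    #include <cstddef>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Mirrors DefaultNameGenerator::GetName<T>(i): the index in the Types list,
    // stringified, so instances register as Prefix/Case/0, Prefix/Case/1, ...
    std::vector<std::string> GenerateDefaultNames(std::size_t type_count) {
      std::vector<std::string> names;
      for (std::size_t i = 0; i < type_count; ++i) {
        std::ostringstream os;  // stands in for StreamableToString(i)
        os << i;
        names.push_back(os.str());
      }
      return names;
    }

    int main() {
      std::vector<std::string> names = GenerateDefaultNames(3);
      for (std::size_t i = 0; i < names.size(); ++i)
        std::cout << names[i] << "\n";  // 0, 1, 2
    }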
 // TypeParameterizedTest<Fixture, TestSel, Types>::Register()
 // registers a list of type-parameterized tests with Google Test.  The
 // return value is insignificant - we just need to return something
@@ -626,10 +651,10 @@
   // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
   // Types).  Valid values for 'index' are [0, N - 1] where N is the
   // length of Types.
-  static bool Register(const char* prefix,
-                       CodeLocation code_location,
-                       const char* case_name, const char* test_names,
-                       int index) {
+  static bool Register(const char* prefix, const CodeLocation& code_location,
+                       const char* case_name, const char* test_names, int index,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
     typedef typename Types::Head Type;
     typedef Fixture FixtureClass;
     typedef typename GTEST_BIND_(TestSel, Type) TestClass;

     // First, registers the first type-parameterized test in the type
     // list.
     MakeAndRegisterTestInfo(
-        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
-         + StreamableToString(index)).c_str(),
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+         "/" + type_names[index])
+            .c_str(),
         StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
         GetTypeName<Type>().c_str(),
         NULL,  // No value parameter.
-        code_location,
-        GetTypeId<FixtureClass>(),
-        TestClass::SetUpTestCase,
-        TestClass::TearDownTestCase,
-        new TestFactoryImpl<TestClass>);
+        code_location, GetTypeId<FixtureClass>(), TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase, new TestFactoryImpl<TestClass>);

     // Next, recurses (at compile time) with the tail of the type list.
-    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
-        ::Register(prefix, code_location, case_name, test_names, index + 1);
+    return TypeParameterizedTest<Fixture, TestSel,
+                                 typename Types::Tail>::Register(prefix,
+                                                                 code_location,
+                                                                 case_name,
+                                                                 test_names,
+                                                                 index + 1,
+                                                                 type_names);
   }
 };

@@ -658,9 +686,11 @@
 template <GTEST_TEMPLATE_ Fixture, class TestSel>
 class TypeParameterizedTest<Fixture, TestSel, Types0> {
  public:
-  static bool Register(const char* /*prefix*/, CodeLocation,
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
                        const char* /*case_name*/, const char* /*test_names*/,
-                       int /*index*/) {
+                       int /*index*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };

@@ -673,8 +703,10 @@
 template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
 class TypeParameterizedTestCase {
  public:
   static bool Register(const char* prefix, CodeLocation code_location,
-                       const TypedTestCasePState* state,
-                       const char* case_name, const char* test_names) {
+                       const TypedTestCasePState* state, const char* case_name,
+                       const char* test_names,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
     std::string test_name = StripTrailingSpaces(
         GetPrefixUntilComma(test_names));
     if (!state->TestExists(test_name)) {
@@ -691,12 +723,14 @@
     // First, register the first test in 'Test' for each type in 'Types'.
     TypeParameterizedTest<Fixture, Head, Types>::Register(
-        prefix, test_location, case_name, test_names, 0);
+        prefix, test_location, case_name, test_names, 0, type_names);

     // Next, recurses (at compile time) with the tail of the test list.
-    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
-        ::Register(prefix, code_location, state,
-                   case_name, SkipComma(test_names));
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail,
+                                     Types>::Register(prefix, code_location,
+                                                      state, case_name,
+                                                      SkipComma(test_names),
+                                                      type_names);
   }
 };

@@ -704,9 +738,11 @@
 template <GTEST_TEMPLATE_ Fixture, typename Types>
 class TypeParameterizedTestCase<Fixture, Templates0, Types> {
  public:
-  static bool Register(const char* /*prefix*/, CodeLocation,
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
                        const TypedTestCasePState* /*state*/,
-                       const char* /*case_name*/, const char* /*test_names*/) {
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
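Register() walks the type list recursively at compile time: handle the head type, then re-instantiate on the tail with index + 1, bottoming out at the empty-list specialization. A compact sketch of the same head/tail recursion using C++11 variadics (gtest itself uses its generated Types<> lists instead):

    #include <iostream>
    #include <typeinfo>

    template <typename... Ts> struct TypeList {};

    // Base case: empty list, nothing left to register.
    inline bool RegisterAll(TypeList<>, int /*index*/) { return true; }

    // Recursive case, mirroring TypeParameterizedTest<...>::Register.
    template <typename Head, typename... Tail>
    bool RegisterAll(TypeList<Head, Tail...>, int index) {
      std::cout << "register #" << index << ": " << typeid(Head).name() << "\n";
      return RegisterAll(TypeList<Tail...>(), index + 1);
    }

    int main() { return RegisterAll(TypeList<char, int, double>(), 0) ? 0 : 1; }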
@@ -823,31 +859,6 @@ struct RemoveConst {

 #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
     GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))

-// Adds reference to a type if it is not a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::add_reference, which is not widely available yet.
-template <typename T>
-struct AddReference { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddReference<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper around AddReference that works when the argument T
-// depends on template parameters.
-#define GTEST_ADD_REFERENCE_(T) \
-    typename ::testing::internal::AddReference<T>::type
-
-// Adds a reference to const on top of T as necessary.  For example,
-// it transforms
-//
-//   char         ==> const char&
-//   const char   ==> const char&
-//   char&        ==> const char&
-//   const char&  ==> const char&
-//
-// The argument T must depend on some template parameters.
-#define GTEST_REFERENCE_TO_CONST_(T) \
-    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
-
 // ImplicitlyConvertible<From, To>::value is a compile-time bool
 // constant that's true iff type From can be implicitly converted to
 // type To.
@@ -917,8 +928,11 @@ struct IsAProtocolMessage
 // a container class by checking the type of IsContainerTest<C>(0).
 // The value of the expression is insignificant.
 //
-// Note that we look for both C::iterator and C::const_iterator.  The
-// reason is that C++ injects the name of a class as a member of the
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11 we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
 // class itself (e.g. you can refer to class iterator as either
 // 'iterator' or 'iterator::iterator').  If we look for C::iterator
 // only, for example, we would mistakenly think that a class named
@@ -928,17 +942,96 @@
 // IsContainerTest(typename C::const_iterator*) and
 // IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
 typedef int IsContainer;
+#if GTEST_LANG_CXX11
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
+  return 0;
+}
+#else
 template <class C>
 IsContainer IsContainerTest(int /* dummy */,
                             typename C::iterator* /* it */ = NULL,
                             typename C::const_iterator* /* const_it */ = NULL) {
   return 0;
 }
+#endif  // GTEST_LANG_CXX11

 typedef char IsNotContainer;
 template <class C>
 IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
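The C++11 branch above turns IsContainerTest into an expression-SFINAE probe: the int overload exists only when begin()/end(), the iterator operations, and a const_iterator are all well-formed. The same trick in isolation, with my own names:

    #include <iostream>
    #include <utility>
    #include <vector>

    typedef int IsContainer;     // sizeof(int) != sizeof(char)
    typedef char IsNotContainer;

    template <class C,
              class = decltype(std::declval<const C&>().begin()),
              class = decltype(std::declval<const C&>().end()),
              class = typename C::const_iterator>
    IsContainer ContainerProbe(int);  // viable only for container-like C

    template <class C>
    IsNotContainer ContainerProbe(long);  // fallback for everything else

    int main() {
      // Both calls are unevaluated operands of sizeof; no definitions needed.
      std::cout
          << (sizeof(ContainerProbe<std::vector<int> >(0)) == sizeof(IsContainer))
          << (sizeof(ContainerProbe<int>(0)) == sizeof(IsContainer)) << "\n";  // 10
    }

Passing 0 prefers the int overload; when substitution fails for a non-container, only the long overload survives, exactly as in the patched header.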
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+  template <typename U>
+  static char test(typename U::hasher*, typename U::reverse_iterator*);
+  template <typename U>
+  static int test(typename U::hasher*, ...);
+  template <typename U>
+  static char test(...);
+
+ public:
+  static const bool value = sizeof(test<T>(0, 0)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template <typename T>
+struct VoidT {
+  typedef void value_type;
+};
+
+template <typename T, typename = void>
+struct HasValueType : false_type {};
+template <typename T>
+struct HasValueType<T, VoidT<typename T::value_type> > : true_type {};
+
+template <typename C,
+          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer),
+          bool = HasValueType<C>::value>
+struct IsRecursiveContainerImpl;
+
+template <typename C, bool HV>
+struct IsRecursiveContainerImpl<C, false, HV> : public false_type {};
+
+// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
+// obey the same inconsistencies as the IsContainerTest, namely check if
+// something is a container is relying on only const_iterator in C++11 and
+// is relying on both const_iterator and iterator otherwise
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, false> : public false_type {};
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, true> {
+#if GTEST_LANG_CXX11
+  typedef typename IteratorTraits<typename C::const_iterator>::value_type
+      value_type;
+#else
+  typedef typename IteratorTraits<typename C::iterator>::value_type value_type;
+#endif
+  typedef is_same<value_type, C> type;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
+
 // EnableIf<condition>::type is void when 'Cond' is true, and
 // undefined when 'Cond' is false.  To use SFINAE to make a function
 // overload only apply when a particular expression is true, add
@@ -1070,7 +1163,7 @@ class NativeArray {
  private:
   enum {
     kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
-        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value,
+        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value
   };

   // Initializes this object with a copy of the input.
@@ -1115,7 +1208,7 @@ class NativeArray {
 #define GTEST_SUCCESS_(message) \
   GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)

-// Suppresses MSVC warnings 4072 (unreachable code) for the code following
+// Suppress MSVC warning 4702 (unreachable code) for the code following
 // statement if it returns or throws (or doesn't return or throw in some
 // situations).
 #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
@@ -1235,4 +1328,3 @@ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
 void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()

 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-
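How the hasher/reverse_iterator heuristic added above classifies the standard containers, checked with C++11 static_assert (a standalone copy of the trait, for illustration only):

    #include <map>
    #include <set>
    #include <unordered_map>
    #include <unordered_set>

    template <typename T>
    struct IsHashTableSketch {  // same probe as IsHashTable above
     private:
      template <typename U>
      static char test(typename U::hasher*, typename U::reverse_iterator*);
      template <typename U>
      static int test(typename U::hasher*, ...);
      template <typename U>
      static char test(...);
     public:
      static const bool value = sizeof(test<T>(0, 0)) == sizeof(int);
    };

    static_assert(IsHashTableSketch<std::unordered_set<int> >::value, "hashed");
    static_assert(IsHashTableSketch<std::unordered_map<int, int> >::value, "hashed");
    static_assert(!IsHashTableSketch<std::set<int> >::value, "ordered, reversible");
    static_assert(!IsHashTableSketch<std::map<int, int> >::value, "ordered, reversible");

    int main() { return 0; }

Ordered containers expose reverse_iterator but no hasher, so only the unordered containers, where iteration order is meaningless, select the int overload.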
diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h
index 3602942217..082b87289a 100644
--- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h
+++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h
@@ -27,8 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: Dan Egnor (egnor@google.com)
-//
 // A "smart" pointer type with reference tracking.  Every pointer to a
 // particular object is kept on a circular linked list.  When the last pointer
 // to an object is destroyed or reassigned, the object is deleted.
@@ -62,9 +60,11 @@
 //  - it's safe to access a raw pointer (e.g. via get()) concurrently, and
 //  - it's safe to write to two linked_ptrs that point to the same
 //    shared object concurrently.
-// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
+// FIXME: rename this to safe_linked_ptr to avoid
 // confusion with normal linked_ptr.

+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_

diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h
index 4d1d81d20f..4fac8c0270 100644
--- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h
+++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h
@@ -30,8 +30,7 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vladl@google.com (Vlad Losev)
+

 // Type and function utilities for implementing parameterized tests.
 // This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
@@ -43,17 +42,14 @@
 // by the maximum arity of the implementation of tuple which is
 // currently set at 10.

+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_

-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
 #include "gtest/internal/gtest-param-util.h"
 #include "gtest/internal/gtest-port.h"

-#if GTEST_HAS_PARAM_TEST
-
 namespace testing {

 // Forward declarations of ValuesIn(), which is implemented in
@@ -84,6 +80,8 @@ class ValueArray1 {
     return ValuesIn(array);
   }

+  ValueArray1(const ValueArray1& other) : v1_(other.v1_) {}
+
  private:
   // No implementation - assignment is unsupported.
   void operator=(const ValueArray1& other);
@@ -102,6 +100,8 @@ class ValueArray2 {
     return ValuesIn(array);
   }

+  ValueArray2(const ValueArray2& other) : v1_(other.v1_), v2_(other.v2_) {}
+
  private:
   // No implementation - assignment is unsupported.
   void operator=(const ValueArray2& other);
@@ -122,6 +122,9 @@ class ValueArray3 {
     return ValuesIn(array);
   }

+  ValueArray3(const ValueArray3& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_) {}
+
  private:
   // No implementation - assignment is unsupported.
   void operator=(const ValueArray3& other);
@@ -144,6 +147,9 @@ class ValueArray4 {
     return ValuesIn(array);
   }

+  ValueArray4(const ValueArray4& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_) {}
+
  private:
   // No implementation - assignment is unsupported.
   void operator=(const ValueArray4& other);
@@ -167,6 +173,9 @@ class ValueArray5 {
     return ValuesIn(array);
   }

+  ValueArray5(const ValueArray5& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_) {}
+
  private:
   // No implementation - assignment is unsupported.
void operator=(const ValueArray5& other); @@ -193,6 +202,9 @@ class ValueArray6 { return ValuesIn(array); } + ValueArray6(const ValueArray6& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray6& other); @@ -220,6 +232,10 @@ class ValueArray7 { return ValuesIn(array); } + ValueArray7(const ValueArray7& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray7& other); @@ -249,6 +265,10 @@ class ValueArray8 { return ValuesIn(array); } + ValueArray8(const ValueArray8& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray8& other); @@ -280,6 +300,10 @@ class ValueArray9 { return ValuesIn(array); } + ValueArray9(const ValueArray9& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray9& other); @@ -312,6 +336,10 @@ class ValueArray10 { return ValuesIn(array); } + ValueArray10(const ValueArray10& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray10& other); @@ -346,6 +374,11 @@ class ValueArray11 { return ValuesIn(array); } + ValueArray11(const ValueArray11& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray11& other); @@ -382,6 +415,11 @@ class ValueArray12 { return ValuesIn(array); } + ValueArray12(const ValueArray12& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray12& other); @@ -420,6 +458,11 @@ class ValueArray13 { return ValuesIn(array); } + ValueArray13(const ValueArray13& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray13& other); @@ -459,6 +502,11 @@ class ValueArray14 { return ValuesIn(array); } + ValueArray14(const ValueArray14& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray14& other); @@ -500,6 +548,12 @@ class ValueArray15 { return ValuesIn(array); } + ValueArray15(const ValueArray15& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray15& other); @@ -544,6 +598,12 @@ class ValueArray16 { return ValuesIn(array); } + ValueArray16(const ValueArray16& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray16& other); @@ -589,6 +649,12 @@ class ValueArray17 { return ValuesIn(array); } + ValueArray17(const ValueArray17& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray17& other); @@ -636,6 +702,12 @@ class ValueArray18 { return ValuesIn(array); } + ValueArray18(const ValueArray18& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray18& other); @@ -684,6 +756,13 @@ class ValueArray19 { return ValuesIn(array); } + ValueArray19(const ValueArray19& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray19& other); @@ -734,6 +813,13 @@ class ValueArray20 { return ValuesIn(array); } + ValueArray20(const ValueArray20& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray20& other); @@ -787,6 +873,13 @@ class ValueArray21 { return ValuesIn(array); } + ValueArray21(const ValueArray21& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray21& other); @@ -841,6 +934,13 @@ class ValueArray22 { return ValuesIn(array); } + ValueArray22(const ValueArray22& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray22& other); @@ -897,6 +997,14 @@ class ValueArray23 { return ValuesIn(array); } + ValueArray23(const ValueArray23& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray23& other); @@ -955,6 +1063,14 @@ class ValueArray24 { return ValuesIn(array); } + ValueArray24(const ValueArray24& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray24& other); @@ -1014,6 +1130,14 @@ class ValueArray25 { return ValuesIn(array); } + ValueArray25(const ValueArray25& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray25& other); @@ -1075,6 +1199,14 @@ class ValueArray26 { return ValuesIn(array); } + ValueArray26(const ValueArray26& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray26& other); @@ -1139,6 +1271,15 @@ class ValueArray27 { return ValuesIn(array); } + ValueArray27(const ValueArray27& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray27& other); @@ -1204,6 +1345,15 @@ class ValueArray28 { return ValuesIn(array); } + ValueArray28(const ValueArray28& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray28& other); @@ -1270,6 +1420,15 @@ class ValueArray29 { return ValuesIn(array); } + ValueArray29(const ValueArray29& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray29& other); @@ -1339,6 +1498,15 @@ class ValueArray30 { return ValuesIn(array); } + ValueArray30(const ValueArray30& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray30& other); @@ -1410,6 +1578,16 @@ class ValueArray31 { return ValuesIn(array); } + ValueArray31(const ValueArray31& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray31& other); @@ -1482,6 +1660,16 @@ class ValueArray32 { return ValuesIn(array); } + ValueArray32(const ValueArray32& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray32& other); @@ -1557,6 +1745,16 @@ class ValueArray33 { return ValuesIn(array); } + ValueArray33(const ValueArray33& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray33& other); @@ -1633,6 +1831,16 @@ class ValueArray34 { return ValuesIn(array); } + ValueArray34(const ValueArray34& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray34& other); @@ -1710,6 +1918,17 @@ class ValueArray35 { return ValuesIn(array); } + ValueArray35(const ValueArray35& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray35& other); @@ -1790,6 +2009,17 @@ class ValueArray36 { return ValuesIn(array); } + ValueArray36(const ValueArray36& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray36& other); @@ -1872,6 +2102,17 @@ class ValueArray37 { return ValuesIn(array); } + ValueArray37(const ValueArray37& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray37& other); @@ -1955,6 +2196,17 @@ class ValueArray38 { return ValuesIn(array); } + ValueArray38(const ValueArray38& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray38& other); @@ -2040,6 +2292,18 @@ class ValueArray39 { return ValuesIn(array); } + ValueArray39(const ValueArray39& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray39& other); @@ -2127,6 +2391,18 @@ class ValueArray40 { return ValuesIn(array); } + ValueArray40(const ValueArray40& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray40& other); @@ -2216,6 +2492,18 @@ class ValueArray41 { return ValuesIn(array); } + ValueArray41(const ValueArray41& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray41& other); @@ -2307,6 +2595,18 @@ class ValueArray42 { return ValuesIn(array); } + ValueArray42(const ValueArray42& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray42& other); @@ -2399,6 +2699,19 @@ class ValueArray43 { return ValuesIn(array); } + ValueArray43(const ValueArray43& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray43& other); @@ -2493,6 +2806,19 @@ class ValueArray44 { return ValuesIn(array); } + ValueArray44(const ValueArray44& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray44& other); @@ -2589,6 +2915,19 @@ class ValueArray45 { return ValuesIn(array); } + ValueArray45(const ValueArray45& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray45& other); @@ -2687,6 +3026,19 @@ class ValueArray46 { return ValuesIn(array); } + ValueArray46(const ValueArray46& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray46& other); @@ -2787,6 +3139,20 @@ class ValueArray47 { return ValuesIn(array); } + ValueArray47(const ValueArray47& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray47& other); @@ -2889,6 +3255,20 @@ class ValueArray48 { return ValuesIn(array); } + ValueArray48(const ValueArray48& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_), v48_(other.v48_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray48& other); @@ -2992,6 +3372,20 @@ class ValueArray49 { return ValuesIn(array); } + ValueArray49(const ValueArray49& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_), v48_(other.v48_), v49_(other.v49_) {} + private: // No implementation - assignment is unsupported. 
   void operator=(const ValueArray49& other);
@@ -3096,6 +3490,20 @@ class ValueArray50 {
     return ValuesIn(array);
   }

+  ValueArray50(const ValueArray50& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_),
+      v47_(other.v47_), v48_(other.v48_), v49_(other.v49_), v50_(other.v50_) {}
+
  private:
   // No implementation - assignment is unsupported.
   void operator=(const ValueArray50& other);

@@ -3208,7 +3616,7 @@ class CartesianProductGenerator2
     virtual ParamIteratorInterface<ParamType>* Clone() const {
       return new Iterator(*this);
     }
-    virtual const ParamType* Current() const { return &current_value_; }
+    virtual const ParamType* Current() const { return current_value_.get(); }
     virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
       // Having the same base generator guarantees that the other
       // iterator is of the same type and we can downcast.
@@ -3240,7 +3648,7 @@
     void ComputeCurrentValue() {
       if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_);
+        current_value_.reset(new ParamType(*current1_, *current2_));
     }
     bool AtEnd() const {
       // We must report iterator past the end of the range when either of the
@@ -3262,7 +3670,7 @@
     const typename ParamGenerator<T2>::iterator begin2_;
     const typename ParamGenerator<T2>::iterator end2_;
     typename ParamGenerator<T2>::iterator current2_;
-    ParamType current_value_;
+    linked_ptr<ParamType> current_value_;
   };  // class CartesianProductGenerator2::Iterator

   // No implementation - assignment is unsupported.
@@ -3331,7 +3739,7 @@ class CartesianProductGenerator3
     virtual ParamIteratorInterface<ParamType>* Clone() const {
       return new Iterator(*this);
     }
-    virtual const ParamType* Current() const { return &current_value_; }
+    virtual const ParamType* Current() const { return current_value_.get(); }
     virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
       // Having the same base generator guarantees that the other
      // iterator is of the same type and we can downcast.
@@ -3367,7 +3775,7 @@
     void ComputeCurrentValue() {
       if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_);
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_));
     }
     bool AtEnd() const {
       // We must report iterator past the end of the range when either of the
@@ -3393,7 +3801,7 @@
     const typename ParamGenerator<T3>::iterator begin3_;
     const typename ParamGenerator<T3>::iterator end3_;
     typename ParamGenerator<T3>::iterator current3_;
-    ParamType current_value_;
+    linked_ptr<ParamType> current_value_;
   };  // class CartesianProductGenerator3::Iterator

   // No implementation - assignment is unsupported.
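Stepping back to the ValueArrayN copy constructors spelled out above (ValueArray1 through ValueArray50): each class also declares a private, unimplemented `operator=`, and with a user-declared copy assignment the implicitly generated copy constructor is deprecated in modern C++, which is the likely motivation here. A tiny reproduction of that situation, under that assumption:

    // Minimal stand-in for a ValueArrayN class.
    struct Holder {
      explicit Holder(int v1) : v1_(v1) {}
      // Explicit copy constructor, as added to each ValueArrayN by this patch;
      // without it, compilers may warn (e.g. -Wdeprecated-copy) because the
      // class has a user-declared assignment operator.
      Holder(const Holder& other) : v1_(other.v1_) {}
      int v1_;
     private:
      void operator=(const Holder&);  // unimplemented, as in ValueArrayN
    };

    int main() {
      Holder a(1);
      Holder b = a;  // copying still works; Values(...) copies these holders
      return b.v1_ - 1;
    }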
@@ -3472,7 +3880,7 @@ class CartesianProductGenerator4 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -3512,8 +3920,8 @@ class CartesianProductGenerator4 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3543,7 +3951,7 @@ class CartesianProductGenerator4 const typename ParamGenerator::iterator begin4_; const typename ParamGenerator::iterator end4_; typename ParamGenerator::iterator current4_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator4::Iterator // No implementation - assignment is unsupported. @@ -3630,7 +4038,7 @@ class CartesianProductGenerator5 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -3674,8 +4082,8 @@ class CartesianProductGenerator5 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3709,7 +4117,7 @@ class CartesianProductGenerator5 const typename ParamGenerator::iterator begin5_; const typename ParamGenerator::iterator end5_; typename ParamGenerator::iterator current5_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator5::Iterator // No implementation - assignment is unsupported. @@ -3807,7 +4215,7 @@ class CartesianProductGenerator6 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast.
@@ -3855,8 +4263,8 @@ class CartesianProductGenerator6 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3894,7 +4302,7 @@ class CartesianProductGenerator6 const typename ParamGenerator::iterator begin6_; const typename ParamGenerator::iterator end6_; typename ParamGenerator::iterator current6_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator6::Iterator // No implementation - assignment is unsupported. @@ -4001,7 +4409,7 @@ class CartesianProductGenerator7 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4053,8 +4461,8 @@ class CartesianProductGenerator7 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4096,7 +4504,7 @@ class CartesianProductGenerator7 const typename ParamGenerator::iterator begin7_; const typename ParamGenerator::iterator end7_; typename ParamGenerator::iterator current7_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator7::Iterator // No implementation - assignment is unsupported. @@ -4214,7 +4622,7 @@ class CartesianProductGenerator8 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4270,8 +4678,8 @@ class CartesianProductGenerator8 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4317,7 +4725,7 @@ class CartesianProductGenerator8 const typename ParamGenerator::iterator begin8_; const typename ParamGenerator::iterator end8_; typename ParamGenerator::iterator current8_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator8::Iterator // No implementation - assignment is unsupported.
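These generated CartesianProductGeneratorN classes are the machinery behind gtest's public Combine() API. For orientation, a conventional value-parameterized test over a two-way product looks like this (standard gtest 1.8 usage; SizeSpeedTest and its parameter values are made up for illustration):

#include "gtest/gtest.h"

class SizeSpeedTest
    : public ::testing::TestWithParam< ::testing::tuple<int, bool> > {};

TEST_P(SizeSpeedTest, HandlesEveryCombination) {
  const int size = ::testing::get<0>(GetParam());
  const bool fast = ::testing::get<1>(GetParam());
  EXPECT_GE(size, 0);
  (void)fast;  // second axis of the product
}

// Runs the test for every element of {0, 64, 128} x {false, true}.
INSTANTIATE_TEST_CASE_P(
    All, SizeSpeedTest,
    ::testing::Combine(::testing::Values(0, 64, 128), ::testing::Bool()));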
@@ -4443,7 +4851,7 @@ class CartesianProductGenerator9 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4503,9 +4911,9 @@ class CartesianProductGenerator9 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_); + *current9_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4555,7 +4963,7 @@ class CartesianProductGenerator9 const typename ParamGenerator::iterator begin9_; const typename ParamGenerator::iterator end9_; typename ParamGenerator::iterator current9_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator9::Iterator // No implementation - assignment is unsupported. @@ -4690,7 +5098,7 @@ class CartesianProductGenerator10 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4754,9 +5162,9 @@ class CartesianProductGenerator10 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_); + *current9_, *current10_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4810,7 +5218,7 @@ class CartesianProductGenerator10 const typename ParamGenerator::iterator begin10_; const typename ParamGenerator::iterator end10_; typename ParamGenerator::iterator current10_; - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator10::Iterator // No implementation - assignment is unsupported. @@ -5141,6 +5549,4 @@ CartesianProductHolder10(const Generator1& g1, const Generator2& g2, } // namespace internal } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump index 5c7c47af0b..30dffe43c3 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump @@ -29,8 +29,7 @@ $var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// -// Author: vladl@google.com (Vlad Losev) + // Type and function utilities for implementing parameterized tests. // This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -42,17 +41,14 @@ $var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. // by the maximum arity of the implementation of tuple which is // currently set at $maxtuple. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-port.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Forward declarations of ValuesIn(), which is implemented in @@ -87,6 +83,8 @@ class ValueArray$i { return ValuesIn(array); } + ValueArray$i(const ValueArray$i& other) : $for j, [[v$(j)_(other.v$(j)_)]] {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray$i& other); @@ -165,7 +163,7 @@ $for k [[ virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -197,7 +195,7 @@ $for k [[ void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType($for j, [[*current$(j)_]]); + current_value_.reset(new ParamType($for j, [[*current$(j)_]])); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -222,7 +220,7 @@ $for j [[ typename ParamGenerator::iterator current$(j)_; ]] - ParamType current_value_; + linked_ptr<ParamType> current_value_; }; // class CartesianProductGenerator$i::Iterator // No implementation - assignment is unsupported. @@ -281,6 +279,4 @@ $for j [[ } // namespace internal } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index 82cab9b020..d64f620c4c 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -26,11 +26,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) + // Type and function utilities for implementing parameterized tests. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ @@ -41,16 +42,11 @@ #include #include -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST.
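The .pump file above is the template the generated header is produced from (pump is googletest's code-generation language: $i iterates over arities and $for j expands a comma-separated list). The constructor template added here expands, for $i == 2, to exactly the shape seen in the generated ValueArray constructors earlier in this patch:

ValueArray2(const ValueArray2& other) : v1_(other.v1_), v2_(other.v2_) {}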
#include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-linked_ptr.h" #include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Input to a parameterized test name generator, describing a test parameter. @@ -472,7 +468,7 @@ class ParameterizedTestCaseInfoBase { virtual ~ParameterizedTestCaseInfoBase() {} // Base part of test case name for display purposes. - virtual const string& GetTestCaseName() const = 0; + virtual const std::string& GetTestCaseName() const = 0; // Test case id to verify identity. virtual TypeId GetTestCaseTypeId() const = 0; // UnitTest class invokes this method to register tests in this @@ -511,7 +507,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { : test_case_name_(name), code_location_(code_location) {} // Test case base name for display purposes. - virtual const string& GetTestCaseName() const { return test_case_name_; } + virtual const std::string& GetTestCaseName() const { return test_case_name_; } // Test case id to verify identity. virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } // TEST_P macro uses AddTestPattern() to record information @@ -529,11 +525,10 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { } // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information // about a generator. - int AddTestCaseInstantiation(const string& instantiation_name, + int AddTestCaseInstantiation(const std::string& instantiation_name, GeneratorCreationFunc* func, ParamNameGeneratorFunc* name_func, - const char* file, - int line) { + const char* file, int line) { instantiations_.push_back( InstantiationInfo(instantiation_name, func, name_func, file, line)); return 0; // Return value used only to run this method in namespace scope. 
@@ -550,13 +545,13 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { for (typename InstantiationContainer::iterator gen_it = instantiations_.begin(); gen_it != instantiations_.end(); ++gen_it) { - const string& instantiation_name = gen_it->name; + const std::string& instantiation_name = gen_it->name; ParamGenerator generator((*gen_it->generator)()); ParamNameGeneratorFunc* name_func = gen_it->name_func; const char* file = gen_it->file; int line = gen_it->line; - string test_case_name; + std::string test_case_name; if ( !instantiation_name.empty() ) test_case_name = instantiation_name + "/"; test_case_name += test_info->test_case_base_name; @@ -609,8 +604,8 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { test_base_name(a_test_base_name), test_meta_factory(a_test_meta_factory) {} - const string test_case_base_name; - const string test_base_name; + const std::string test_case_base_name; + const std::string test_base_name; const scoped_ptr > test_meta_factory; }; typedef ::std::vector > TestInfoContainer; @@ -651,7 +646,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { return true; } - const string test_case_name_; + const std::string test_case_name_; CodeLocation code_location_; TestInfoContainer tests_; InstantiationContainer instantiations_; @@ -726,6 +721,4 @@ class ParameterizedTestCaseRegistry { } // namespace internal } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index 74ab949057..f83700e06d 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -27,7 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the GTEST_OS_* macro. // It is separate from gtest-port.h so that custom/gtest-port.h can include it. @@ -54,6 +54,9 @@ # define GTEST_OS_WINDOWS_PHONE 1 # elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) # define GTEST_OS_WINDOWS_RT 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +# define GTEST_OS_WINDOWS_PHONE 1 +# define GTEST_OS_WINDOWS_TV_TITLE 1 # else // WINAPI_FAMILY defined but no known partition matched. // Default to desktop. 
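The gtest-port-arch.h hunk above only defines GTEST_OS_* flags; consumers test them directly with #if, relying on undefined macros evaluating to 0 in preprocessor conditionals. A small sketch of that consumption (hypothetical function; assumes gtest-port-arch.h, or gtest-port.h which includes it, is in scope):

#include <cstdio>

void PrintDetectedOs() {
#if GTEST_OS_WINDOWS_TV_TITLE
  // Per the partition logic above, TV titles are also treated as
  // phone-class targets (GTEST_OS_WINDOWS_PHONE is defined alongside).
  std::printf("Windows TV title\n");
#elif GTEST_OS_WINDOWS
  std::printf("Windows\n");
#else
  std::printf("non-Windows platform\n");
#endif
}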
@@ -69,6 +72,8 @@ # endif #elif defined __FreeBSD__ # define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +# define GTEST_OS_FUCHSIA 1 #elif defined __linux__ # define GTEST_OS_LINUX 1 # if defined __ANDROID__ @@ -84,6 +89,8 @@ # define GTEST_OS_HPUX 1 #elif defined __native_client__ # define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +# define GTEST_OS_NETBSD 1 #elif defined __OpenBSD__ # define GTEST_OS_OPENBSD 1 #elif defined __QNX__ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h index da57e65d33..786497d854 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -27,8 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: wan@google.com (Zhanyong Wan) -// // Low-level types and utilities for porting Google Test to various // platforms. All macros ending with _ and symbols defined in an // internal namespace are subject to change without notice. Code @@ -40,6 +38,8 @@ // files are expected to #include this. Therefore, it cannot #include // any other Google Test header. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ @@ -73,11 +73,9 @@ // GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions // are enabled. // GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::string, which is different to std::string). -// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::wstring, which is different to std::wstring). +// is/isn't available +// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring +// is/isn't available // GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular // expressions are/aren't available. // GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that @@ -109,6 +107,12 @@ // GTEST_CREATE_SHARED_LIBRARY // - Define to 1 when compiling Google Test itself // as a shared library. +// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. // Platform-indicating macros // -------------------------- @@ -122,12 +126,14 @@ // GTEST_OS_AIX - IBM AIX // GTEST_OS_CYGWIN - Cygwin // GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia // GTEST_OS_HPUX - HP-UX // GTEST_OS_LINUX - Linux // GTEST_OS_LINUX_ANDROID - Google Android // GTEST_OS_MAC - Mac OS X // GTEST_OS_IOS - iOS // GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD // GTEST_OS_OPENBSD - OpenBSD // GTEST_OS_QNX - QNX // GTEST_OS_SOLARIS - Sun Solaris @@ -169,15 +175,15 @@ // GTEST_HAS_COMBINE - the Combine() function (for value-parameterized // tests) // GTEST_HAS_DEATH_TEST - death tests -// GTEST_HAS_PARAM_TEST - value-parameterized tests // GTEST_HAS_TYPED_TEST - typed tests // GTEST_HAS_TYPED_TEST_P - type-parameterized tests // GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GOOGLETEST_CM0007 DO NOT DELETE // GTEST_USES_POSIX_RE - enhanced POSIX regex is used. 
Do not confuse with // GTEST_HAS_POSIX_RE (see above) which users can // define themselves. // GTEST_USES_SIMPLE_RE - our own simple regex is used; -// the above two are mutually exclusive. +// the above RE\b(s) are mutually exclusive. // GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). // Misc public macros @@ -206,6 +212,7 @@ // // C++11 feature wrappers: // +// testing::internal::forward - portability wrapper for std::forward. // testing::internal::move - portability wrapper for std::move. // // Synchronization: @@ -222,10 +229,10 @@ // // Regular expressions: // RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like -// platforms, or a reduced regular exception syntax on -// other platforms, including Windows. -// +// Extended Regular Expression syntax on UNIX-like platforms +// GOOGLETEST_CM0008 DO NOT DELETE +// or a reduced regular exception syntax on other +// platforms, including Windows. // Logging: // GTEST_LOG_() - logs messages at the specified severity level. // LogToStderr() - directs all log messages to stderr. @@ -271,10 +278,12 @@ # include #endif +// Brings in the definition of HAS_GLOBAL_STRING. This must be done +// BEFORE we test HAS_GLOBAL_STRING. +#include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT -#include // NOLINT #include #include // NOLINT @@ -306,7 +315,7 @@ // GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) // /* code that triggers warnings C4800 and C4385 */ // GTEST_DISABLE_MSC_WARNINGS_POP_() -#if _MSC_VER >= 1500 +#if _MSC_VER >= 1400 # define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ __pragma(warning(push)) \ __pragma(warning(disable: warnings)) @@ -318,12 +327,28 @@ # define GTEST_DISABLE_MSC_WARNINGS_POP_() #endif +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. +#ifdef __clang__ +# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ + _Pragma("clang diagnostic pop") +#else +# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + #ifndef GTEST_LANG_CXX11 // gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when // -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a // value for __cplusplus, and recent versions of clang, gcc, and // probably other compilers set that too in C++11 mode. -# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L +# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L || _MSC_VER >= 1900 // Compiling in at least C++11 mode. # define GTEST_LANG_CXX11 1 # else @@ -355,12 +380,16 @@ #if GTEST_STDLIB_CXX11 # define GTEST_HAS_STD_BEGIN_AND_END_ 1 # define GTEST_HAS_STD_FORWARD_LIST_ 1 -# define GTEST_HAS_STD_FUNCTION_ 1 +# if !defined(_MSC_VER) || (_MSC_FULL_VER >= 190023824) +// works only with VS2015U2 and better +# define GTEST_HAS_STD_FUNCTION_ 1 +# endif # define GTEST_HAS_STD_INITIALIZER_LIST_ 1 # define GTEST_HAS_STD_MOVE_ 1 -# define GTEST_HAS_STD_SHARED_PTR_ 1 -# define GTEST_HAS_STD_TYPE_TRAITS_ 1 # define GTEST_HAS_STD_UNIQUE_PTR_ 1 +# define GTEST_HAS_STD_SHARED_PTR_ 1 +# define GTEST_HAS_UNORDERED_MAP_ 1 +# define GTEST_HAS_UNORDERED_SET_ 1 #endif // C++11 specifies that provides std::tuple. 
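The GTEST_DISABLE_MSC_DEPRECATED_PUSH_/POP_ pair introduced in the hunk above folds MSVC's C4996 suppression and clang's -Wdeprecated-declarations/-Wdeprecated-implementations suppression behind one interface; later in this patch the pair wraps gtest's deprecated POSIX wrappers (StrNCpy and friends). The intended usage pattern is simply:

GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
// ... code that calls functions deprecated on MSVC or clang ...
GTEST_DISABLE_MSC_DEPRECATED_POP_()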
@@ -368,7 +397,8 @@ #if GTEST_LANG_CXX11 # define GTEST_HAS_STD_TUPLE_ 1 # if defined(__clang__) -// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include +// Inspired by +// https://clang.llvm.org/docs/LanguageExtensions.html#include-file-checking-macros # if defined(__has_include) && !__has_include() # undef GTEST_HAS_STD_TUPLE_ # endif @@ -380,7 +410,7 @@ # elif defined(__GLIBCXX__) // Inspired by boost/config/stdlib/libstdcpp3.hpp, // http://gcc.gnu.org/gcc-4.2/changes.html and -// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x +// https://web.archive.org/web/20140227044429/gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x # if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) # undef GTEST_HAS_STD_TUPLE_ # endif @@ -396,10 +426,16 @@ # include # endif // In order to avoid having to include , use forward declaration -// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. // This assumption is verified by // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. -struct _RTL_CRITICAL_SECTION; +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif #else // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions @@ -453,8 +489,11 @@ struct _RTL_CRITICAL_SECTION; #ifndef GTEST_HAS_EXCEPTIONS // The user didn't tell us whether exceptions are enabled, so we need // to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS +# if defined(_MSC_VER) && defined(_CPPUNWIND) +// MSVC defines _CPPUNWIND to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__BORLANDC__) +// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS // macro to enable exceptions, so we'll do the same. // Assumes that exceptions are enabled by default. # ifndef _HAS_EXCEPTIONS @@ -498,21 +537,17 @@ struct _RTL_CRITICAL_SECTION; # define GTEST_HAS_STD_STRING 1 #elif !GTEST_HAS_STD_STRING // The user told us that ::std::string isn't available. -# error "Google Test cannot be used where ::std::string isn't available." +# error "::std::string isn't available." #endif // !defined(GTEST_HAS_STD_STRING) #ifndef GTEST_HAS_GLOBAL_STRING -// The user didn't tell us whether ::string is available, so we need -// to figure it out. - # define GTEST_HAS_GLOBAL_STRING 0 - #endif // GTEST_HAS_GLOBAL_STRING #ifndef GTEST_HAS_STD_WSTRING // The user didn't tell us whether ::std::wstring is available, so we need // to figure it out. -// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring +// FIXME: uses autoconf to detect whether ::std::wstring // is available. // Cygwin 1.7 and below doesn't support ::std::wstring. @@ -600,8 +635,9 @@ struct _RTL_CRITICAL_SECTION; // // To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 // to your compiler flags. 
-# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ - || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL) +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) #endif // GTEST_HAS_PTHREAD #if GTEST_HAS_PTHREAD @@ -616,7 +652,7 @@ struct _RTL_CRITICAL_SECTION; // Determines if hash_map/hash_set are available. // Only used for testing against those containers. #if !defined(GTEST_HAS_HASH_MAP_) -# if _MSC_VER +# if defined(_MSC_VER) && (_MSC_VER < 1900) # define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available. # define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available. # endif // _MSC_VER @@ -629,6 +665,14 @@ struct _RTL_CRITICAL_SECTION; # if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) // STLport, provided with the Android NDK, has neither or . # define GTEST_HAS_TR1_TUPLE 0 +# elif defined(_MSC_VER) && (_MSC_VER >= 1910) +// Prevent `warning C4996: 'std::tr1': warning STL4002: +// The non-Standard std::tr1 namespace and TR1-only machinery +// are deprecated and will be REMOVED.` +# define GTEST_HAS_TR1_TUPLE 0 +# elif GTEST_LANG_CXX11 && defined(_LIBCPP_VERSION) +// libc++ doesn't support TR1. +# define GTEST_HAS_TR1_TUPLE 0 # else // The user didn't tell us not to do it, so we assume it's OK. # define GTEST_HAS_TR1_TUPLE 1 @@ -638,6 +682,10 @@ struct _RTL_CRITICAL_SECTION; // Determines whether Google Test's own tr1 tuple implementation // should be used. #ifndef GTEST_USE_OWN_TR1_TUPLE +// We use our own tuple implementation on Symbian. +# if GTEST_OS_SYMBIAN +# define GTEST_USE_OWN_TR1_TUPLE 1 +# else // The user didn't tell us, so we need to figure it out. // We use our own TR1 tuple if we aren't sure the user has an @@ -651,7 +699,8 @@ struct _RTL_CRITICAL_SECTION; // support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, // and it can be used with some compilers that define __GNUC__. # if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ - && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 + && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) \ + || (_MSC_VER >= 1600 && _MSC_VER < 1900) # define GTEST_ENV_HAS_TR1_TUPLE_ 1 # endif @@ -667,12 +716,11 @@ struct _RTL_CRITICAL_SECTION; # else # define GTEST_USE_OWN_TR1_TUPLE 1 # endif - +# endif // GTEST_OS_SYMBIAN #endif // GTEST_USE_OWN_TR1_TUPLE -// To avoid conditional compilation everywhere, we make it -// gtest-port.h's responsibility to #include the header implementing -// tuple. +// To avoid conditional compilation we make it gtest-port.h's responsibility +// to #include the header implementing tuple. #if GTEST_HAS_STD_TUPLE_ # include // IWYU pragma: export # define GTEST_TUPLE_NAMESPACE_ ::std @@ -687,22 +735,6 @@ struct _RTL_CRITICAL_SECTION; # if GTEST_USE_OWN_TR1_TUPLE # include "gtest/internal/gtest-tuple.h" // IWYU pragma: export // NOLINT -# elif GTEST_ENV_HAS_STD_TUPLE_ -# include -// C++11 puts its tuple into the ::std namespace rather than -// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. -// This causes undefined behavior, but supported compilers react in -// the way we intend. 
-namespace std { -namespace tr1 { -using ::std::get; -using ::std::make_tuple; -using ::std::tuple; -using ::std::tuple_element; -using ::std::tuple_size; -} -} - # elif GTEST_OS_SYMBIAN // On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to @@ -727,20 +759,22 @@ using ::std::tuple_size; // Until version 4.3.2, gcc has a bug that causes , // which is #included by , to not compile when RTTI is // disabled. _TR1_FUNCTIONAL is the header guard for -// . Hence the following #define is a hack to prevent +// . Hence the following #define is used to prevent // from being included. # define _TR1_FUNCTIONAL 1 # include # undef _TR1_FUNCTIONAL // Allows the user to #include - // if he chooses to. + // if they choose to. # else # include // NOLINT # endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -# else -// If the compiler is not GCC 4.0+, we assume the user is using a -// spec-conforming TR1 implementation. +// VS 2010 now has tr1 support. +# elif _MSC_VER >= 1600 # include // IWYU pragma: export // NOLINT + +# else // GTEST_USE_OWN_TR1_TUPLE +# include // IWYU pragma: export // NOLINT # endif // GTEST_USE_OWN_TR1_TUPLE #endif // GTEST_HAS_TR1_TUPLE @@ -754,8 +788,12 @@ using ::std::tuple_size; # if GTEST_OS_LINUX && !defined(__ia64__) # if GTEST_OS_LINUX_ANDROID -// On Android, clone() is only available on ARM starting with Gingerbread. -# if defined(__arm__) && __ANDROID_API__ >= 9 +// On Android, clone() became available at different API levels for each 32-bit +// architecture. +# if defined(__LP64__) || \ + (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) # define GTEST_HAS_CLONE 1 # else # define GTEST_HAS_CLONE 0 @@ -786,19 +824,15 @@ using ::std::tuple_size; // Google Test does not support death tests for VC 7.1 and earlier as // abort() in a VC 7.1 application compiled as GUI in debug config // pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_MAC && !GTEST_OS_IOS) || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ - GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD) + GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD || \ + GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) # define GTEST_HAS_DEATH_TEST 1 #endif -// We don't support MSVC 7.1 with exceptions disabled now. Therefore -// all the compilers we care about are adequate for supporting -// value-parameterized tests. -#define GTEST_HAS_PARAM_TEST 1 - // Determines whether to support type-driven tests. // Typed tests need and variadic macros, which GCC, VC++ 8.0, @@ -813,7 +847,7 @@ using ::std::tuple_size; // value-parameterized tests are enabled. The implementation doesn't // work on Sun Studio since it doesn't understand templated conversion // operators. 
-#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) +#if (GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_) && !defined(__SUNPRO_CC) # define GTEST_HAS_COMBINE 1 #endif @@ -864,15 +898,39 @@ using ::std::tuple_size; # define GTEST_ATTRIBUTE_UNUSED_ #endif +#if GTEST_LANG_CXX11 +# define GTEST_CXX11_EQUALS_DELETE_ = delete +#else // GTEST_LANG_CXX11 +# define GTEST_CXX11_EQUALS_DELETE_ +#endif // GTEST_LANG_CXX11 + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +# if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \ + first_to_check))) +# else +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +# endif +#else +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + + // A macro to disallow operator= // This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type)\ - void operator=(type const &) +#define GTEST_DISALLOW_ASSIGN_(type) \ + void operator=(type const &) GTEST_CXX11_EQUALS_DELETE_ // A macro to disallow copy constructor and operator= // This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ - type(type const &);\ +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ + type(type const &) GTEST_CXX11_EQUALS_DELETE_; \ GTEST_DISALLOW_ASSIGN_(type) // Tell the compiler to warn about unused return values for functions declared @@ -920,6 +978,11 @@ using ::std::tuple_size; #endif // GTEST_HAS_SEH +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + #ifdef _MSC_VER # if GTEST_LINKED_AS_SHARED_LIBRARY # define GTEST_API_ __declspec(dllimport) @@ -928,11 +991,17 @@ using ::std::tuple_size; # endif #elif __GNUC__ >= 4 || defined(__clang__) # define GTEST_API_ __attribute__((visibility ("default"))) -#endif // _MSC_VER +#endif // _MSC_VER + +#endif // GTEST_API_ #ifndef GTEST_API_ # define GTEST_API_ -#endif +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE #ifdef __GNUC__ // Ask the compiler to never inline a given function. @@ -942,10 +1011,12 @@ using ::std::tuple_size; #endif // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. -#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) -# define GTEST_HAS_CXXABI_H_ 1 -#else -# define GTEST_HAS_CXXABI_H_ 0 +#if !defined(GTEST_HAS_CXXABI_H_) +# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +# define GTEST_HAS_CXXABI_H_ 1 +# else +# define GTEST_HAS_CXXABI_H_ 0 +# endif #endif // A function level attribute to disable checking for use of uninitialized @@ -985,19 +1056,6 @@ using ::std::tuple_size; # define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ -// A function level attribute to disable UndefinedBehaviorSanitizer's (defined) -// unsigned integer overflow instrumentation. 
-#if defined(__clang__) -# if defined(__has_attribute) && __has_attribute(no_sanitize) -# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \ - __attribute__((no_sanitize("unsigned-integer-overflow"))) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ -# endif // defined(__has_attribute) && __has_attribute(no_sanitize) -#else -# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ -#endif // __clang__ - namespace testing { class Message; @@ -1101,6 +1159,16 @@ struct StaticAssertTypeEqHelper { enum { value = true }; }; +// Same as std::is_same<>. +template +struct IsSame { + enum { value = false }; +}; +template +struct IsSame { + enum { value = true }; +}; + // Evaluates to the number of elements in 'array'. #define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0])) @@ -1164,6 +1232,10 @@ class scoped_ptr { // Defines RE. +#if GTEST_USES_PCRE +// if used, PCRE is injected by custom/gtest-port.h +#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE + // A simple C++ wrapper for . It uses the POSIX Extended // Regular Expression syntax. class GTEST_API_ RE { @@ -1175,11 +1247,11 @@ class GTEST_API_ RE { // Constructs an RE from a string. RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT -#if GTEST_HAS_GLOBAL_STRING +# if GTEST_HAS_GLOBAL_STRING RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT -#endif // GTEST_HAS_GLOBAL_STRING +# endif // GTEST_HAS_GLOBAL_STRING RE(const char* regex) { Init(regex); } // NOLINT ~RE(); @@ -1192,7 +1264,7 @@ class GTEST_API_ RE { // PartialMatch(str, re) returns true iff regular expression re // matches a substring of str (including str itself). // - // TODO(wan@google.com): make FullMatch() and PartialMatch() work + // FIXME: make FullMatch() and PartialMatch() work // when str contains NUL characters. static bool FullMatch(const ::std::string& str, const RE& re) { return FullMatch(str.c_str(), re); @@ -1201,7 +1273,7 @@ class GTEST_API_ RE { return PartialMatch(str.c_str(), re); } -#if GTEST_HAS_GLOBAL_STRING +# if GTEST_HAS_GLOBAL_STRING static bool FullMatch(const ::string& str, const RE& re) { return FullMatch(str.c_str(), re); @@ -1210,7 +1282,7 @@ class GTEST_API_ RE { return PartialMatch(str.c_str(), re); } -#endif // GTEST_HAS_GLOBAL_STRING +# endif // GTEST_HAS_GLOBAL_STRING static bool FullMatch(const char* str, const RE& re); static bool PartialMatch(const char* str, const RE& re); @@ -1219,25 +1291,27 @@ class GTEST_API_ RE { void Init(const char* regex); // We use a const char* instead of an std::string, as Google Test used to be - // used where std::string is not available. TODO(wan@google.com): change to + // used where std::string is not available. FIXME: change to // std::string. const char* pattern_; bool is_valid_; -#if GTEST_USES_POSIX_RE +# if GTEST_USES_POSIX_RE regex_t full_regex_; // For FullMatch(). regex_t partial_regex_; // For PartialMatch(). -#else // GTEST_USES_SIMPLE_RE +# else // GTEST_USES_SIMPLE_RE const char* full_pattern_; // For FullMatch(); -#endif +# endif GTEST_DISALLOW_ASSIGN_(RE); }; +#endif // GTEST_USES_PCRE + // Formats a source file path and a line number as they would appear // in an error message from the compiler used to compile this code. GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); @@ -1323,13 +1397,59 @@ inline void FlushInfoLog() { fflush(NULL); } GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ << gtest_error +// Adds reference to a type if it is not a reference type, +// otherwise leaves it unchanged. 
This is the same as +// tr1::add_reference, which is not widely available yet. +template +struct AddReference { typedef T& type; }; // NOLINT +template +struct AddReference { typedef T& type; }; // NOLINT + +// A handy wrapper around AddReference that works when the argument T +// depends on template parameters. +#define GTEST_ADD_REFERENCE_(T) \ + typename ::testing::internal::AddReference::type + +// Transforms "T" into "const T&" according to standard reference collapsing +// rules (this is only needed as a backport for C++98 compilers that do not +// support reference collapsing). Specifically, it transforms: +// +// char ==> const char& +// const char ==> const char& +// char& ==> char& +// const char& ==> const char& +// +// Note that the non-const reference will not have "const" added. This is +// standard, and necessary so that "T" can always bind to "const T&". +template +struct ConstRef { typedef const T& type; }; +template +struct ConstRef { typedef T& type; }; + +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + typename ::testing::internal::ConstRef::type + #if GTEST_HAS_STD_MOVE_ +using std::forward; using std::move; + +template +struct RvalueRef { + typedef T&& type; +}; #else // GTEST_HAS_STD_MOVE_ template const T& move(const T& t) { return t; } +template +GTEST_ADD_REFERENCE_(T) forward(GTEST_ADD_REFERENCE_(T) t) { return t; } + +template +struct RvalueRef { + typedef const T& type; +}; #endif // GTEST_HAS_STD_MOVE_ // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -1430,10 +1550,6 @@ GTEST_API_ void CaptureStderr(); GTEST_API_ std::string GetCapturedStderr(); #endif // GTEST_HAS_STREAM_REDIRECTION - -// Returns a path to temporary directory. -GTEST_API_ std::string TempDir(); - // Returns the size (in bytes) of a file. GTEST_API_ size_t GetFileSize(FILE* file); @@ -1441,14 +1557,18 @@ GTEST_API_ size_t GetFileSize(FILE* file); GTEST_API_ std::string ReadEntireFile(FILE* file); // All command line arguments. -GTEST_API_ const ::std::vector& GetArgvs(); +GTEST_API_ std::vector GetArgvs(); #if GTEST_HAS_DEATH_TEST -const ::std::vector& GetInjectableArgvs(); -void SetInjectableArgvs(const ::std::vector* - new_argvs); - +std::vector GetInjectableArgvs(); +// Deprecated: pass the args vector by value instead. +void SetInjectableArgvs(const std::vector* new_argvs); +void SetInjectableArgvs(const std::vector& new_argvs); +#if GTEST_HAS_GLOBAL_STRING +void SetInjectableArgvs(const std::vector< ::string>& new_argvs); +#endif // GTEST_HAS_GLOBAL_STRING +void ClearInjectableArgvs(); #endif // GTEST_HAS_DEATH_TEST @@ -1698,7 +1818,7 @@ class GTEST_API_ Mutex { // Initializes owner_thread_id_ and critical_section_ in static mutexes. void ThreadSafeLazyInit(); - // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx, + // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503, // we assume that 0 is an invalid value for thread IDs. unsigned int owner_thread_id_; @@ -1706,7 +1826,7 @@ class GTEST_API_ Mutex { // by the linker. MutexType type_; long critical_section_init_phase_; // NOLINT - _RTL_CRITICAL_SECTION* critical_section_; + GTEST_CRITICAL_SECTION* critical_section_; GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); }; @@ -1982,8 +2102,13 @@ class MutexBase { extern ::testing::internal::MutexBase mutex // Defines and statically (i.e. at link time) initializes a static mutex. 
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, pthread_t() } +// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0} // The Mutex class can only be used for mutexes created at runtime. It // shares its API with MutexBase otherwise. @@ -2040,7 +2165,7 @@ extern "C" inline void DeleteThreadLocalValue(void* value_holder) { // Implements thread-local storage on pthreads-based systems. template -class ThreadLocal { +class GTEST_API_ ThreadLocal { public: ThreadLocal() : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} @@ -2172,7 +2297,7 @@ class GTestMutexLock { typedef GTestMutexLock MutexLock; template -class ThreadLocal { +class GTEST_API_ ThreadLocal { public: ThreadLocal() : value_() {} explicit ThreadLocal(const T& value) : value_(value) {} @@ -2191,12 +2316,13 @@ class ThreadLocal { GTEST_API_ size_t GetThreadCount(); // Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio. The Nokia Symbian +// compiler and generates a warning in Sun Studio before 12u4. The Nokia Symbian // and the IBM XL C/C++ compiler try to instantiate a copy constructor // for objects passed through ellipsis (...), failing for uncopyable // objects. We define this to ensure that only POD is passed through // ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || \ + (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5130) // We lose support for NULL detection where the compiler doesn't like // passing non-POD classes through ellipsis (...). # define GTEST_ELLIPSIS_NEEDS_POD_ 1 @@ -2222,6 +2348,13 @@ template const bool bool_constant::value; typedef bool_constant false_type; typedef bool_constant true_type; +template +struct is_same : public false_type {}; + +template +struct is_same : public true_type {}; + + template struct is_pointer : public false_type {}; @@ -2233,6 +2366,7 @@ struct IteratorTraits { typedef typename Iterator::value_type value_type; }; + template struct IteratorTraits { typedef T value_type; @@ -2364,7 +2498,7 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } // Functions deprecated by MSVC 8.0. -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */) +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() inline const char* StrNCpy(char* dest, const char* src, size_t n) { return strncpy(dest, src, n); @@ -2398,7 +2532,7 @@ inline int Close(int fd) { return close(fd); } inline const char* StrError(int errnum) { return strerror(errnum); } #endif inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE | GTEST_OS_WINDOWS_RT +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT // We are on Windows CE, which has no environment variables. static_cast(name); // To prevent 'unused argument' warning. 
return NULL; @@ -2412,7 +2546,7 @@ inline const char* GetEnv(const char* name) { #endif } -GTEST_DISABLE_MSC_WARNINGS_POP_() +GTEST_DISABLE_MSC_DEPRECATED_POP_() #if GTEST_OS_WINDOWS_MOBILE // Windows CE has no C library. The abort() function is used in @@ -2528,15 +2662,15 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. # define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) # define GTEST_DECLARE_int32_(name) \ GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) -#define GTEST_DECLARE_string_(name) \ +# define GTEST_DECLARE_string_(name) \ GTEST_API_ extern ::std::string GTEST_FLAG(name) // Macros for defining flags. -#define GTEST_DEFINE_bool_(name, default_val, doc) \ +# define GTEST_DEFINE_bool_(name, default_val, doc) \ GTEST_API_ bool GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_int32_(name, default_val, doc) \ +# define GTEST_DEFINE_int32_(name, default_val, doc) \ GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_string_(name, default_val, doc) \ +# define GTEST_DEFINE_string_(name, default_val, doc) \ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) #endif // !defined(GTEST_DECLARE_bool_) @@ -2550,7 +2684,7 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. // Parses 'str' for a 32-bit signed integer. If successful, writes the result // to *value and returns true; otherwise leaves *value unchanged and returns // false. -// TODO(chandlerc): Find a better way to refactor flag and environment parsing +// FIXME: Find a better way to refactor flag and environment parsing // out of both gtest-port.cc and gtest.cc to avoid exporting this utility // function. bool ParseInt32(const Message& src_text, const char* str, Int32* value); @@ -2559,7 +2693,8 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value); // corresponding to the given Google Test flag. bool BoolFromGTestEnv(const char* flag, bool default_val); GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); -std::string StringFromGTestEnv(const char* flag, const char* default_val); +std::string OutputFlagAlsoCheckEnvVar(); +const char* StringFromGTestEnv(const char* flag, const char* default_val); } // namespace internal } // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h index 97f1a7fdd2..4c9b6262c3 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -27,17 +27,17 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares the String class and functions used internally by // Google Test. They are subject to change without notice. They should not used // by code external to Google Test. // -// This header file is #included by . +// This header file is #included by gtest-internal.h. // It should not be #included by other files. 
+// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h index e9b405340a..78a3a6a01f 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h @@ -30,11 +30,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Implements a subset of TR1 tuple needed by Google Test and Google Mock. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ @@ -42,7 +43,7 @@ // The compiler used in Symbian has a bug that prevents us from declaring the // tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be +// bypasses the bug by declaring the members that should otherwise be // private as public. // Sun Studio versions < 12 also have the above bug. #if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump index 429ddfeeca..bb626e049f 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump @@ -29,11 +29,12 @@ $$ This meta comment fixes auto-indentation in Emacs. }} // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Implements a subset of TR1 tuple needed by Google Test and Google Mock. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ @@ -41,7 +42,7 @@ $$ This meta comment fixes auto-indentation in Emacs. }} // The compiler used in Symbian has a bug that prevents us from declaring the // tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be +// bypasses the bug by declaring the members that should otherwise be // private as public. // Sun Studio versions < 12 also have the above bug. 
#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index e46f7cfcb4..28e4112453 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -30,8 +30,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Type utilities needed for implementing typed and type-parameterized // tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -41,6 +40,8 @@ // Please contact googletestframework@googlegroups.com if you need // more. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -57,6 +58,22 @@ namespace testing { namespace internal { +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + // GetTypeName() returns a human-readable name of type T. // NB: This function is also used in Google Mock, so don't move it inside of // the typed-test-only section below. @@ -75,7 +92,7 @@ std::string GetTypeName() { char* const readable_name = __cxa_demangle(name, 0, 0, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); - return name_str; + return CanonicalizeForStdLibVersioning(name_str); # else return name; # endif // GTEST_HAS_CXXABI_H_ || __HP_aCC diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump index 251fdf025b..0001a5d39d 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump @@ -28,8 +28,7 @@ $var n = 50 $$ Maximum length of type lists we want to support. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Type utilities needed for implementing typed and type-parameterized // tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -39,6 +38,8 @@ $var n = 50 $$ Maximum length of type lists we want to support. // Please contact googletestframework@googlegroups.com if you need // more. 
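CanonicalizeForStdLibVersioning(), added to gtest-type-util.h above and mirrored in the .pump template below, strips the standard library's inline versioning namespace (libc++'s std::__1, libstdc++'s std::__cxx11) from demangled type names so typed-test names stay stable across standard libraries. A self-contained check of its behavior (the helper body is copied from the hunk, lightly adapted to qualify strlen):

#include <cassert>
#include <cstring>
#include <string>

inline std::string CanonicalizeForStdLibVersioning(std::string s) {
  static const char prefix[] = "std::__";
  if (s.compare(0, std::strlen(prefix), prefix) == 0) {
    std::string::size_type end = s.find("::", std::strlen(prefix));
    if (end != s.npos) {
      // Erase everything between the initial `std` and the second `::`.
      s.erase(std::strlen("std"), end - std::strlen("std"));
    }
  }
  return s;
}

int main() {
  assert(CanonicalizeForStdLibVersioning("std::__1::vector<int>") ==
         "std::vector<int>");
  assert(CanonicalizeForStdLibVersioning("std::__cxx11::basic_string<char>") ==
         "std::basic_string<char>");
  assert(CanonicalizeForStdLibVersioning("MyNamespace::MyType") ==
         "MyNamespace::MyType");
  return 0;
}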
+// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -55,6 +56,22 @@ $var n = 50 $$ Maximum length of type lists we want to support. namespace testing { namespace internal { +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + // GetTypeName() returns a human-readable name of type T. // NB: This function is also used in Google Mock, so don't move it inside of // the typed-test-only section below. @@ -73,7 +90,7 @@ std::string GetTypeName() { char* const readable_name = __cxa_demangle(name, 0, 0, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); - return name_str; + return CanonicalizeForStdLibVersioning(name_str); # else return name; # endif // GTEST_HAS_CXXABI_H_ || __HP_aCC diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc index 0a9cee5223..b217a18006 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: mheule@google.com (Markus Heule) -// -// Google C++ Testing Framework (Google Test) +// Google C++ Testing and Mocking Framework (Google Test) // // Sometimes it's desirable to build Google Test by compiling a single file. // This file serves this purpose. diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc index a01a369830..0908355161 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -26,8 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) + // // This file implements death tests. @@ -62,26 +61,30 @@ # include <spawn.h> # endif // GTEST_OS_QNX +# if GTEST_OS_FUCHSIA +# include <lib/fdio/io.h> +# include <lib/fdio/spawn.h> +# include <zircon/processargs.h> +# include <zircon/syscalls.h> +# include <zircon/syscalls/port.h> +# endif // GTEST_OS_FUCHSIA + #endif // GTEST_HAS_DEATH_TEST #include "gtest/gtest-message.h" #include "gtest/internal/gtest-string.h" - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick exists to -// prevent the accidental inclusion of gtest-internal-inl.h in the -// user's code.
-#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ namespace testing { // Constants. // The default death test style. -static const char kDefaultDeathTestStyle[] = "fast"; +// +// This is defined in internal/gtest-port.h as "fast", but can be overridden by +// a definition in internal/custom/gtest-port.h. The recommended value, which is +// used internally at Google, is "threadsafe". +static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; GTEST_DEFINE_string_( death_test_style, @@ -121,7 +124,7 @@ namespace internal { // Valid only for fast death tests. Indicates the code is running in the // child process of a fast style death test. -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA static bool g_in_fast_death_test_child = false; # endif @@ -131,10 +134,10 @@ static bool g_in_fast_death_test_child = false; // tests. IMPORTANT: This is an internal utility. Using it may break the // implementation of death tests. User code MUST NOT use it. bool InDeathTestChild() { -# if GTEST_OS_WINDOWS +# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA - // On Windows, death tests are thread-safe regardless of the value of the - // death_test_style flag. + // On Windows and Fuchsia, death tests are thread-safe regardless of the value + // of the death_test_style flag. return !GTEST_FLAG(internal_run_death_test).empty(); # else @@ -154,7 +157,7 @@ ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { // ExitedWithCode function-call operator. bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS +# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return exit_status == exit_code_; @@ -162,10 +165,10 @@ bool ExitedWithCode::operator()(int exit_status) const { return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; -# endif // GTEST_OS_WINDOWS +# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA } -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // KilledBySignal constructor. KilledBySignal::KilledBySignal(int signum) : signum_(signum) { } @@ -182,7 +185,7 @@ bool KilledBySignal::operator()(int exit_status) const { # endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; } -# endif // !GTEST_OS_WINDOWS +# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA namespace internal { @@ -193,7 +196,7 @@ namespace internal { static std::string ExitSummary(int exit_code) { Message m; -# if GTEST_OS_WINDOWS +# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA m << "Exited with exit status " << exit_code; @@ -209,7 +212,7 @@ static std::string ExitSummary(int exit_code) { m << " (core dumped)"; } # endif -# endif // GTEST_OS_WINDOWS +# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return m.GetString(); } @@ -220,7 +223,7 @@ bool ExitedUnsuccessfully(int exit_status) { return !ExitedWithCode(0)(exit_status); } -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Generates a textual failure message when a death test finds more than // one thread running, or cannot determine the number of threads, prior // to executing the given statement. It is the responsibility of the @@ -229,13 +232,19 @@ static std::string DeathTestThreadWarning(size_t thread_count) { Message msg; msg << "Death tests use fork(), which is unsafe particularly" << " in a threaded context. 
For this test, " << GTEST_NAME_ << " "; - if (thread_count == 0) + if (thread_count == 0) { msg << "couldn't detect the number of threads."; - else + } else { msg << "detected " << thread_count << " threads."; + } + msg << " See " + "https://github.com/google/googletest/blob/master/googletest/docs/" + "advanced.md#death-tests-and-threads" + << " for more explanation and suggested solutions, especially if" + << " this is the last message you see before your test times out."; return msg.GetString(); } -# endif // !GTEST_OS_WINDOWS +# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Flag characters for reporting a death test that did not die. static const char kDeathTestLived = 'L'; @@ -243,6 +252,13 @@ static const char kDeathTestReturned = 'R'; static const char kDeathTestThrew = 'T'; static const char kDeathTestInternalError = 'I'; +#if GTEST_OS_FUCHSIA + +// File descriptor used for the pipe in the child process. +static const int kFuchsiaReadPipeFd = 3; + +#endif + // An enumeration describing all of the possible ways that a death test can // conclude. DIED means that the process died while executing the test // code; LIVED means that process lived beyond the end of the test code; @@ -250,7 +266,7 @@ static const char kDeathTestInternalError = 'I'; // statement, which is not allowed; THREW means that the test statement // returned control by throwing an exception. IN_PROGRESS means the test // has not yet concluded. -// TODO(vladl@google.com): Unify names and possibly values for +// FIXME: Unify names and possibly values for // AbortReason, DeathTestOutcome, and flag characters above. enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; @@ -259,7 +275,7 @@ enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; // message is propagated back to the parent process. Otherwise, the // message is simply printed to stderr. In either case, the program // then exits with status 1. -void DeathTestAbort(const std::string& message) { +static void DeathTestAbort(const std::string& message) { // On a POSIX system, this function may be called from a threadsafe-style // death test child process, which operates on a very small stack. Use // the heap for any additional non-minuscule memory requirements. @@ -563,7 +579,12 @@ bool DeathTestImpl::Passed(bool status_ok) { break; case DIED: if (status_ok) { +# if GTEST_USES_PCRE + // PCRE regexes support embedded NULs. + const bool matched = RE::PartialMatch(error_message, *regex()); +# else const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); +# endif // GTEST_USES_PCRE if (matched) { success = true; } else { @@ -779,7 +800,200 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { set_spawned(true); return OVERSEE_TEST; } -# else // We are not on Windows. + +# elif GTEST_OS_FUCHSIA + +class FuchsiaDeathTest : public DeathTestImpl { + public: + FuchsiaDeathTest(const char* a_statement, + const RE* a_regex, + const char* file, + int line) + : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + virtual ~FuchsiaDeathTest() { + zx_status_t status = zx_handle_close(child_process_); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + status = zx_handle_close(port_); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + } + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. 
+ const int line_; + + zx_handle_t child_process_ = ZX_HANDLE_INVALID; + zx_handle_t port_ = ZX_HANDLE_INVALID; +}; + +// Utility class for accumulating command-line arguments. +class Arguments { + public: + Arguments() { + args_.push_back(NULL); + } + + ~Arguments() { + for (std::vector<char*>::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template <typename Str> + void AddArguments(const ::std::vector<Str>& arguments) { + for (typename ::std::vector<Str>::const_iterator i = arguments.begin(); + i != arguments.end(); + ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { + return &args_[0]; + } + + int size() { + return args_.size() - 1; + } + + private: + std::vector<char*> args_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int FuchsiaDeathTest::Wait() { + if (!spawned()) + return 0; + + // Register to wait for the child process to terminate. + zx_status_t status_zx; + status_zx = zx_object_wait_async(child_process_, + port_, + 0 /* key */, + ZX_PROCESS_TERMINATED, + ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Wait for it to terminate, or an exception to be received. + zx_port_packet_t packet; + status_zx = zx_port_wait(port_, ZX_TIME_INFINITE, &packet); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + if (ZX_PKT_IS_EXCEPTION(packet.type)) { + // Process encountered an exception. Kill it directly rather than letting + // other handlers process the event. + status_zx = zx_task_kill(child_process_); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Now wait for |child_process_| to terminate. + zx_signals_t signals = 0; + status_zx = zx_object_wait_one( + child_process_, ZX_PROCESS_TERMINATED, ZX_TIME_INFINITE, &signals); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + GTEST_DEATH_TEST_CHECK_(signals & ZX_PROCESS_TERMINATED); + } else { + // Process terminated. + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + } + + ReadAndInterpretStatusByte(); + + zx_info_process_t buffer; + status_zx = zx_object_get_info( + child_process_, + ZX_INFO_PROCESS, + &buffer, + sizeof(buffer), + nullptr, + nullptr); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + GTEST_DEATH_TEST_CHECK_(buffer.exited); + set_status(buffer.return_code); + return status(); +} + +// The AssumeRole process for a Fuchsia death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(kFuchsiaReadPipeFd); + return EXECUTE_TEST; + } + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // Build the child process command line.
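+ // Illustration (hypothetical test binary and values, not part of the
+ // patch): the child ends up re-executed with flags shaped like
+ //   some_test --gtest_filter=MyDeathTest.Foo \
+ //       --gtest_internal_run_death_test=my_test.cc|42|0
+ // so it runs only this death test, takes the EXECUTE_TEST role, and
+ // reports its status byte back over kFuchsiaReadPipeFd.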
+ const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); + Arguments args; + args.AddArguments(GetInjectableArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + // Build the pipe for communication with the child. + zx_status_t status; + zx_handle_t child_pipe_handle; + uint32_t type; + status = fdio_pipe_half(&child_pipe_handle, &type); + GTEST_DEATH_TEST_CHECK_(status >= 0); + set_read_fd(status); + + // Set the pipe handle for the child. + fdio_spawn_action_t add_handle_action = {}; + add_handle_action.action = FDIO_SPAWN_ACTION_ADD_HANDLE; + add_handle_action.h.id = PA_HND(type, kFuchsiaReadPipeFd); + add_handle_action.h.handle = child_pipe_handle; + + // Spawn the child process. + status = fdio_spawn_etc(ZX_HANDLE_INVALID, FDIO_SPAWN_CLONE_ALL, + args.Argv()[0], args.Argv(), nullptr, 1, + &add_handle_action, &child_process_, nullptr); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + // Create an exception port and attach it to the |child_process_|, to allow + // us to suppress the system default exception handler from firing. + status = zx_port_create(0, &port_); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + status = zx_task_bind_exception_port( + child_process_, port_, 0 /* key */, 0 /*options */); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + set_spawned(true); + return OVERSEE_TEST; +} + +#else // We are neither on Windows, nor on Fuchsia. // ForkingDeathTest provides implementations for most of the abstract // methods of the DeathTest interface. Only the AssumeRole method is @@ -883,11 +1097,10 @@ class ExecDeathTest : public ForkingDeathTest { ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } virtual TestRole AssumeRole(); private: - static ::std::vector<testing::internal::string> - GetArgvsForDeathTestChildProcess() { - ::std::vector<testing::internal::string> args = GetInjectableArgvs(); + static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() { + ::std::vector<std::string> args = GetInjectableArgvs(); # if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) - ::std::vector<testing::internal::string> extra_args = + ::std::vector<std::string> extra_args = GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); args.insert(args.end(), extra_args.begin(), extra_args.end()); # endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) @@ -986,6 +1199,7 @@ static int ExecDeathTestChildMain(void* child_arg) { } # endif // !GTEST_OS_QNX +# if GTEST_HAS_CLONE // Two utility routines that together determine the direction the stack // grows. // This could be accomplished more elegantly by a single recursive // function, but we want to guard against the unlikely possibility of // a smart compiler optimizing the recursion away. // // GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining // StackLowerThanAddress into StackGrowsDown, which then doesn't give // correct answer. -void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; -void StackLowerThanAddress(const void* ptr, bool* result) { +static void StackLowerThanAddress(const void* ptr, + bool* result) GTEST_NO_INLINE_; +static void StackLowerThanAddress(const void* ptr, bool* result) { int dummy; *result = (&dummy < ptr); } // Make sure AddressSanitizer does not tamper with the stack here.
GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -bool StackGrowsDown() { +static bool StackGrowsDown() { int dummy; bool result; StackLowerThanAddress(&dummy, &result); return result; } +# endif // GTEST_HAS_CLONE // Spawns a child process with the same executable as the current process in // a thread-safe manner and instructs it to run the death test. The @@ -1200,6 +1416,13 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, *test = new WindowsDeathTest(statement, regex, file, line); } +# elif GTEST_OS_FUCHSIA + + if (GTEST_FLAG(death_test_style) == "threadsafe" || + GTEST_FLAG(death_test_style) == "fast") { + *test = new FuchsiaDeathTest(statement, regex, file, line); + } + # else if (GTEST_FLAG(death_test_style) == "threadsafe") { @@ -1224,7 +1447,7 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, // Recreates the pipe and event handles from the provided parameters, // signals the event, and returns a file descriptor wrapped around the pipe // handle. This function is called in the child process only. -int GetStatusFileDescriptor(unsigned int parent_process_id, +static int GetStatusFileDescriptor(unsigned int parent_process_id, size_t write_handle_as_size_t, size_t event_handle_as_size_t) { AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, @@ -1235,7 +1458,7 @@ int GetStatusFileDescriptor(unsigned int parent_process_id, StreamableToString(parent_process_id)); } - // TODO(vladl@google.com): Replace the following check with a + // FIXME: Replace the following check with a // compile-time assertion when available. GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); @@ -1243,7 +1466,7 @@ reinterpret_cast<HANDLE>(write_handle_as_size_t); HANDLE dup_write_handle; - // The newly initialized handle is accessible only in in the parent + // The newly initialized handle is accessible only in the parent // process. To obtain one accessible within the child, we need to use // DuplicateHandle. if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, @@ -1320,6 +1543,16 @@ InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, event_handle_as_size_t); + +# elif GTEST_OS_FUCHSIA + + if (fields.size() != 3 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG(internal_run_death_test)); + } + # else if (fields.size() != 4 diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc index 0292dc1195..a7e65c082a 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -26,14 +26,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: keith.ray@gmail.com (Keith Ray) -#include "gtest/gtest-message.h" #include "gtest/internal/gtest-filepath.h" -#include "gtest/internal/gtest-port.h" #include <stdlib.h> +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-message.h" #if GTEST_OS_WINDOWS_MOBILE # include <windows.h> @@ -48,6 +46,8 @@ # include <climits> // Some Linux distributions define PATH_MAX here.
#endif // GTEST_OS_WINDOWS_MOBILE +#include "gtest/internal/gtest-string.h" + #if GTEST_OS_WINDOWS # define GTEST_PATH_MAX_ _MAX_PATH #elif defined(PATH_MAX) # define GTEST_PATH_MAX_ PATH_MAX #elif defined(_XOPEN_PATH_MAX) # define GTEST_PATH_MAX_ _XOPEN_PATH_MAX #else # define GTEST_PATH_MAX_ _POSIX_PATH_MAX #endif // GTEST_OS_WINDOWS -#include "gtest/internal/gtest-string.h" - namespace testing { namespace internal { @@ -130,7 +128,7 @@ FilePath FilePath::RemoveExtension(const char* extension) const { return *this; } -// Returns a pointer to the last occurence of a valid path separator in +// Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FilePath::FindLastPathSeparator() const { @@ -252,7 +250,7 @@ bool FilePath::DirectoryExists() const { // root directory per disk drive.) bool FilePath::IsRootDirectory() const { #if GTEST_OS_WINDOWS - // TODO(wan@google.com): on Windows a network share like + // FIXME: on Windows a network share like // \\server\share can be a root directory, although it cannot be the // current directory. Handle this properly. return pathname_.length() == 3 && IsAbsolutePath(); @@ -352,7 +350,7 @@ FilePath FilePath::RemoveTrailingPathSeparator() const { // Removes any redundant separators that might be in the pathname. // For example, "bar///foo" becomes "bar/foo". Does not eliminate other // redundancies that might be in a pathname involving "." or "..". -// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). +// FIXME: handle Windows network shares (e.g. \\server\share). void FilePath::Normalize() { if (pathname_.c_str() == NULL) { pathname_ = ""; diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h index ed8a682a96..479004149b 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h +++ b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -27,24 +27,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Utility functions and classes used by the Google C++ testing framework. -// -// Author: wan@google.com (Zhanyong Wan) -// +// Utility functions and classes used by the Google C++ testing framework.// // This file contains purely Google Test's internal implementation. Please // DO NOT #INCLUDE IT IN A USER PROGRAM. #ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ #define GTEST_SRC_GTEST_INTERNAL_INL_H_ -// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is -// part of Google Test's implementation; otherwise it's undefined. -#if !GTEST_IMPLEMENTATION_ -// If this file is included from the user's code, just say no. -# error "gtest-internal-inl.h is part of Google Test's internal implementation." -# error "It must not be included except by Google Test itself." -#endif // GTEST_IMPLEMENTATION_ - #ifndef _WIN32_WCE # include <errno.h> #endif // !_WIN32_WCE @@ -67,9 +56,12 @@ # include <windows.h> // NOLINT #endif // GTEST_OS_WINDOWS -#include "gtest/gtest.h" // NOLINT +#include "gtest/gtest.h" #include "gtest/gtest-spi.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { // Declares the flags.
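The GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251)/_POP_ pair introduced above is the usual MSVC suppress-and-restore idiom; on MSVC the macros expand to roughly the following, and to nothing on other compilers (sketch for context, not part of the patch):

#ifdef _MSC_VER
# pragma warning(push)            // save the current warning state
# pragma warning(disable : 4251)  // 'class A' needs dll-interface ...
#endif
// ... declarations that embed STL members ...
#ifdef _MSC_VER
# pragma warning(pop)             // restore the caller's warning state
#endif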
@@ -94,6 +86,7 @@ const char kFilterFlag[] = "filter"; const char kListTestsFlag[] = "list_tests"; const char kOutputFlag[] = "output"; const char kPrintTimeFlag[] = "print_time"; +const char kPrintUTF8Flag[] = "print_utf8"; const char kRandomSeedFlag[] = "random_seed"; const char kRepeatFlag[] = "repeat"; const char kShuffleFlag[] = "shuffle"; @@ -174,6 +167,7 @@ class GTestFlagSaver { list_tests_ = GTEST_FLAG(list_tests); output_ = GTEST_FLAG(output); print_time_ = GTEST_FLAG(print_time); + print_utf8_ = GTEST_FLAG(print_utf8); random_seed_ = GTEST_FLAG(random_seed); repeat_ = GTEST_FLAG(repeat); shuffle_ = GTEST_FLAG(shuffle); @@ -195,6 +189,7 @@ class GTestFlagSaver { GTEST_FLAG(list_tests) = list_tests_; GTEST_FLAG(output) = output_; GTEST_FLAG(print_time) = print_time_; + GTEST_FLAG(print_utf8) = print_utf8_; GTEST_FLAG(random_seed) = random_seed_; GTEST_FLAG(repeat) = repeat_; GTEST_FLAG(shuffle) = shuffle_; @@ -216,6 +211,7 @@ class GTestFlagSaver { bool list_tests_; std::string output_; bool print_time_; + bool print_utf8_; internal::Int32 random_seed_; internal::Int32 repeat_; bool shuffle_; @@ -426,7 +422,7 @@ class OsStackTraceGetterInterface { // in the trace. // skip_count - the number of top frames to be skipped; doesn't count // against max_depth. - virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; + virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0; // UponLeavingGTest() should be called immediately before Google Test calls // user code. It saves some information about the current stack that @@ -446,10 +442,20 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface { public: OsStackTraceGetter() {} - virtual string CurrentStackTrace(int max_depth, int skip_count); + virtual std::string CurrentStackTrace(int max_depth, int skip_count); virtual void UponLeavingGTest(); private: +#if GTEST_HAS_ABSL + Mutex mutex_; // Protects all internal state. + + // We save the stack frame below the frame that calls user code. + // We do this because the address of the frame immediately below + // the user code changes between the call to UponLeavingGTest() + // and any calls to the stack trace code from within the user code. + void* caller_frame_ = nullptr; +#endif // GTEST_HAS_ABSL + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); }; @@ -664,13 +670,11 @@ class GTEST_API_ UnitTestImpl { tear_down_tc)->AddTestInfo(test_info); } -#if GTEST_HAS_PARAM_TEST // Returns ParameterizedTestCaseRegistry object used to keep track of // value-parameterized tests and instantiate and register them. internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { return parameterized_test_registry_; } -#endif // GTEST_HAS_PARAM_TEST // Sets the TestCase object for the test that's currently running. void set_current_test_case(TestCase* a_current_test_case) { @@ -845,14 +849,12 @@ class GTEST_API_ UnitTestImpl { // shuffled order. std::vector<int> test_case_indices_; -#if GTEST_HAS_PARAM_TEST // ParameterizedTestRegistry object used to register value-parameterized // tests. internal::ParameterizedTestCaseRegistry parameterized_test_registry_; // Indicates whether RegisterParameterizedTests() has been called already. bool parameterized_tests_registered_; -#endif // GTEST_HAS_PARAM_TEST // Index of the last death test case registered. Initially -1.
int last_death_test_case_; @@ -992,7 +994,7 @@ bool ParseNaturalNumber(const ::std::string& str, Integer* number) { const bool parse_success = *end == '\0' && errno == 0; - // TODO(vladl@google.com): Convert this to compile time assertion when it is + // FIXME: Convert this to compile time assertion when it is // available. GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); @@ -1032,7 +1034,7 @@ class TestResultAccessor { #if GTEST_CAN_STREAM_RESULTS_ // Streams test results to the given port on the given host machine. -class GTEST_API_ StreamingListener : public EmptyTestEventListener { +class StreamingListener : public EmptyTestEventListener { public: // Abstract base class for writing strings to a socket. class AbstractSocketWriter { @@ -1040,21 +1042,19 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { virtual ~AbstractSocketWriter() {} // Sends a string to the socket. - virtual void Send(const string& message) = 0; + virtual void Send(const std::string& message) = 0; // Closes the socket. virtual void CloseConnection() {} // Sends a string and a newline to the socket. - void SendLn(const string& message) { - Send(message + "\n"); - } + void SendLn(const std::string& message) { Send(message + "\n"); } }; // Concrete class for actually writing strings to a socket. class SocketWriter : public AbstractSocketWriter { public: - SocketWriter(const string& host, const string& port) + SocketWriter(const std::string& host, const std::string& port) : sockfd_(-1), host_name_(host), port_num_(port) { MakeConnection(); } @@ -1065,7 +1065,7 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { } // Sends a string to the socket. - virtual void Send(const string& message) { + virtual void Send(const std::string& message) { GTEST_CHECK_(sockfd_ != -1) << "Send() can be called only when there is a connection."; @@ -1091,17 +1091,19 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { } int sockfd_; // socket file descriptor - const string host_name_; - const string port_num_; + const std::string host_name_; + const std::string port_num_; GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); }; // class SocketWriter // Escapes '=', '&', '%', and '\n' characters in str as "%xx". - static string UrlEncode(const char* str); + static std::string UrlEncode(const char* str); - StreamingListener(const string& host, const string& port) - : socket_writer_(new SocketWriter(host, port)) { Start(); } + StreamingListener(const std::string& host, const std::string& port) + : socket_writer_(new SocketWriter(host, port)) { + Start(); + } explicit StreamingListener(AbstractSocketWriter* socket_writer) : socket_writer_(socket_writer) { Start(); } @@ -1162,13 +1164,13 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { private: // Sends the given message and a newline to the socket. - void SendLn(const string& message) { socket_writer_->SendLn(message); } + void SendLn(const std::string& message) { socket_writer_->SendLn(message); } // Called at the start of streaming to notify the receiver what // protocol we are using. void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } - string FormatBool(bool value) { return value ? "1" : "0"; } + std::string FormatBool(bool value) { return value ? 
"1" : "0"; } const scoped_ptr socket_writer_; @@ -1180,4 +1182,6 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { } // namespace internal } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-port.cc b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc index e5bf3dd2be..fecb5d11c2 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-port.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -26,8 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + #include "gtest/internal/gtest-port.h" @@ -63,19 +62,16 @@ # include #endif // GTEST_OS_AIX +#if GTEST_OS_FUCHSIA +# include +# include +#endif // GTEST_OS_FUCHSIA + #include "gtest/gtest-spi.h" #include "gtest/gtest-message.h" #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick exists to -// prevent the accidental inclusion of gtest-internal-inl.h in the -// user's code. -#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ namespace testing { namespace internal { @@ -93,7 +89,7 @@ const int kStdErrFileno = STDERR_FILENO; namespace { template -T ReadProcFileField(const string& filename, int field) { +T ReadProcFileField(const std::string& filename, int field) { std::string dummy; std::ifstream file(filename.c_str()); while (field-- > 0) { @@ -107,7 +103,7 @@ T ReadProcFileField(const string& filename, int field) { // Returns the number of active threads, or 0 when there is an error. size_t GetThreadCount() { - const string filename = + const std::string filename = (Message() << "/proc/" << getpid() << "/stat").GetString(); return ReadProcFileField(filename, 19); } @@ -164,6 +160,25 @@ size_t GetThreadCount() { } } +#elif GTEST_OS_FUCHSIA + +size_t GetThreadCount() { + int dummy_buffer; + size_t avail; + zx_status_t status = zx_object_get_info( + zx_process_self(), + ZX_INFO_PROCESS_THREADS, + &dummy_buffer, + 0, + nullptr, + &avail); + if (status == ZX_OK) { + return avail; + } else { + return 0; + } +} + #else size_t GetThreadCount() { @@ -246,9 +261,9 @@ Mutex::Mutex() Mutex::~Mutex() { // Static mutexes are leaked intentionally. It is not thread-safe to try // to clean them up. - // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires + // FIXME: Switch to Slim Reader/Writer (SRW) Locks, which requires // nothing to clean it up but is available only on Vista and later. - // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx + // https://docs.microsoft.com/en-us/windows/desktop/Sync/slim-reader-writer--srw--locks if (type_ == kDynamic) { ::DeleteCriticalSection(critical_section_); delete critical_section_; @@ -279,6 +294,43 @@ void Mutex::AssertHeld() { << "The current thread is not holding the mutex @" << this; } +namespace { + +// Use the RAII idiom to flag mem allocs that are intentionally never +// deallocated. 
The motivation is to silence the false positive mem leaks +// that are reported by the debug version of MS's CRT which can only detect +// if an alloc is missing a matching deallocation. +// Example: +// MemoryIsNotDeallocated memory_is_not_deallocated; +// critical_section_ = new CRITICAL_SECTION; +// +class MemoryIsNotDeallocated +{ + public: + MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { +#ifdef _MSC_VER + old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); + // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT + // doesn't report mem leak if there's no matching deallocation. + _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); +#endif // _MSC_VER + } + + ~MemoryIsNotDeallocated() { +#ifdef _MSC_VER + // Restore the original _CRTDBG_ALLOC_MEM_DF flag + _CrtSetDbgFlag(old_crtdbg_flag_); +#endif // _MSC_VER + } + + private: + int old_crtdbg_flag_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); +}; + +} // namespace + // Initializes owner_thread_id_ and critical_section_ in static mutexes. void Mutex::ThreadSafeLazyInit() { // Dynamic mutexes are initialized in the constructor. @@ -289,7 +341,11 @@ void Mutex::ThreadSafeLazyInit() { // If critical_section_init_phase_ was 0 before the exchange, we // are the first to test it and need to perform the initialization. owner_thread_id_ = 0; - critical_section_ = new CRITICAL_SECTION; + { + // Use RAII to flag that following mem alloc is never deallocated. + MemoryIsNotDeallocated memory_is_not_deallocated; + critical_section_ = new CRITICAL_SECTION; + } ::InitializeCriticalSection(critical_section_); // Updates the critical_section_init_phase_ to 2 to signal // initialization complete. @@ -328,7 +384,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase { Notification* thread_can_start) { ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); DWORD thread_id; - // TODO(yukawa): Consider to use _beginthreadex instead. + // FIXME: Consider to use _beginthreadex instead. HANDLE thread_handle = ::CreateThread( NULL, // Default security. 0, // Default stack size. @@ -496,7 +552,7 @@ class ThreadLocalRegistryImpl { FALSE, thread_id); GTEST_CHECK_(thread != NULL); - // We need to to pass a valid thread ID pointer into CreateThread for it + // We need to pass a valid thread ID pointer into CreateThread for it // to work correctly under Win98. DWORD watcher_thread_id; HANDLE watcher_thread = ::CreateThread( @@ -531,7 +587,8 @@ class ThreadLocalRegistryImpl { // Returns map of thread local instances. static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { mutex_.AssertHeld(); - static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals; + MemoryIsNotDeallocated memory_is_not_deallocated; + static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals(); return map; } @@ -671,7 +728,7 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { } // Helper function used by ValidateRegex() to format error messages. -std::string FormatRegexSyntaxError(const char* regex, int index) { +static std::string FormatRegexSyntaxError(const char* regex, int index) { return (Message() << "Syntax error at index " << index << " in simple regular expression \"" << regex << "\": ").GetString(); } @@ -680,7 +737,7 @@ std::string FormatRegexSyntaxError(const char* regex, int index) { // otherwise returns true. 
bool ValidateRegex(const char* regex) { if (regex == NULL) { - // TODO(wan@google.com): fix the source file location in the + // FIXME: fix the source file location in the // assertion failures to match where the regex is used in user // code. ADD_FAILURE() << "NULL is not a valid simple regular expression."; @@ -923,9 +980,10 @@ GTestLog::~GTestLog() { posix::Abort(); } } + // Disable Microsoft deprecation warnings for POSIX functions called from // this class (creat, dup, dup2, and close) -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() #if GTEST_HAS_STREAM_REDIRECTION @@ -1009,13 +1067,14 @@ class CapturedStream { GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); }; -GTEST_DISABLE_MSC_WARNINGS_POP_() +GTEST_DISABLE_MSC_DEPRECATED_POP_() static CapturedStream* g_captured_stderr = NULL; static CapturedStream* g_captured_stdout = NULL; // Starts capturing an output stream (stdout/stderr). -void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { +static void CaptureStream(int fd, const char* stream_name, + CapturedStream** stream) { if (*stream != NULL) { GTEST_LOG_(FATAL) << "Only one " << stream_name << " capturer can exist at a time."; @@ -1024,7 +1083,7 @@ void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { } // Stops capturing the output stream and returns the captured string. -std::string GetCapturedStream(CapturedStream** captured_stream) { +static std::string GetCapturedStream(CapturedStream** captured_stream) { const std::string content = (*captured_stream)->GetCapturedString(); delete *captured_stream; @@ -1055,23 +1114,9 @@ std::string GetCapturedStderr() { #endif // GTEST_HAS_STREAM_REDIRECTION -std::string TempDir() { -#if GTEST_OS_WINDOWS_MOBILE - return "\\temp\\"; -#elif GTEST_OS_WINDOWS - const char* temp_dir = posix::GetEnv("TEMP"); - if (temp_dir == NULL || temp_dir[0] == '\0') - return "\\temp\\"; - else if (temp_dir[strlen(temp_dir) - 1] == '\\') - return temp_dir; - else - return std::string(temp_dir) + "\\"; -#elif GTEST_OS_LINUX_ANDROID - return "/sdcard/"; -#else - return "/tmp/"; -#endif // GTEST_OS_WINDOWS_MOBILE -} + + + size_t GetFileSize(FILE* file) { fseek(file, 0, SEEK_END); @@ -1101,22 +1146,36 @@ std::string ReadEntireFile(FILE* file) { } #if GTEST_HAS_DEATH_TEST +static const std::vector<std::string>* g_injected_test_argvs = NULL; // Owned. -static const ::std::vector<testing::internal::string>* g_injected_test_argvs = - NULL; // Owned.
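The ownership contract of the reworked argv-injection API is easiest to see at a call site (hypothetical sketch, not part of the patch):

// The vector overload makes a heap copy that gtest owns from then on.
testing::internal::SetInjectableArgvs(
    std::vector<std::string>{"my_test", "--gtest_color=no"});
// GetInjectableArgvs() now returns that copy by value; after
// ClearInjectableArgvs() it falls back to the real GetArgvs() again.
testing::internal::ClearInjectableArgvs();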
- -void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) { - if (g_injected_test_argvs != argvs) - delete g_injected_test_argvs; - g_injected_test_argvs = argvs; -} - -const ::std::vector<testing::internal::string>& GetInjectableArgvs() { +std::vector<std::string> GetInjectableArgvs() { if (g_injected_test_argvs != NULL) { return *g_injected_test_argvs; } return GetArgvs(); } + +void SetInjectableArgvs(const std::vector<std::string>* new_argvs) { + if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs; + g_injected_test_argvs = new_argvs; +} + +void SetInjectableArgvs(const std::vector<std::string>& new_argvs) { + SetInjectableArgvs( + new std::vector<std::string>(new_argvs.begin(), new_argvs.end())); +} + +#if GTEST_HAS_GLOBAL_STRING +void SetInjectableArgvs(const std::vector< ::string>& new_argvs) { + SetInjectableArgvs( + new std::vector<std::string>(new_argvs.begin(), new_argvs.end())); +} +#endif // GTEST_HAS_GLOBAL_STRING + +void ClearInjectableArgvs() { + delete g_injected_test_argvs; + g_injected_test_argvs = NULL; +} #endif // GTEST_HAS_DEATH_TEST #if GTEST_OS_WINDOWS_MOBILE @@ -1191,11 +1250,12 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value) { bool BoolFromGTestEnv(const char* flag, bool default_value) { #if defined(GTEST_GET_BOOL_FROM_ENV_) return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); -#endif // defined(GTEST_GET_BOOL_FROM_ENV_) +#else const std::string env_var = FlagToEnvVar(flag); const char* const string_value = posix::GetEnv(env_var.c_str()); return string_value == NULL ? default_value : strcmp(string_value, "0") != 0; +#endif // defined(GTEST_GET_BOOL_FROM_ENV_) } // Reads and returns a 32-bit integer stored in the environment @@ -1204,7 +1264,7 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { #if defined(GTEST_GET_INT32_FROM_ENV_) return GTEST_GET_INT32_FROM_ENV_(flag, default_value); -#endif // defined(GTEST_GET_INT32_FROM_ENV_) +#else const std::string env_var = FlagToEnvVar(flag); const char* const string_value = posix::GetEnv(env_var.c_str()); if (string_value == NULL) { @@ -1222,37 +1282,36 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { } return result; +#endif // defined(GTEST_GET_INT32_FROM_ENV_) +} + +// As a special case for the 'output' flag, if GTEST_OUTPUT is not +// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build +// system. The value of XML_OUTPUT_FILE is a filename without the +// "xml:" prefix of GTEST_OUTPUT. +// Note that this is meant to be called at the call site so it does +// not check that the flag is 'output' +// In essence this checks an env variable called XML_OUTPUT_FILE +// and if it is set we prepend "xml:" to its value, if it not set we return "" +std::string OutputFlagAlsoCheckEnvVar(){ + std::string default_value_for_output_flag = ""; + const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); + if (NULL != xml_output_file_env) { + default_value_for_output_flag = std::string("xml:") + xml_output_file_env; + } + return default_value_for_output_flag; } // Reads and returns the string environment variable corresponding to // the given flag; if it's not set, returns default_value.
-std::string StringFromGTestEnv(const char* flag, const char* default_value) { +const char* StringFromGTestEnv(const char* flag, const char* default_value) { #if defined(GTEST_GET_STRING_FROM_ENV_) return GTEST_GET_STRING_FROM_ENV_(flag, default_value); -#endif // defined(GTEST_GET_STRING_FROM_ENV_) +#else const std::string env_var = FlagToEnvVar(flag); - const char* value = posix::GetEnv(env_var.c_str()); - if (value != NULL) { - return value; - } - - // As a special case for the 'output' flag, if GTEST_OUTPUT is not - // set, we look for XML_OUTPUT_FILE, which is set by the Bazel build - // system. The value of XML_OUTPUT_FILE is a filename without the - // "xml:" prefix of GTEST_OUTPUT. - // - // The net priority order after flag processing is thus: - // --gtest_output command line flag - // GTEST_OUTPUT environment variable - // XML_OUTPUT_FILE environment variable - // 'default_value' - if (strcmp(flag, "output") == 0) { - value = posix::GetEnv("XML_OUTPUT_FILE"); - if (value != NULL) { - return std::string("xml:") + value; - } - } - return default_value; + const char* const value = posix::GetEnv(env_var.c_str()); + return value == NULL ? default_value : value; +#endif // defined(GTEST_GET_STRING_FROM_ENV_) } } // namespace internal diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc index a2df412f8a..b5022549f9 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// Google Test - The Google C++ Testing Framework + +// Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a // value of any type T: @@ -43,12 +42,13 @@ // defines Foo. #include "gtest/gtest-printers.h" -#include <ctype.h> #include <stdio.h> +#include <cctype> #include <cwchar> #include <ostream> // NOLINT #include <string> #include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" namespace testing { @@ -89,7 +89,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, // If the object size is bigger than kThreshold, we'll have to omit // some details by printing only the first and the last kChunkSize // bytes. - // TODO(wan): let the user control the threshold using a flag. + // FIXME: let the user control the threshold using a flag. if (count < kThreshold) { PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); } else { @@ -123,7 +123,7 @@ namespace internal { // Depending on the value of a char (or wchar_t), we print it in one // of three formats: // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or +// - as a hexadecimal escape sequence (e.g. '\x7F'), or // - as a special escape sequence (e.g. '\r', '\n').
enum CharFormat { kAsIs, kHexEscape, kSpecialEscape }; @@ -180,7 +183,10 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { *os << static_cast<char>(c); return kAsIs; } else { - *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c)); + ostream::fmtflags flags = os->flags(); + *os << "\\x" << std::hex << std::uppercase + << static_cast<int>(static_cast<UnsignedChar>(c)); + os->flags(flags); return kHexEscape; } } @@ -227,7 +230,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { return; *os << " (" << static_cast<int>(c); - // For more convenience, we print c's code again in hexidecimal, + // For more convenience, we print c's code again in hexadecimal, // unless c was already printed in the form '\x##' or the code is in // [1, 9]. if (format == kHexEscape || (1 <= c && c <= 9)) { @@ -259,11 +262,12 @@ template <typename CharType> GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static void PrintCharsAsStringTo( +static CharFormat PrintCharsAsStringTo( const CharType* begin, size_t len, ostream* os) { const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; *os << kQuoteBegin; bool is_previous_hex = false; + CharFormat print_format = kAsIs; for (size_t index = 0; index < len; ++index) { const CharType cur = begin[index]; if (is_previous_hex && IsXDigit(cur)) { @@ -273,8 +277,13 @@ static void PrintCharsAsStringTo( *os << "\" " << kQuoteBegin; } is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + // Remember if any characters required hex escaping. + if (is_previous_hex) { + print_format = kHexEscape; + } } *os << "\""; + return print_format; } // Prints a (const) char/wchar_t array of 'len' elements, starting at address @@ -339,20 +348,95 @@ void PrintTo(const wchar_t* s, ostream* os) { *os << "NULL"; } else { *os << ImplicitCast_<const void*>(s) << " pointing to "; - PrintCharsAsStringTo(s, std::wcslen(s), os); + PrintCharsAsStringTo(s, wcslen(s), os); } } #endif // wchar_t is native +namespace { + +bool ContainsUnprintableControlCodes(const char* str, size_t length) { + const unsigned char *s = reinterpret_cast<const unsigned char*>(str); + + for (size_t i = 0; i < length; i++) { + unsigned char ch = *s++; + if (std::iscntrl(ch)) { + switch (ch) { + case '\t': + case '\n': + case '\r': + break; + default: + return true; + } + } + } + return false; +} + +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; } + +bool IsValidUTF8(const char* str, size_t length) { + const unsigned char *s = reinterpret_cast<const unsigned char*>(str); + + for (size_t i = 0; i < length;) { + unsigned char lead = s[i++]; + + if (lead <= 0x7f) { + continue; // single-byte character (ASCII) 0..7F + } + if (lead < 0xc2) { + return false; // trail byte or non-shortest form + } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { + ++i; // 2-byte character + } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && + IsUTF8TrailByte(s[i]) && + IsUTF8TrailByte(s[i + 1]) && + // check for non-shortest form and surrogate + (lead != 0xe0 || s[i] >= 0xa0) && + (lead != 0xed || s[i] < 0xa0)) { + i += 2; // 3-byte character + } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && + IsUTF8TrailByte(s[i]) && + IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i + 2]) && + // check for non-shortest form + (lead != 0xf0 || s[i] >= 0x90) && + (lead != 0xf4 || s[i] < 0x90)) { + i += 3; // 4-byte character + } else { + return false; + } + } + return true; +} + +void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { + if (!ContainsUnprintableControlCodes(str, length) && + IsValidUTF8(str, length)) {
+ *os << "\n As Text: \"" << str << "\""; + } +} + +} // anonymous namespace + // Prints a ::string object. #if GTEST_HAS_GLOBAL_STRING void PrintStringTo(const ::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } } #endif // GTEST_HAS_GLOBAL_STRING void PrintStringTo(const ::std::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } } // Prints a ::wstring object. diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc index fb0e35425e..c88860d923 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -26,21 +26,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: mheule@google.com (Markus Heule) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest-test-part.h" - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick exists to -// prevent the accidental inclusion of gtest-internal-inl.h in the -// user's code. -#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ namespace testing { diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc index df1eef4754..1dc2ad38ba 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -26,10 +26,10 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + #include "gtest/gtest-typed-test.h" + #include "gtest/gtest.h" namespace testing { diff --git a/libs/libvpx/third_party/googletest/src/src/gtest.cc b/libs/libvpx/third_party/googletest/src/src/gtest.cc index 5a8932c73e..96b07c68ab 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest.h" #include "gtest/internal/custom/gtest.h" @@ -55,7 +54,7 @@ #if GTEST_OS_LINUX -// TODO(kenton@google.com): Use autoconf to detect availability of +// FIXME: Use autoconf to detect availability of // gettimeofday(). 
# define GTEST_HAS_GETTIMEOFDAY_ 1 @@ -94,9 +93,9 @@ # if GTEST_OS_WINDOWS_MINGW // MinGW has gettimeofday() but not _ftime64(). -// TODO(kenton@google.com): Use autoconf to detect availability of +// FIXME: Use autoconf to detect availability of // gettimeofday(). -// TODO(kenton@google.com): There are other ways to get the time on +// FIXME: There are other ways to get the time on // Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW // supports these. consider using them instead. # define GTEST_HAS_GETTIMEOFDAY_ 1 @@ -111,7 +110,7 @@ #else // Assume other platforms have gettimeofday(). -// TODO(kenton@google.com): Use autoconf to detect availability of +// FIXME: Use autoconf to detect availability of // gettimeofday(). # define GTEST_HAS_GETTIMEOFDAY_ 1 @@ -133,19 +132,25 @@ # include <sys/types.h> // NOLINT #endif -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ #if GTEST_OS_WINDOWS # define vsnprintf _vsnprintf #endif // GTEST_OS_WINDOWS +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS +#include <crt_externs.h> +#endif +#endif + +#if GTEST_HAS_ABSL +#include "absl/debugging/failure_signal_handler.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/strings/str_cat.h" +#endif // GTEST_HAS_ABSL + namespace testing { using internal::CountIf; @@ -167,8 +172,10 @@ static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; // A test filter that matches everything. static const char kUniversalFilter[] = "*"; -// The default output file for XML output. -static const char kDefaultOutputFile[] = "test_detail.xml"; +// The default output format. +static const char kDefaultOutputFormat[] = "xml"; +// The default output file. +static const char kDefaultOutputFile[] = "test_detail"; // The environment variable name for the test shard index. static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; @@ -187,15 +194,31 @@ const char kStackTraceMarker[] = "\nStack trace:\n"; // specified on the command line. bool g_help_flag = false; +// Utilty function to Open File for Writing +static FILE* OpenFileForWriting(const std::string& output_file) { + FILE* fileout = NULL; + FilePath output_file_path(output_file); + FilePath output_dir(output_file_path.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + fileout = posix::FOpen(output_file.c_str(), "w"); + } + if (fileout == NULL) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; + } + return fileout; +} + } // namespace internal +// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY +// environment variable. static const char* GetDefaultFilter() { -#ifdef GTEST_TEST_FILTER_ENV_VAR_ - const char* const testbridge_test_only = getenv(GTEST_TEST_FILTER_ENV_VAR_); + const char* const testbridge_test_only = + internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); if (testbridge_test_only != NULL) { return testbridge_test_only; } -#endif // GTEST_TEST_FILTER_ENV_VAR_ return kUniversalFilter; } @@ -232,15 +255,28 @@ GTEST_DEFINE_string_( "exclude).
A test is run if it matches one of the positive " "patterns and does not match any of the negative patterns."); +GTEST_DEFINE_bool_( + install_failure_signal_handler, + internal::BoolFromGTestEnv("install_failure_signal_handler", false), + "If true and supported on the current platform, " GTEST_NAME_ " should " + "install a signal handler that dumps debugging information when fatal " + "signals are raised."); + GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); +// The net priority order after flag processing is thus: +// --gtest_output command line flag +// GTEST_OUTPUT environment variable +// XML_OUTPUT_FILE environment variable +// '' GTEST_DEFINE_string_( output, - internal::StringFromGTestEnv("output", ""), - "A format (currently must be \"xml\"), optionally followed " - "by a colon and an output file name or directory. A directory " - "is indicated by a trailing pathname separator. " + internal::StringFromGTestEnv("output", + internal::OutputFlagAlsoCheckEnvVar().c_str()), + "A format (defaults to \"xml\" but can be specified to be \"json\"), " + "optionally followed by a colon and an output file name or directory. " + "A directory is indicated by a trailing pathname separator. " "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " "If a directory is specified, output files will be created " "within that directory, with file-names based on the test " @@ -253,6 +289,12 @@ GTEST_DEFINE_bool_( "True iff " GTEST_NAME_ " should display elapsed time in text output."); +GTEST_DEFINE_bool_( + print_utf8, + internal::BoolFromGTestEnv("print_utf8", true), + "True iff " GTEST_NAME_ + " prints UTF8 characters as text."); + GTEST_DEFINE_int32_( random_seed, internal::Int32FromGTestEnv("random_seed", 0), @@ -294,7 +336,7 @@ GTEST_DEFINE_bool_( internal::BoolFromGTestEnv("throw_on_failure", false), "When this flag is specified, a failed assertion will throw an exception " "if exceptions are enabled or exit the program with a non-zero code " - "otherwise."); + "otherwise. For use with an external test framework."); #if GTEST_USE_OWN_FLAGFILE_FLAG_ GTEST_DEFINE_string_( @@ -308,10 +350,10 @@ namespace internal { // Generates a random number from [0, range), using a Linear // Congruential Generator (LCG). Crashes if 'range' is 0 or greater // than kMaxRange. -GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ UInt32 Random::Generate(UInt32 range) { // These constants are the same as are used in glibc's rand(3). - state_ = (1103515245U*state_ + 12345U) % kMaxRange; + // Use wider types than necessary to prevent unsigned overflow diagnostics. + state_ = static_cast<UInt32>(1103515245ULL*state_ + 12345U) % kMaxRange; GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; @@ -385,12 +427,15 @@ void AssertHelper::operator=(const Message& message) const { GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); // A copy of all command line arguments. Set by InitGoogleTest(). -::std::vector<testing::internal::string> g_argvs; +static ::std::vector<std::string> g_argvs; -const ::std::vector<testing::internal::string>& GetArgvs() { +::std::vector<std::string> GetArgvs() { #if defined(GTEST_CUSTOM_GET_ARGVS_) - return GTEST_CUSTOM_GET_ARGVS_(); -#else // defined(GTEST_CUSTOM_GET_ARGVS_) + // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or + // ::string. This code converts it to the appropriate type.
+ const auto& custom = GTEST_CUSTOM_GET_ARGVS_(); + return ::std::vector<std::string>(custom.begin(), custom.end()); +#else // defined(GTEST_CUSTOM_GET_ARGVS_) return g_argvs; #endif // defined(GTEST_CUSTOM_GET_ARGVS_) }
@@ -414,8 +459,6 @@ FilePath GetCurrentExecutableName() { // Returns the output format, or "" for normal printed output. std::string UnitTestOptions::GetOutputFormat() { const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) return std::string(""); - const char* const colon = strchr(gtest_output_flag, ':'); return (colon == NULL) ? std::string(gtest_output_flag) :
@@ -426,19 +469,22 @@ std::string UnitTestOptions::GetOutputFormat() { // was explicitly specified. std::string UnitTestOptions::GetAbsolutePathToOutputFile() { const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) - return ""; + + std::string format = GetOutputFormat(); + if (format.empty()) + format = std::string(kDefaultOutputFormat); const char* const colon = strchr(gtest_output_flag, ':'); if (colon == NULL) - return internal::FilePath::ConcatPaths( + return internal::FilePath::MakeFileName( internal::FilePath( UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile)).string(); + internal::FilePath(kDefaultOutputFile), 0, + format.c_str()).string(); internal::FilePath output_name(colon + 1); if (!output_name.IsAbsolutePath()) - // TODO(wan@google.com): on Windows \some\path is not an absolute + // FIXME: on Windows \some\path is not an absolute // path (as its meaning depends on the current drive), yet the // following logic for turning it into an absolute path is wrong. // Fix it.
@@ -629,12 +675,12 @@ extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); // This predicate-formatter checks that 'results' contains a test part // failure of the given type and that the failure message contains the // given substring. -AssertionResult HasOneFailure(const char* /* results_expr */, - const char* /* type_expr */, - const char* /* substr_expr */, - const TestPartResultArray& results, - TestPartResult::Type type, - const string& substr) { +static AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const std::string& substr) { const std::string expected(type == TestPartResult::kFatalFailure ? "1 fatal failure" : "1 non-fatal failure");
@@ -668,13 +714,10 @@ AssertionResult HasOneFailure(const char* /* results_expr */, // The constructor of SingleFailureChecker remembers where to look up // test part results, what type of failure we expect, and what // substring the failure message should contain. -SingleFailureChecker:: SingleFailureChecker( - const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr) - : results_(results), - type_(type), - substr_(substr) {} +SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const std::string& substr) + : results_(results), type_(type), substr_(substr) {} // The destructor of SingleFailureChecker verifies that the given // TestPartResultArray contains exactly one failure that has the given
@@ -815,7 +858,7 @@ TimeInMillis GetTimeInMillis() { SYSTEMTIME now_systime; FILETIME now_filetime; ULARGE_INTEGER now_int64; - // TODO(kenton@google.com): Shouldn't this just use + // FIXME: Shouldn't this just use // GetSystemTimeAsFileTime()?
GetSystemTime(&now_systime); if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
@@ -831,11 +874,11 @@ TimeInMillis GetTimeInMillis() { // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 // (deprecated function) there. - // TODO(kenton@google.com): Use GetTickCount()? Or use + // FIXME: Use GetTickCount()? Or use // SystemTimeToFileTime() - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) + GTEST_DISABLE_MSC_DEPRECATED_PUSH_() _ftime64(&now); - GTEST_DISABLE_MSC_WARNINGS_POP_() + GTEST_DISABLE_MSC_DEPRECATED_POP_() return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm; #elif GTEST_HAS_GETTIMEOFDAY_
@@ -1172,7 +1215,7 @@ class Hunk { // Print a unified diff header for one hunk. // The format is // "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@" - // where the left/right parts are ommitted if unnecessary. + // where the left/right parts are omitted if unnecessary. void PrintHeader(std::ostream* ss) const { *ss << "@@ "; if (removes_) {
@@ -1316,13 +1359,14 @@ AssertionResult EqFailure(const char* lhs_expression, const std::string& rhs_value, bool ignoring_case) { Message msg; - msg << " Expected: " << lhs_expression; + msg << "Expected equality of these values:"; + msg << "\n " << lhs_expression; if (lhs_value != lhs_expression) { - msg << "\n Which is: " << lhs_value; + msg << "\n Which is: " << lhs_value; } - msg << "\nTo be equal to: " << rhs_expression; + msg << "\n " << rhs_expression; if (rhs_value != rhs_expression) { - msg << "\n Which is: " << rhs_value; + msg << "\n Which is: " << rhs_value; } if (ignoring_case) {
@@ -1369,7 +1413,7 @@ AssertionResult DoubleNearPredFormat(const char* expr1, const double diff = fabs(val1 - val2); if (diff <= abs_error) return AssertionSuccess(); - // TODO(wan): do not print the value of an expression if it's + // FIXME: do not print the value of an expression if it's // already a literal. return AssertionFailure() << "The difference between " << expr1 << " and " << expr2
@@ -1664,7 +1708,7 @@ namespace { AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE +# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE // Windows CE doesn't support FormatMessage. const char error_text[] = "";
@@ -1721,7 +1765,7 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // Utility functions for encoding Unicode text (wide strings) in // UTF-8. -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8 // like this: // // Code-point length Encoding
@@ -1785,7 +1829,7 @@ std::string CodePointToUtf8(UInt32 code_point) { return str; } -// The following two functions only make sense if the the system +// The following two functions only make sense if the system // uses UTF-16 for wide string encoding. All supported systems // with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
@@ -2097,13 +2141,8 @@ static const char* const kReservedTestSuiteAttributes[] = { // The list of reserved attributes used in the <testcase> element of XML output.
static const char* const kReservedTestCaseAttributes[] = { - "classname", - "name", - "status", - "time", - "type_param", - "value_param" -}; + "classname", "name", "status", "time", + "type_param", "value_param", "file", "line"}; template <int kSize> std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
@@ -2139,8 +2178,9 @@ static std::string FormatWordList(const std::vector<std::string>& words) { return word_list.GetString(); } -bool ValidateTestPropertyName(const std::string& property_name, - const std::vector<std::string>& reserved_names) { +static bool ValidateTestPropertyName( + const std::string& property_name, + const std::vector<std::string>& reserved_names) { if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != reserved_names.end()) { ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
@@ -2437,6 +2477,8 @@ Result HandleExceptionsInMethodIfSupported( #if GTEST_HAS_EXCEPTIONS try { return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const AssertionException&) { // NOLINT + // This failure was reported already. } catch (const internal::GoogleTestFailureException&) { // NOLINT // This exception type can only be thrown by a failed Google // Test assertion with the intention of letting another testing
@@ -2558,7 +2600,6 @@ TestInfo* MakeAndRegisterTestInfo( return test_info; } -#if GTEST_HAS_PARAM_TEST void ReportInvalidTestCaseType(const char* test_case_name, CodeLocation code_location) { Message errors;
@@ -2572,13 +2613,10 @@ void ReportInvalidTestCaseType(const char* test_case_name, << "probably rename one of the classes to put the tests into different\n" << "test cases."; - fprintf(stderr, "%s %s", - FormatFileLocation(code_location.file.c_str(), - code_location.line).c_str(), - errors.GetString().c_str()); + GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(), + code_location.line) + << " " << errors.GetString(); } -#endif // GTEST_HAS_PARAM_TEST - } // namespace internal namespace {
@@ -2616,12 +2654,10 @@ namespace internal { // and INSTANTIATE_TEST_CASE_P into regular tests and registers those. // This will be done just once during the program runtime. void UnitTestImpl::RegisterParameterizedTests() { -#if GTEST_HAS_PARAM_TEST if (!parameterized_tests_registered_) { parameterized_test_registry_.RegisterTests(); parameterized_tests_registered_ = true; } -#endif } } // namespace internal
@@ -2649,18 +2685,18 @@ void TestInfo::Run() { factory_, &internal::TestFactoryBase::CreateTest, "the test fixture's constructor"); - // Runs the test only if the test object was created and its - // constructor didn't generate a fatal failure. - if ((test != NULL) && !Test::HasFatalFailure()) { + // Runs the test if the constructor didn't generate a fatal failure. + // Note that the object will not be null. + if (!Test::HasFatalFailure()) { // This doesn't throw as all user code that can throw are wrapped into // exception handling code. test->Run(); } - // Deletes the test object. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - test, &Test::DeleteSelf_, "the test fixture's destructor"); + // Deletes the test object.
+ impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); result_.set_elapsed_time(internal::GetTimeInMillis() - start); @@ -2886,10 +2922,10 @@ enum GTestColor { }; #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW // Returns the character attribute for the given color. -WORD GetColorAttribute(GTestColor color) { +static WORD GetColorAttribute(GTestColor color) { switch (color) { case COLOR_RED: return FOREGROUND_RED; case COLOR_GREEN: return FOREGROUND_GREEN; @@ -2898,11 +2934,42 @@ WORD GetColorAttribute(GTestColor color) { } } +static int GetBitOffset(WORD color_mask) { + if (color_mask == 0) return 0; + + int bitOffset = 0; + while ((color_mask & 1) == 0) { + color_mask >>= 1; + ++bitOffset; + } + return bitOffset; +} + +static WORD GetNewColor(GTestColor color, WORD old_color_attrs) { + // Let's reuse the BG + static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN | + BACKGROUND_RED | BACKGROUND_INTENSITY; + static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN | + FOREGROUND_RED | FOREGROUND_INTENSITY; + const WORD existing_bg = old_color_attrs & background_mask; + + WORD new_color = + GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY; + static const int bg_bitOffset = GetBitOffset(background_mask); + static const int fg_bitOffset = GetBitOffset(foreground_mask); + + if (((new_color & background_mask) >> bg_bitOffset) == + ((new_color & foreground_mask) >> fg_bitOffset)) { + new_color ^= FOREGROUND_INTENSITY; // invert intensity + } + return new_color; +} + #else // Returns the ANSI color code for the given color. COLOR_DEFAULT is // an invalid input. -const char* GetAnsiColorCode(GTestColor color) { +static const char* GetAnsiColorCode(GTestColor color) { switch (color) { case COLOR_RED: return "1"; case COLOR_GREEN: return "2"; @@ -2918,7 +2985,7 @@ bool ShouldUseColor(bool stdout_is_tty) { const char* const gtest_color = GTEST_FLAG(color).c_str(); if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { -#if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW // On Windows the TERM variable is usually not set, but the // console there does support colors. return stdout_is_tty; @@ -2954,7 +3021,7 @@ bool ShouldUseColor(bool stdout_is_tty) { // cannot simply emit special characters and have the terminal change colors. // This routine must actually emit the characters rather than return a string // that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) { +static void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_list args; va_start(args, fmt); @@ -2975,20 +3042,21 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) { } #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); // Gets the current text color. 
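
As context for the hunk above: GetNewColor() preserves the console's background bits, forces FOREGROUND_INTENSITY, and flips the intensity bit whenever the foreground and background nibbles would otherwise match (which would render invisible text). A minimal standalone sketch of that idea, assuming plain int attributes and invented mask values rather than the real Win32 WORD constants:

    // color_sketch.cc -- illustrative only; not part of the patch.
    #include <cstdio>

    static const int kBgMask = 0xF0;       // background nibble (assumed layout)
    static const int kFgMask = 0x0F;       // foreground nibble (assumed layout)
    static const int kFgIntensity = 0x08;  // stand-in for FOREGROUND_INTENSITY

    // Returns the bit offset of the lowest set bit in mask (0 for mask == 0).
    static int BitOffset(int mask) {
      int offset = 0;
      while (mask != 0 && (mask & 1) == 0) {
        mask >>= 1;
        ++offset;
      }
      return offset;
    }

    // Keeps the old background, sets the requested foreground, and inverts
    // intensity if foreground and background would collide.
    static int NewColor(int requested_fg, int old_attrs) {
      int color = (old_attrs & kBgMask) | (requested_fg & kFgMask) | kFgIntensity;
      if (((color & kBgMask) >> BitOffset(kBgMask)) ==
          ((color & kFgMask) >> BitOffset(kFgMask))) {
        color ^= kFgIntensity;
      }
      return color;
    }

    int main() {
      // Red text (0x4) on an intense-red background (0xC0): intensity is
      // flipped, so the result is 0xC4 rather than the invisible 0xCC.
      std::printf("0x%02X\n", NewColor(0x4, 0xC0));
      return 0;
    }
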
CONSOLE_SCREEN_BUFFER_INFO buffer_info; GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); const WORD old_color_attrs = buffer_info.wAttributes; + const WORD new_color = GetNewColor(color, old_color_attrs); // We need to flush the stream buffers into the console before each // SetConsoleTextAttribute call lest it affect the text that is already // printed but has not yet reached the console. fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetColorAttribute(color) | FOREGROUND_INTENSITY); + SetConsoleTextAttribute(stdout_handle, new_color); + vprintf(fmt, args); fflush(stdout);
@@ -3002,12 +3070,12 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_end(args); } -// Text printed in Google Test's text output and --gunit_list_tests +// Text printed in Google Test's text output and --gtest_list_tests // output to label the type parameter and value parameter for a test. static const char kTypeParamLabel[] = "TypeParam"; static const char kValueParamLabel[] = "GetParam()"; -void PrintFullTestCommentIfPresent(const TestInfo& test_info) { +static void PrintFullTestCommentIfPresent(const TestInfo& test_info) { const char* const type_param = test_info.type_param(); const char* const value_param = test_info.value_param();
@@ -3278,7 +3346,7 @@ void TestEventRepeater::Append(TestEventListener *listener) { listeners_.push_back(listener); } -// TODO(vladl@google.com): Factor the search functionality into Vector::Find. +// FIXME: Factor the search functionality into Vector::Find. TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { for (size_t i = 0; i < listeners_.size(); ++i) { if (listeners_[i] == listener) {
@@ -3352,6 +3420,11 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { explicit XmlUnitTestResultPrinter(const char* output_file); virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + void ListTestsMatchingFilter(const std::vector<TestCase*>& test_cases); + + // Prints an XML summary of all unit tests. + static void PrintXmlTestsList(std::ostream* stream, + const std::vector<TestCase*>& test_cases); private: // Is c a whitespace character that is normalized to a space character
@@ -3413,6 +3486,11 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // to delimit this attribute from prior attributes. static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + // Streams an XML representation of the test properties of a TestResult + // object. + static void OutputXmlTestProperties(std::ostream* stream, + const TestResult& result); + // The output file. const std::string output_file_;
@@ -3422,46 +3500,30 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // Creates a new XmlUnitTestResultPrinter. XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) : output_file_(output_file) { - if (output_file_.c_str() == NULL || output_file_.empty()) { - fprintf(stderr, "XML output file may not be null\n"); - fflush(stderr); - exit(EXIT_FAILURE); + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "XML output file may not be null"; } } // Called after the unit test ends. void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, int /*iteration*/) { - FILE* xmlout = NULL; - FilePath output_file(output_file_); - FilePath output_dir(output_file.RemoveFileName()); - - if (output_dir.CreateDirectoriesRecursively()) { - xmlout = posix::FOpen(output_file_.c_str(), "w"); - } - if (xmlout == NULL) { - // TODO(wan): report the reason of the failure.
- // - // We don't do it for now as: - // - // 1. There is no urgent need for it. - // 2. It's a bit involved to make the errno variable thread-safe on - // all three operating systems (Linux, Windows, and Mac OS). - // 3. To interpret the meaning of errno in a thread-safe way, - // we need the strerror_r() function, which is not available on - // Windows. - fprintf(stderr, - "Unable to open file \"%s\"\n", - output_file_.c_str()); - fflush(stderr); - exit(EXIT_FAILURE); - } + FILE* xmlout = OpenFileForWriting(output_file_); std::stringstream stream; PrintXmlUnitTest(&stream, unit_test); fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); fclose(xmlout); } +void XmlUnitTestResultPrinter::ListTestsMatchingFilter( + const std::vector<TestCase*>& test_cases) { + FILE* xmlout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintXmlTestsList(&stream, test_cases); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + // Returns an XML-escaped copy of the input string str. If is_attribute // is true, the text is meant to appear as an attribute value, and // normalizable whitespace is preserved by replacing it with character // references.
@@ -3472,7 +3534,7 @@ void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, // module will consist of ordinary English text. // If this module is ever modified to produce version 1.1 XML output, // most invalid characters can be retained using character references. -// TODO(wan): It might be nice to have a minimally invasive, human-readable +// FIXME: It might be nice to have a minimally invasive, human-readable // escaping scheme for invalid characters, rather than dropping them. std::string XmlUnitTestResultPrinter::EscapeXml( const std::string& str, bool is_attribute) {
@@ -3533,6 +3595,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( // The following routines generate an XML representation of a UnitTest // object. +// GOOGLETEST_CM0009 DO NOT DELETE // // This is how Google Test concepts map to the DTD: //
@@ -3622,13 +3685,17 @@ void XmlUnitTestResultPrinter::OutputXmlAttribute( } // Prints an XML representation of a TestInfo object. -// TODO(wan): There is also value in printing properties with the plain printer. +// FIXME: There is also value in printing properties with the plain printer. void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, const char* test_case_name, const TestInfo& test_info) { const TestResult& result = *test_info.result(); const std::string kTestcase = "testcase"; + if (test_info.is_in_another_shard()) { + return; + } + *stream << "    <testcase"; OutputXmlAttribute(stream, kTestcase, "name", test_info.name()); if (test_info.value_param() != NULL) { OutputXmlAttribute(stream, kTestcase, "value_param", test_info.value_param()); } if (test_info.type_param() != NULL) { OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param()); } + if (GTEST_FLAG(list_tests)) { + OutputXmlAttribute(stream, kTestcase, "file", test_info.file()); + OutputXmlAttribute(stream, kTestcase, "line", StreamableToString(test_info.line())); + *stream << " />\n"; + return; + } OutputXmlAttribute(stream, kTestcase, "status", test_info.should_run() ?
"run" : "notrun"); OutputXmlAttribute(stream, kTestcase, "time", FormatTimeInMillisAsSeconds(result.elapsed_time())); OutputXmlAttribute(stream, kTestcase, "classname", test_case_name); - *stream << TestPropertiesAsXmlAttributes(result); int failures = 0; for (int i = 0; i < result.total_part_count(); ++i) { @@ -3654,22 +3727,28 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, if (++failures == 1) { *stream << ">\n"; } - const string location = internal::FormatCompilerIndependentFileLocation( - part.file_name(), part.line_number()); - const string summary = location + "\n" + part.summary(); + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); *stream << " "; - const string detail = location + "\n" + part.message(); + const std::string detail = location + "\n" + part.message(); OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); *stream << "\n"; } } - if (failures == 0) + if (failures == 0 && result.test_property_count() == 0) { *stream << " />\n"; - else + } else { + if (failures == 0) { + *stream << ">\n"; + } + OutputXmlTestProperties(stream, result); *stream << " \n"; + } } // Prints an XML representation of a TestCase object @@ -3680,17 +3759,18 @@ void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream, OutputXmlAttribute(stream, kTestsuite, "name", test_case.name()); OutputXmlAttribute(stream, kTestsuite, "tests", StreamableToString(test_case.reportable_test_count())); - OutputXmlAttribute(stream, kTestsuite, "failures", - StreamableToString(test_case.failed_test_count())); - OutputXmlAttribute( - stream, kTestsuite, "disabled", - StreamableToString(test_case.reportable_disabled_test_count())); - OutputXmlAttribute(stream, kTestsuite, "errors", "0"); - OutputXmlAttribute(stream, kTestsuite, "time", - FormatTimeInMillisAsSeconds(test_case.elapsed_time())); - *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()) - << ">\n"; - + if (!GTEST_FLAG(list_tests)) { + OutputXmlAttribute(stream, kTestsuite, "failures", + StreamableToString(test_case.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuite, "disabled", + StreamableToString(test_case.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(test_case.elapsed_time())); + *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()); + } + *stream << ">\n"; for (int i = 0; i < test_case.total_test_count(); ++i) { if (test_case.GetTestInfo(i)->is_reportable()) OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); @@ -3724,7 +3804,6 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, OutputXmlAttribute(stream, kTestsuites, "random_seed", StreamableToString(unit_test.random_seed())); } - *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); @@ -3737,6 +3816,28 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, *stream << "\n"; } +void XmlUnitTestResultPrinter::PrintXmlTestsList( + std::ostream* stream, const std::vector& test_cases) { + const std::string kTestsuites = "testsuites"; + + *stream << "\n"; + *stream << "<" << kTestsuites; + + int total_tests = 0; + for (size_t i = 0; i < test_cases.size(); ++i) { + total_tests += 
test_cases[i]->total_test_count(); + } + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(total_tests)); + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + *stream << ">\n"; + + for (size_t i = 0; i < test_cases.size(); ++i) { + PrintXmlTestCase(stream, *test_cases[i]); + } + *stream << "</" << kTestsuites << ">\n"; +} + // Produces a string representing the test properties in a result as space // delimited XML attributes based on the property key="value" pairs. std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
@@ -3750,8 +3851,390 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( return attributes.GetString(); } +void XmlUnitTestResultPrinter::OutputXmlTestProperties( + std::ostream* stream, const TestResult& result) { + const std::string kProperties = "properties"; + const std::string kProperty = "property"; + + if (result.test_property_count() <= 0) { + return; + } + + *stream << "<" << kProperties << ">\n"; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + *stream << "<" << kProperty; + *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\""; + *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\""; + *stream << "/>\n"; + } + *stream << "</" << kProperties << ">\n"; +} + // End XmlUnitTestResultPrinter +// This class generates a JSON output file. +class JsonUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit JsonUnitTestResultPrinter(const char* output_file); + + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + + // Prints a JSON summary of all unit tests. + static void PrintJsonTestList(::std::ostream* stream, + const std::vector<TestCase*>& test_cases); + + private: + // Returns a JSON-escaped copy of the input string str. + static std::string EscapeJson(const std::string& str); + + //// Verifies that the given attribute belongs to the given element and + //// streams the attribute as JSON. + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma = true); + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + int value, + const std::string& indent, + bool comma = true); + + // Streams a JSON representation of a TestInfo object. + static void OutputJsonTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info); + + // Prints a JSON representation of a TestCase object + static void PrintJsonTestCase(::std::ostream* stream, + const TestCase& test_case); + + // Prints a JSON summary of unit_test to output stream out. + static void PrintJsonUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as + // a JSON dictionary. + static std::string TestPropertiesAsJson(const TestResult& result, + const std::string& indent); + + // The output file. + const std::string output_file_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter); +}; + +// Creates a new JsonUnitTestResultPrinter.
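
For orientation, the <properties> element emitted by OutputXmlTestProperties() above corresponds to RecordProperty() calls made inside a test. A hedged usage sketch — the test and key names here are invented, not taken from the patch:

    #include "gtest/gtest.h"

    TEST(ReportDemo, RecordsProperties) {
      // With --gtest_output=xml:report.xml, this call is expected to appear
      // inside this test's <testcase> element as
      //   <properties>
      //     <property name="build_id" value="1234"/>
      //   </properties>
      RecordProperty("build_id", 1234);
      EXPECT_TRUE(true);
    }
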
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "JSON output file may not be null"; + } +} + +void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* jsonout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintJsonUnitTest(&stream, unit_test); + fprintf(jsonout, "%s", StringStreamToString(&stream).c_str()); + fclose(jsonout); +} + +// Returns a JSON-escaped copy of the input string str. +std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '\\': + case '"': + case '/': + m << '\\' << ch; + break; + case '\b': + m << "\\b"; + break; + case '\t': + m << "\\t"; + break; + case '\n': + m << "\\n"; + break; + case '\f': + m << "\\f"; + break; + case '\r': + m << "\\r"; + break; + default: + if (ch < ' ') { + m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch)); + } else { + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// The following routines generate a JSON representation of a UnitTest +// object. + +// Formats the given time in milliseconds as seconds. +static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) { + ::std::stringstream ss; + ss << (static_cast<double>(ms) * 1e-3) << "s"; + return ss.str(); +} + +// Converts the given epoch time in milliseconds to a date string in the +// RFC3339 format, without the timezone information. +static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { + struct tm time_struct; + if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct)) + return ""; + // YYYY-MM-DDThh:mm:ss + return StreamableToString(time_struct.tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "Z"; +} + +static inline std::string Indent(int width) { + return std::string(width, ' '); +} + +void JsonUnitTestResultPrinter::OutputJsonKey( + std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma) { + const std::vector<std::string>& allowed_names = + GetReservedAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\""; + if (comma) + *stream << ",\n"; +} + +void JsonUnitTestResultPrinter::OutputJsonKey( + std::ostream* stream, + const std::string& element_name, + const std::string& name, + int value, + const std::string& indent, + bool comma) { + const std::vector<std::string>& allowed_names = + GetReservedAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": " << StreamableToString(value); + if (comma) + *stream << ",\n"; +} + +// Prints a JSON representation of a TestInfo object.
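
The RFC3339 helper above truncates to whole seconds and appends a literal "Z". A standalone sketch of the same formatting, assuming gmtime() in place of gtest's portable localtime wrapper:

    #include <cstdio>
    #include <ctime>

    // Prints epoch milliseconds as YYYY-MM-DDThh:mm:ssZ, mirroring
    // FormatEpochTimeInMillisAsRFC3339() above (illustrative only).
    static void PrintRfc3339(long long ms) {
      const std::time_t seconds = static_cast<std::time_t>(ms / 1000);
      const std::tm parts = *std::gmtime(&seconds);
      std::printf("%04d-%02d-%02dT%02d:%02d:%02dZ\n",
                  parts.tm_year + 1900, parts.tm_mon + 1, parts.tm_mday,
                  parts.tm_hour, parts.tm_min, parts.tm_sec);
    }

    int main() {
      PrintRfc3339(0);  // 1970-01-01T00:00:00Z
      return 0;
    }
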
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestcase = "testcase"; + const std::string kIndent = Indent(10); + + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, kTestcase, "name", test_info.name(), kIndent); + + if (test_info.value_param() != NULL) { + OutputJsonKey(stream, kTestcase, "value_param", + test_info.value_param(), kIndent); + } + if (test_info.type_param() != NULL) { + OutputJsonKey(stream, kTestcase, "type_param", test_info.type_param(), + kIndent); + } + if (GTEST_FLAG(list_tests)) { + OutputJsonKey(stream, kTestcase, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestcase, "line", test_info.line(), kIndent, false); + *stream << "\n" << Indent(8) << "}"; + return; + } + + OutputJsonKey(stream, kTestcase, "status", + test_info.should_run() ? "RUN" : "NOTRUN", kIndent); + OutputJsonKey(stream, kTestcase, "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent); + OutputJsonKey(stream, kTestcase, "classname", test_case_name, kIndent, false); + *stream << TestPropertiesAsJson(result, kIndent); + + int failures = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + *stream << ",\n"; + if (++failures == 1) { + *stream << kIndent << "\"" << "failures" << "\": [\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string message = EscapeJson(location + "\n" + part.message()); + *stream << kIndent << " {\n" + << kIndent << " \"failure\": \"" << message << "\",\n" + << kIndent << " \"type\": \"\"\n" + << kIndent << " }"; + } + } + + if (failures > 0) + *stream << "\n" << kIndent << "]"; + *stream << "\n" << Indent(8) << "}"; +} + +// Prints an JSON representation of a TestCase object +void JsonUnitTestResultPrinter::PrintJsonTestCase(std::ostream* stream, + const TestCase& test_case) { + const std::string kTestsuite = "testsuite"; + const std::string kIndent = Indent(6); + + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, kTestsuite, "name", test_case.name(), kIndent); + OutputJsonKey(stream, kTestsuite, "tests", test_case.reportable_test_count(), + kIndent); + if (!GTEST_FLAG(list_tests)) { + OutputJsonKey(stream, kTestsuite, "failures", test_case.failed_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuite, "disabled", + test_case.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent); + OutputJsonKey(stream, kTestsuite, "time", + FormatTimeInMillisAsDuration(test_case.elapsed_time()), + kIndent, false); + *stream << TestPropertiesAsJson(test_case.ad_hoc_test_result(), kIndent) + << ",\n"; + } + + *stream << kIndent << "\"" << kTestsuite << "\": [\n"; + + bool comma = false; + for (int i = 0; i < test_case.total_test_count(); ++i) { + if (test_case.GetTestInfo(i)->is_reportable()) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + OutputJsonTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); + } + } + *stream << "\n" << kIndent << "]\n" << Indent(4) << "}"; +} + +// Prints a JSON summary of unit_test to output stream out. 
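
Putting the printer's pieces together: invoking a test binary with --gtest_output=json:report.json should yield a file shaped roughly as below. All values are invented for illustration; see PrintJsonUnitTest() in the next hunk for the exact key order:

    {
      "tests": 1,
      "failures": 0,
      "disabled": 0,
      "errors": 0,
      "timestamp": "2019-09-11T15:51:47Z",
      "time": "0.005s",
      "name": "AllTests",
      "testsuites": [
        {
          "name": "ReportDemo",
          "tests": 1,
          "failures": 0,
          "disabled": 0,
          "errors": 0,
          "time": "0.002s",
          "testsuite": [
            {
              "name": "RecordsProperties",
              "status": "RUN",
              "time": "0.001s",
              "classname": "ReportDemo"
            }
          ]
        }
      ]
    }
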
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, + const UnitTest& unit_test) { + const std::string kTestsuites = "testsuites"; + const std::string kIndent = Indent(2); + *stream << "{\n"; + + OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuites, "disabled", + unit_test.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent); + if (GTEST_FLAG(shuffle)) { + OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(), + kIndent); + } + OutputJsonKey(stream, kTestsuites, "timestamp", + FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuites, "time", + FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent, + false); + + *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent) + << ",\n"; + + OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); + *stream << kIndent << "\"" << kTestsuites << "\": [\n"; + + bool comma = false; + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + if (unit_test.GetTestCase(i)->reportable_test_count() > 0) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + PrintJsonTestCase(stream, *unit_test.GetTestCase(i)); + } + } + + *stream << "\n" << kIndent << "]\n" << "}\n"; +} + +void JsonUnitTestResultPrinter::PrintJsonTestList( + std::ostream* stream, const std::vector<TestCase*>& test_cases) { + const std::string kTestsuites = "testsuites"; + const std::string kIndent = Indent(2); + *stream << "{\n"; + int total_tests = 0; + for (size_t i = 0; i < test_cases.size(); ++i) { + total_tests += test_cases[i]->total_test_count(); + } + OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent); + + OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); + *stream << kIndent << "\"" << kTestsuites << "\": [\n"; + + for (size_t i = 0; i < test_cases.size(); ++i) { + if (i != 0) { + *stream << ",\n"; + } + PrintJsonTestCase(stream, *test_cases[i]); + } + + *stream << "\n" + << kIndent << "]\n" + << "}\n"; +} +// Produces a string representing the test properties in a result as +// a JSON dictionary. +std::string JsonUnitTestResultPrinter::TestPropertiesAsJson( + const TestResult& result, const std::string& indent) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << ",\n" << indent << "\"" << property.key() << "\": " + << "\"" << EscapeJson(property.value()) << "\""; + } + return attributes.GetString(); +} + +// End JsonUnitTestResultPrinter + #if GTEST_CAN_STREAM_RESULTS_ // Checks if str contains '=', '&', '%' or '\n' characters. If yes, @@ -3759,8 +4242,8 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( // example, replaces "=" with "%3D". This algorithm is O(strlen(str)) // in both time and space -- important as the input str may contain an // arbitrarily long test failure message and stack trace.
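
UrlEncode(), shown next, escapes only the four characters that are significant in the streaming wire format ('=', '&', '%', '\n'). A minimal standalone sketch of the same percent-encoding idea (not gtest's actual helper):

    #include <cstdio>
    #include <string>

    // Percent-encodes the reserved characters, leaving everything else as-is.
    static std::string PercentEncode(const std::string& in) {
      std::string out;
      out.reserve(in.size());
      for (std::string::size_type i = 0; i < in.size(); ++i) {
        const char ch = in[i];
        switch (ch) {
          case '%': case '=': case '&': case '\n': {
            char buf[4];
            snprintf(buf, sizeof(buf), "%%%02X",
                     static_cast<unsigned char>(ch));
            out += buf;
            break;
          }
          default:
            out += ch;
        }
      }
      return out;
    }

    int main() {
      // Prints a%3Db%26c%0A
      std::printf("%s\n", PercentEncode("a=b&c\n").c_str());
      return 0;
    }
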
-string StreamingListener::UrlEncode(const char* str) { - string result; +std::string StreamingListener::UrlEncode(const char* str) { + std::string result; result.reserve(strlen(str) + 1); for (char ch = *str; ch != '\0'; ch = *++str) { switch (ch) {
@@ -3822,47 +4305,82 @@ void StreamingListener::SocketWriter::MakeConnection() { // End of class Streaming Listener #endif // GTEST_CAN_STREAM_RESULTS__ -// Class ScopedTrace - -// Pushes the given source file location and message onto a per-thread -// trace stack maintained by Google Test. -ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - TraceInfo trace; - trace.file = file; - trace.line = line; - trace.message = message.GetString(); - - UnitTest::GetInstance()->PushGTestTrace(trace); -} - -// Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - UnitTest::GetInstance()->PopGTestTrace(); -} - - // class OsStackTraceGetter const char* const OsStackTraceGetterInterface::kElidedFramesMarker = "... " GTEST_NAME_ " internal frames ..."; -string OsStackTraceGetter::CurrentStackTrace(int /*max_depth*/, - int /*skip_count*/) { +std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) + GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + std::string result; + + if (max_depth <= 0) { + return result; + } + + max_depth = std::min(max_depth, kMaxStackTraceDepth); + + std::vector<void*> raw_stack(max_depth); + // Skips the frames requested by the caller, plus this function. + const int raw_stack_size = + absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1); + + void* caller_frame = nullptr; + { + MutexLock lock(&mutex_); + caller_frame = caller_frame_; + } + + for (int i = 0; i < raw_stack_size; ++i) { + if (raw_stack[i] == caller_frame && + !GTEST_FLAG(show_internal_stack_frames)) { + // Add a marker to the trace and stop adding frames. + absl::StrAppend(&result, kElidedFramesMarker, "\n"); + break; + } + + char tmp[1024]; + const char* symbol = "(unknown)"; + if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) { + symbol = tmp; + } + + char line[1024]; + snprintf(line, sizeof(line), " %p: %s\n", raw_stack[i], symbol); + result += line; + } + + return result; + +#else // !GTEST_HAS_ABSL + static_cast<void>(max_depth); + static_cast<void>(skip_count); return ""; +#endif // GTEST_HAS_ABSL } -void OsStackTraceGetter::UponLeavingGTest() {} +void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + void* caller_frame = nullptr; + if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) { + caller_frame = nullptr; + } + + MutexLock lock(&mutex_); + caller_frame_ = caller_frame; +#endif // GTEST_HAS_ABSL +} // A helper class that creates the premature-exit file in its // constructor and deletes the file in its destructor. class ScopedPrematureExitFile { public: explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath) { + : premature_exit_filepath_(premature_exit_filepath ? + premature_exit_filepath : "") { // If a path to the premature-exit file is specified... - if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { + if (!premature_exit_filepath_.empty()) { // create the file with a single "0" character in it. I/O // errors are ignored as there's nothing better we can do and we // don't want to fail the test because of this.
@@ -3873,13 +4391,18 @@ class ScopedPrematureExitFile { } ~ScopedPrematureExitFile() { - if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { - remove(premature_exit_filepath_); + if (!premature_exit_filepath_.empty()) { + int retval = remove(premature_exit_filepath_.c_str()); + if (retval) { + GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \"" + << premature_exit_filepath_ << "\" with error " + << retval; + } } } private: - const char* const premature_exit_filepath_; + const std::string premature_exit_filepath_; GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); }; @@ -4149,6 +4672,11 @@ void UnitTest::AddTestPartResult( // when a failure happens and both the --gtest_break_on_failure and // the --gtest_catch_exceptions flags are specified. DebugBreak(); +#elif (!defined(__native_client__)) && \ + ((defined(__clang__) || defined(__GNUC__)) && \ + (defined(__x86_64__) || defined(__i386__))) + // with clang/gcc we can achieve the same effect on x86 by invoking int3 + asm("int3"); #else // Dereference NULL through a volatile pointer to prevent the compiler // from removing. We use this rather than abort() or __builtin_trap() for @@ -4216,7 +4744,7 @@ int UnitTest::Run() { // used for the duration of the program. impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); -#if GTEST_HAS_SEH +#if GTEST_OS_WINDOWS // Either the user wants Google Test to catch exceptions thrown by the // tests or this is executing in the context of death test child // process. In either case the user does not want to see pop-up dialogs @@ -4245,7 +4773,7 @@ int UnitTest::Run() { // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. // Users of prior VC versions shall suffer the agony and pain of // clicking through the countless debug dialogs. - // TODO(vladl@google.com): find a way to suppress the abort dialog() in the + // FIXME: find a way to suppress the abort dialog() in the // debug mode when compiled with VC 7.1 or lower. if (!GTEST_FLAG(break_on_failure)) _set_abort_behavior( @@ -4253,7 +4781,7 @@ int UnitTest::Run() { _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. # endif } -#endif // GTEST_HAS_SEH +#endif // GTEST_OS_WINDOWS return internal::HandleExceptionsInMethodIfSupported( impl(), @@ -4286,7 +4814,6 @@ const TestInfo* UnitTest::current_test_info() const // Returns the random seed used at the start of the current test run. int UnitTest::random_seed() const { return impl_->random_seed(); } -#if GTEST_HAS_PARAM_TEST // Returns ParameterizedTestCaseRegistry object used to keep track of // value-parameterized tests and instantiate and register them. internal::ParameterizedTestCaseRegistry& @@ -4294,7 +4821,6 @@ internal::ParameterizedTestCaseRegistry& GTEST_LOCK_EXCLUDED_(mutex_) { return impl_->parameterized_test_registry(); } -#endif // GTEST_HAS_PARAM_TEST // Creates an empty UnitTest. 
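
Regarding the asm("int3") added in the hunk above: on x86/x86-64 it raises a software breakpoint (SIGTRAP), so a debugger attached when --gtest_break_on_failure fires stops exactly at the failing assertion. A tiny standalone illustration — gcc/clang on the x86 family only; outside a debugger it typically terminates the process:

    // break_sketch.cc -- illustrative only. Build with gcc or clang and run
    // under gdb/lldb to observe the stop at the int3 instruction.
    int main() {
    #if (defined(__clang__) || defined(__GNUC__)) && \
        (defined(__x86_64__) || defined(__i386__))
      asm("int3");  // the same trap --gtest_break_on_failure now uses here
    #endif
      return 0;
    }
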
UnitTest::UnitTest() {
@@ -4333,10 +4859,8 @@ UnitTestImpl::UnitTestImpl(UnitTest* parent) &default_global_test_part_result_reporter_), per_thread_test_part_result_reporter_( &default_per_thread_test_part_result_reporter_), -#if GTEST_HAS_PARAM_TEST parameterized_test_registry_(), parameterized_tests_registered_(false), -#endif // GTEST_HAS_PARAM_TEST last_death_test_case_(-1), current_test_case_(NULL), current_test_info_(NULL),
@@ -4403,10 +4927,12 @@ void UnitTestImpl::ConfigureXmlOutput() { if (output_format == "xml") { listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format == "json") { + listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); } else if (output_format != "") { - printf("WARNING: unrecognized output format \"%s\" ignored.\n", - output_format.c_str()); - fflush(stdout); + GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \"" + << output_format << "\" ignored."; } }
@@ -4421,9 +4947,8 @@ void UnitTestImpl::ConfigureStreamingOutput() { listeners()->Append(new StreamingListener(target.substr(0, pos), target.substr(pos+1))); } else { - printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", - target.c_str()); - fflush(stdout); + GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target + << "\" ignored."; } } }
@@ -4462,6 +4987,13 @@ void UnitTestImpl::PostFlagParsingInit() { // Configures listeners for streaming test results to the specified server. ConfigureStreamingOutput(); #endif // GTEST_CAN_STREAM_RESULTS_ + +#if GTEST_HAS_ABSL + if (GTEST_FLAG(install_failure_signal_handler)) { + absl::FailureSignalHandlerOptions options; + absl::InstallFailureSignalHandler(options); + } +#endif // GTEST_HAS_ABSL } }
@@ -4505,11 +5037,11 @@ TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, Test::SetUpTestCaseFunc set_up_tc, Test::TearDownTestCaseFunc tear_down_tc) { // Can we find a TestCase with the given name? - const std::vector<TestCase*>::const_iterator test_case = - std::find_if(test_cases_.begin(), test_cases_.end(), TestCaseNameIs(test_case_name)); + const std::vector<TestCase*>::const_reverse_iterator test_case = + std::find_if(test_cases_.rbegin(), test_cases_.rend(), TestCaseNameIs(test_case_name)); - if (test_case != test_cases_.end()) + if (test_case != test_cases_.rend()) return *test_case; // No. Let's create one.
@@ -4550,13 +5082,8 @@ static void TearDownEnvironment(Environment* env) { env->TearDown(); } // All other functions called from RunAllTests() may safely assume that // parameterized tests are ready to be counted and run. bool UnitTestImpl::RunAllTests() { - // Makes sure InitGoogleTest() was called. - if (!GTestIsInitialized()) { - printf("%s", - "\nThis test program did NOT call ::testing::InitGoogleTest " - "before calling RUN_ALL_TESTS(). Please fix it.\n"); - return false; - } + // True iff Google Test is initialized before RUN_ALL_TESTS() is called. + const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); // Do not run any test if the --help flag was specified. if (g_help_flag)
@@ -4684,6 +5211,20 @@ bool UnitTestImpl::RunAllTests() { repeater->OnTestProgramEnd(*parent_); + if (!gtest_is_initialized_before_run_all_tests) { + ColoredPrintf( + COLOR_RED, + "\nIMPORTANT NOTICE - DO NOT IGNORE:\n" + "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_ + "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_ + " will start to enforce the valid usage.
" + "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT +#if GTEST_FOR_GOOGLE_ + ColoredPrintf(COLOR_RED, + "For more details, see http://wiki/Main/ValidGUnitMain.\n"); +#endif // GTEST_FOR_GOOGLE_ + } + return !failed; } @@ -4785,8 +5326,8 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // each TestCase and TestInfo object. // If shard_tests == true, further filters tests based on sharding // variables in the environment - see -// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. -// Returns the number of tests that should run. +// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md +// . Returns the number of tests that should run. int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? Int32FromEnvOrDie(kTestTotalShards, -1) : -1; @@ -4825,10 +5366,11 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && matches_filter; - const bool is_selected = is_runnable && - (shard_tests == IGNORE_SHARDING_PROTOCOL || - ShouldRunTestOnShard(total_shards, shard_index, - num_runnable_tests)); + const bool is_in_another_shard = + shard_tests != IGNORE_SHARDING_PROTOCOL && + !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests); + test_info->is_in_another_shard_ = is_in_another_shard; + const bool is_selected = is_runnable && !is_in_another_shard; num_runnable_tests += is_runnable; num_selected_tests += is_selected; @@ -4898,6 +5440,23 @@ void UnitTestImpl::ListTestsMatchingFilter() { } } fflush(stdout); + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml" || output_format == "json") { + FILE* fileout = OpenFileForWriting( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()); + std::stringstream stream; + if (output_format == "xml") { + XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintXmlTestsList(&stream, test_cases_); + } else if (output_format == "json") { + JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintJsonTestList(&stream, test_cases_); + } + fprintf(fileout, "%s", StringStreamToString(&stream).c_str()); + fclose(fileout); + } } // Sets the OS stack trace getter. @@ -4928,11 +5487,15 @@ OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { return os_stack_trace_getter_; } -// Returns the TestResult for the test that's currently running, or -// the TestResult for the ad hoc test if no test is running. +// Returns the most specific TestResult currently running. TestResult* UnitTestImpl::current_test_result() { - return current_test_info_ ? - &(current_test_info_->result_) : &ad_hoc_test_result_; + if (current_test_info_ != NULL) { + return ¤t_test_info_->result_; + } + if (current_test_case_ != NULL) { + return ¤t_test_case_->ad_hoc_test_result_; + } + return &ad_hoc_test_result_; } // Shuffles all test cases, and the tests within each test case, @@ -5013,9 +5576,8 @@ bool SkipPrefix(const char* prefix, const char** pstr) { // part can be omitted. // // Returns the value of the flag, or NULL if the parsing failed. -const char* ParseFlagValue(const char* str, - const char* flag, - bool def_optional) { +static const char* ParseFlagValue(const char* str, const char* flag, + bool def_optional) { // str and flag must not be NULL. 
if (str == NULL || flag == NULL) return NULL;
@@ -5051,7 +5613,7 @@ const char* ParseFlagValue(const char* str, // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { +static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { // Gets the value of the flag as a string. const char* const value_str = ParseFlagValue(str, flag, true);
@@ -5085,7 +5647,8 @@ bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -bool ParseStringFlag(const char* str, const char* flag, std::string* value) { +template <typename String> +static bool ParseStringFlag(const char* str, const char* flag, String* value) { // Gets the value of the flag as a string. const char* const value_str = ParseFlagValue(str, flag, false);
@@ -5121,7 +5684,7 @@ static bool HasGoogleTestFlagPrefix(const char* str) { // @Y changes the color to yellow. // @D changes to the default terminal text color. // -// TODO(wan@google.com): Write tests for this once we add stdout +// FIXME: Write tests for this once we add stdout // capturing to Google Test. static void PrintColorEncoded(const char* str) { GTestColor color = COLOR_DEFAULT; // The current color.
@@ -5187,24 +5750,25 @@ static const char kColorEncodedHelpMessage[] = " Enable/disable colored output. The default is @Gauto@D.\n" " -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" " Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" +" @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate an XML report in the given directory or with the given file\n" -" name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" -#if GTEST_CAN_STREAM_RESULTS_ +" Generate a JSON or XML report in the given directory or with the given\n" +" file name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" +# if GTEST_CAN_STREAM_RESULTS_ " @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" " Stream test results to the given server.\n" -#endif // GTEST_CAN_STREAM_RESULTS_ +# endif // GTEST_CAN_STREAM_RESULTS_ "\n" "Assertion Behavior:\n" -#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" " Set the default death test style.\n" -#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" " Turn assertion failures into debugger break-points.\n" " @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions.\n" +" Turn assertion failures into C++ exceptions for use by an external\n" +" test framework.\n" " @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" " Do not report exceptions as test failures.
Instead, allow them\n" " to crash the program or throw a pop-up (on Windows).\n"
@@ -5221,7 +5785,7 @@ static const char kColorEncodedHelpMessage[] = "(not one in your own code or tests), please report it to\n" "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; -bool ParseGoogleTestFlag(const char* const arg) { +static bool ParseGoogleTestFlag(const char* const arg) { return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, &GTEST_FLAG(also_run_disabled_tests)) || ParseBoolFlag(arg, kBreakOnFailureFlag,
@@ -5239,6 +5803,7 @@ bool ParseGoogleTestFlag(const char* const arg) { ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) || ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) || ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) || + ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) || ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) || ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) || ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
@@ -5251,14 +5816,11 @@ } #if GTEST_USE_OWN_FLAGFILE_FLAG_ -void LoadFlagsFromFile(const std::string& path) { +static void LoadFlagsFromFile(const std::string& path) { FILE* flagfile = posix::FOpen(path.c_str(), "r"); if (!flagfile) { - fprintf(stderr, - "Unable to open file \"%s\"\n", - GTEST_FLAG(flagfile).c_str()); - fflush(stderr); - exit(EXIT_FAILURE); + GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile) + << "\""; } std::string contents(ReadEntireFile(flagfile)); posix::FClose(flagfile);
@@ -5332,6 +5894,17 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { // other parts of Google Test. void ParseGoogleTestFlagsOnly(int* argc, char** argv) { ParseGoogleTestFlagsOnlyImpl(argc, argv); + + // Fix the value of *_NSGetArgc() on macOS, but iff + // *_NSGetArgv() == argv + // Only applicable to char** version of argv +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS + if (*_NSGetArgv() == argv) { + *_NSGetArgc() = *argc; + } +#endif +#endif } void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { ParseGoogleTestFlagsOnlyImpl(argc, argv);
@@ -5353,6 +5926,10 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { g_argvs.push_back(StreamableToString(argv[i])); } +#if GTEST_HAS_ABSL + absl::InitializeSymbolizer(g_argvs[0].c_str()); +#endif // GTEST_HAS_ABSL + ParseGoogleTestFlagsOnly(argc, argv); GetUnitTestImpl()->PostFlagParsingInit(); }
@@ -5386,4 +5963,45 @@ void InitGoogleTest(int* argc, wchar_t** argv) { #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } +std::string TempDir() { +#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) + return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); +#endif + +#if GTEST_OS_WINDOWS_MOBILE + return "\\temp\\"; +#elif GTEST_OS_WINDOWS + const char* temp_dir = internal::posix::GetEnv("TEMP"); + if (temp_dir == NULL || temp_dir[0] == '\0') + return "\\temp\\"; + else if (temp_dir[strlen(temp_dir) - 1] == '\\') + return temp_dir; + else + return std::string(temp_dir) + "\\"; +#elif GTEST_OS_LINUX_ANDROID + return "/sdcard/"; +#else + return "/tmp/"; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +void ScopedTrace::PushTrace(const char* file, int line, std::string message) { + internal::TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message.swap(message); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace() + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + } // namespace testing
diff --git a/libs/libvpx/third_party/googletest/src/src/gtest_main.cc b/libs/libvpx/third_party/googletest/src/src/gtest_main.cc index f302822552..2113f621e6 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest_main.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest_main.cc
@@ -28,11 +28,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <stdio.h> - #include "gtest/gtest.h" GTEST_API_ int main(int argc, char **argv) { - printf("Running main() from gtest_main.cc\n"); + printf("Running main() from %s\n", __FILE__); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); }
diff --git a/libs/libvpx/third_party/libwebm/Android.mk b/libs/libvpx/third_party/libwebm/Android.mk index 8149a083f4..b46ba101d4 100644 --- a/libs/libvpx/third_party/libwebm/Android.mk +++ b/libs/libvpx/third_party/libwebm/Android.mk
@@ -3,7 +3,7 @@ LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE:= libwebm LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat +LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11 LOCAL_C_INCLUDES:= $(LOCAL_PATH) LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
diff --git a/libs/libvpx/third_party/libwebm/README.libvpx b/libs/libvpx/third_party/libwebm/README.libvpx index ebb5ff2f4d..714f5d0eb5 100644 --- a/libs/libvpx/third_party/libwebm/README.libvpx +++ b/libs/libvpx/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 +Version: 81de00c43ea3c087b48a8c20337db7531b9f7612 License: BSD License File: LICENSE.txt
@@ -7,4 +7,14 @@ Description: libwebm is used to handle WebM container I/O. Local Changes: -* +Only keep: + - Android.mk + - AUTHORS.TXT + - common/ + file_util.cc/h + hdr_util.cc/h + webmids.h + - LICENSE.TXT + - mkvmuxer/ + - mkvparser/ + - PATENTS.TXT
diff --git a/libs/libvpx/third_party/libwebm/common/file_util.cc b/libs/libvpx/third_party/libwebm/common/file_util.cc index 6dab146dd9..6eb6428b98 100644 --- a/libs/libvpx/third_party/libwebm/common/file_util.cc +++ b/libs/libvpx/third_party/libwebm/common/file_util.cc
@@ -17,14 +17,15 @@ #include <cstdio> #include <cstdlib> #include <cstring> +#include <fstream> namespace libwebm { std::string GetTempFileName() { #if !defined _MSC_VER && !defined __MINGW32__ std::string temp_file_name_template_str = - std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") : - ".") + + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") + : ".") + "/libwebm_temp.XXXXXX"; char* temp_file_name_template = new char[temp_file_name_template_str.length() + 1];
@@ -41,7 +42,12 @@ std::string GetTempFileName() { return temp_file_name; #else char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + int err = (fname_pointer == &tmp_file_name[0]) ?
0 : -1; +#endif if (err == 0) { return std::string(tmp_file_name); } @@ -65,6 +71,15 @@ uint64_t GetFileSize(const std::string& file_name) { return file_size; } +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast<size_t>(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } TempFileDeleter::~TempFileDeleter() { diff --git a/libs/libvpx/third_party/libwebm/common/file_util.h b/libs/libvpx/third_party/libwebm/common/file_util.h index 0e71eac11e..a873734641 100644 --- a/libs/libvpx/third_party/libwebm/common/file_util.h +++ b/libs/libvpx/third_party/libwebm/common/file_util.h @@ -22,6 +22,9 @@ std::string GetTempFileName(); // Returns size of file specified by |file_name|, or 0 upon failure. uint64_t GetFileSize(const std::string& file_name); +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + // Manages life of temporary file specified at time of construction. Deletes // file upon destruction. class TempFileDeleter { @@ -38,4 +41,4 @@ class TempFileDeleter { } // namespace libwebm -#endif // LIBWEBM_COMMON_FILE_UTIL_H_ \ No newline at end of file +#endif // LIBWEBM_COMMON_FILE_UTIL_H_ diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.cc b/libs/libvpx/third_party/libwebm/common/hdr_util.cc index e1618ce75a..916f7170b6 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.cc +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.cc @@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, if (MasteringMetadataValuePresent(parser_mm.luminance_min)) muxer_mm->set_luminance_min(parser_mm.luminance_min); - PrimaryChromaticityPtr r_ptr(NULL); - PrimaryChromaticityPtr g_ptr(NULL); - PrimaryChromaticityPtr b_ptr(NULL); - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); + PrimaryChromaticityPtr g_ptr(nullptr); + PrimaryChromaticityPtr b_ptr(nullptr); + PrimaryChromaticityPtr wp_ptr(nullptr); if (parser_mm.r) { if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr)) diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.h b/libs/libvpx/third_party/libwebm/common/hdr_util.h index 3ef5388fd0..78e2eeb705 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.h +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.h @@ -47,15 +47,7 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif -typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic pop -#endif +typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/libs/libvpx/third_party/libwebm/common/webmids.h b/libs/libvpx/third_party/libwebm/common/webmids.h index 89d722a71b..fc0c208140 100644 --- a/libs/libvpx/third_party/libwebm/common/webmids.h +++ b/libs/libvpx/third_party/libwebm/common/webmids.h @@ -93,6 +93,7 @@ enum MkvId { kMkvDisplayHeight = 0x54BA, kMkvDisplayUnit = 0x54B2, kMkvAspectRatioType = 0x54B3, + kMkvColourSpace = 0x2EB524, kMkvFrameRate = 0x2383E3,
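The GetFileContents() helper introduced above pairs naturally with TempFileDeleter. A small usage sketch, not from the patch; it assumes TempFileDeleter exposes a name() accessor for the generated path:

#include <cstdint>
#include <cstdio>
#include <string>

#include "common/file_util.h"

int main() {
  libwebm::TempFileDeleter tmp;  // deletes the temp file at scope exit
  std::FILE* f = std::fopen(tmp.name().c_str(), "wb");  // name() assumed
  if (!f) return 1;
  std::fputs("webm", f);
  std::fclose(f);

  // GetFileContents() pre-sizes the string from GetFileSize(), so the
  // two lengths should agree after a successful read.
  std::string contents;
  if (!libwebm::GetFileContents(tmp.name(), &contents)) return 1;
  return static_cast<uint64_t>(contents.size()) ==
                 libwebm::GetFileSize(tmp.name())
             ? 0
             : 1;
}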
// end video // colour diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 15b9a908d8..5120312119 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvmuxer.h" +#include <stdint.h> + #include #include #include @@ -24,11 +26,6 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvmuxer { const float PrimaryChromaticity::kChromaticityMin = 0.0f; @@ -72,7 +69,7 @@ bool StrCpy(const char* src, char** dst_ptr) { return true; } -typedef std::auto_ptr<PrimaryChromaticity> PrimaryChromaticityPtr; +typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyChromaticity(const PrimaryChromaticity* src, PrimaryChromaticityPtr* dst) { if (!dst) return false; @@ -776,6 +773,14 @@ bool Track::Write(IMkvWriter* writer) const { if (!type_ || !codec_id_) return false; + // AV1 tracks require a CodecPrivate. See + // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md + // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to + // point to a stable version once it is finalized, or our own WebM mappings + // page on webmproject.org should we decide to release them. + if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_) + return false; + // |size| may be bigger than what is written out in this function because // derived classes may write out more data in the Track element. const uint64_t payload_size = PayloadSize(); @@ -1030,19 +1035,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } - if (r_ && - !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, - libwebm::kMkvPrimaryRChromaticityY)) { + if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY)) { return false; } - if (g_ && - !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, - libwebm::kMkvPrimaryGChromaticityY)) { + if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, - libwebm::kMkvPrimaryGChromaticityY)) { + libwebm::kMkvPrimaryGChromaticityY)) { return false; } - if (b_ && - !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, - libwebm::kMkvPrimaryBChromaticityY)) { + if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY)) { return false; } if (white_point_ && @@ -1057,22 +1059,22 @@ bool MasteringMetadata::SetChromaticity( const PrimaryChromaticity* r, const PrimaryChromaticity* g, const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { - PrimaryChromaticityPtr r_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); if (r) { if (!CopyChromaticity(r, &r_ptr)) return false; } - PrimaryChromaticityPtr g_ptr(NULL); + PrimaryChromaticityPtr g_ptr(nullptr); if (g) { if (!CopyChromaticity(g, &g_ptr)) return false; } - PrimaryChromaticityPtr b_ptr(NULL); + PrimaryChromaticityPtr b_ptr(nullptr); if (b) { if (!CopyChromaticity(b, &b_ptr)) return false; } - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr wp_ptr(nullptr); if (white_point) { if (!CopyChromaticity(white_point, &wp_ptr)) return false; @@ -1238,7 +1240,7 @@ bool Colour::Write(IMkvWriter* writer) const { } bool Colour::SetMasteringMetadata(const MasteringMetadata&
mastering_metadata) { - std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); + std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -1424,6 +1426,7 @@ VideoTrack::VideoTrack(unsigned int* seed) stereo_mode_(0), alpha_mode_(0), width_(0), + colour_space_(NULL), colour_(NULL), projection_(NULL) {} @@ -1521,6 +1524,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const { static_cast<uint64>(alpha_mode_))) return false; } + if (colour_space_) { + if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_)) + return false; + } if (frame_rate_ > 0.0) { if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate, static_cast<float>(frame_rate_))) { @@ -1545,8 +1552,24 @@ bool VideoTrack::Write(IMkvWriter* writer) const { return true; } +void VideoTrack::set_colour_space(const char* colour_space) { + if (colour_space) { + delete[] colour_space_; + + const size_t length = strlen(colour_space) + 1; + colour_space_ = new (std::nothrow) char[length]; // NOLINT + if (colour_space_) { +#ifdef _MSC_VER + strcpy_s(colour_space_, length, colour_space); +#else + strcpy(colour_space_, colour_space); +#endif + } + } +} + bool VideoTrack::SetColour(const Colour& colour) { - std::auto_ptr<Colour> colour_ptr(new Colour()); + std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -1574,7 +1597,7 @@ } bool VideoTrack::SetProjection(const Projection& projection) { - std::auto_ptr<Projection> projection_ptr(new Projection()); + std::unique_ptr<Projection> projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -1628,6 +1651,8 @@ uint64_t VideoTrack::VideoPayloadSize() const { if (frame_rate_ > 0.0) size += EbmlElementSize(libwebm::kMkvFrameRate, static_cast<float>(frame_rate_)); + if (colour_space_) + size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_); if (colour_) size += colour_->ColourSize(); if (projection_) @@ -1705,9 +1730,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const { const char Tracks::kOpusCodecId[] = "A_OPUS"; const char Tracks::kVorbisCodecId[] = "A_VORBIS"; +const char Tracks::kAv1CodecId[] = "V_AV1"; const char Tracks::kVp8CodecId[] = "V_VP8"; const char Tracks::kVp9CodecId[] = "V_VP9"; -const char Tracks::kVp10CodecId[] = "V_VP10"; const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS"; const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS"; const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA"; @@ -2666,7 +2691,7 @@ bool Cluster::QueueOrWriteFrame(const Frame* const frame) { // and write it if it is okay to do so (i.e.) no other track has an held back // frame with timestamp <= the timestamp of the frame in question. std::vector<std::list<Frame*>::iterator> frames_to_erase; - for (std::list<Frame*>::iterator + for (std::list<Frame*>::iterator current_track_iterator = stored_frames_[track_number].begin(), end = --stored_frames_[track_number].end(); current_track_iterator != end; ++current_track_iterator) { @@ -4168,8 +4193,8 @@ bool Segment::DocTypeIsWebm() const { // TODO(vigneshv): Tweak .clang-format.
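The ColourSpace support added above (the colour_space_ member, set_colour_space(), and the kMkvColourSpace write and size accounting) is driven from track setup. A hedged sketch of the intended call pattern; Segment::AddVideoTrack() and GetTrackByNumber() are existing libwebm muxer APIs, and the "bt709" value is illustrative:

#include <cstdint>

#include "mkvmuxer/mkvmuxer.h"

bool TagColourSpace(mkvmuxer::Segment* segment) {
  const uint64_t video_track = segment->AddVideoTrack(640, 360, /*number=*/1);
  if (!video_track) return false;
  mkvmuxer::VideoTrack* const track = static_cast<mkvmuxer::VideoTrack*>(
      segment->GetTrackByNumber(video_track));
  if (!track) return false;
  track->set_colour_space("bt709");  // copied; later written as ID 0x2EB524
  return track->colour_space() != nullptr;
}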
const char* kWebmCodecIds[kNumCodecIds] = { Tracks::kOpusCodecId, Tracks::kVorbisCodecId, - Tracks::kVp8CodecId, Tracks::kVp9CodecId, - Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId, + Tracks::kAv1CodecId, Tracks::kVp8CodecId, + Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId, Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, Tracks::kWebVttSubtitlesId}; diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h index 46b0029dc4..f2db377145 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -795,6 +795,8 @@ class VideoTrack : public Track { uint64_t alpha_mode() { return alpha_mode_; } void set_width(uint64_t width) { width_ = width; } uint64_t width() const { return width_; } + void set_colour_space(const char* colour_space); + const char* colour_space() const { return colour_space_; } Colour* colour() { return colour_; } @@ -824,6 +826,7 @@ class VideoTrack : public Track { uint64_t stereo_mode_; uint64_t alpha_mode_; uint64_t width_; + char* colour_space_; Colour* colour_; Projection* projection_; @@ -871,9 +874,9 @@ class Tracks { static const char kOpusCodecId[]; static const char kVorbisCodecId[]; + static const char kAv1CodecId[]; static const char kVp8CodecId[]; static const char kVp9CodecId[]; - static const char kVp10CodecId[]; static const char kWebVttCaptionsId[]; static const char kWebVttDescriptionsId[]; static const char kWebVttMetadataId[]; diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 355d4e22b3..7636a9f4ef 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -136,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, return false; } - if (!frame->is_key() && - !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, - reference_block_timestamp)) { + if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, + reference_block_timestamp)) { return false; } @@ -563,10 +562,10 @@ uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode) return 0; - return frame->CanBeSimpleBlock() ? - WriteSimpleBlock(writer, frame, relative_timecode) : - WriteBlock(writer, frame, relative_timecode, - cluster->timecode_scale()); + return frame->CanBeSimpleBlock() + ? WriteSimpleBlock(writer, frame, relative_timecode) + : WriteBlock(writer, frame, relative_timecode, + cluster->timecode_scale()); } uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 132388da59..3355428bd1 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -31,6 +31,9 @@ const int64 kMaxBlockTimecode = 0x07FFFLL; // Writes out |value| in Big Endian order. Returns 0 on success. int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size); +// Writes out |f| in Big Endian order. Returns 0 on success. +int32 SerializeFloat(IMkvWriter* writer, float f); + // Returns the size in bytes of the element. 
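On the SerializeFloat() declaration just added to mkvmuxerutil.h: EBML stores FLOAT elements in big-endian byte order. A minimal sketch of what such a serializer entails, assuming IEEE-754 single precision; this is not libwebm's definition, which writes through IMkvWriter:

#include <cstdint>
#include <cstring>

// Emit the 4 bytes of |f| most-significant first, as EBML FLOAT expects.
void FloatToBigEndianBytes(float f, uint8_t out[4]) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);  // type-pun safely via memcpy
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<uint8_t>(bits >> (8 * (3 - i)));
}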
int32 GetUIntSize(uint64 value); int32 GetIntSize(int64 value); diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index 84655d802a..d668384d85 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -78,6 +78,8 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); +#elif defined(_WIN32) + return fseeko64(file_, static_cast<off_t>(position), SEEK_SET); #else return fseeko(file_, static_cast<off_t>(position), SEEK_SET); #endif diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index 37f230d0a9..ace65bd595 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,12 +22,8 @@ #include "common/webmids.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvparser { +const long long kStringElementSizeLimit = 20 * 1000 * 1000; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; const float Projection::kValueNotPresent = FLT_MAX; @@ -40,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); } inline bool isinf(double val) { return std::isinf(val); } #endif // MSC_COMPAT -IMkvReader::~IMkvReader() {} - template <typename Type> Type* SafeArrayAlloc(unsigned long long num_elements, unsigned long long element_size) { @@ -330,7 +324,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size, delete[] str; str = NULL; - if (size >= LONG_MAX || size < 0) + if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit) return E_FILE_FORMAT_INVALID; // +1 for '\0' terminator @@ -4236,6 +4230,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, new (std::nothrow) ContentEncryption*[encryption_count]; if (!encryption_entries_) { delete[] compression_entries_; + compression_entries_ = NULL; return -1; } encryption_entries_end_ = encryption_entries_; @@ -4267,6 +4262,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, delete compression; return status; } + assert(compression_count > 0); *compression_entries_end_++ = compression; } else if (id == libwebm::kMkvContentEncryption) { ContentEncryption* const encryption = @@ -4279,6 +4275,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, delete encryption; return status; } + assert(encryption_count > 0); *encryption_entries_end_++ = encryption; } @@ -4331,6 +4328,12 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size, return status; } + // There should be only one settings element per content compression.
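The parser hardening in this part of the patch follows one idiom: bound the value up front (the new 20 MB kStringElementSizeLimit above) or test against the numeric limit before the arithmetic that could overflow (the Block timecode guards further below). A distilled sketch of the two overflow guards:

#include <climits>

// Mirrors Block::GetTimeCode(): refuse base + delta when it cannot fit.
long long CheckedAdd(long long base, long long delta) {
  if (base < 0 || LLONG_MAX - base < delta)
    return -1;
  return base + delta;
}

// Mirrors Block::GetTime(): refuse tc * scale when it would overflow.
long long CheckedScale(long long tc, long long scale) {
  if (tc != 0 && scale > LLONG_MAX / tc)
    return -1;
  return tc * scale;
}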
+ if (compression->settings != NULL) { + delete[] buf; + return E_FILE_FORMAT_INVALID; + } + compression->settings = buf; compression->settings_len = buflen; } @@ -5015,7 +5018,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, if (!reader || *mm) return false; - std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); + std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -5035,6 +5038,10 @@ double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_max = static_cast<float>(value); if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 || mm_ptr->luminance_max > 9999.99) { @@ -5044,6 +5051,10 @@ double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_min = static_cast<float>(value); if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 || mm_ptr->luminance_min > 999.9999) { @@ -5096,7 +5107,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, if (!reader || *colour) return false; - std::auto_ptr<Colour> colour_ptr(new Colour()); + std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -5194,7 +5205,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, if (!reader || *projection) return false; - std::auto_ptr<Projection> projection_ptr(new Projection()); + std::unique_ptr<Projection> projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -5270,6 +5281,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, VideoTrack::VideoTrack(Segment* pSegment, long long element_start, long long element_size) : Track(pSegment, element_start, element_size), + m_colour_space(NULL), m_colour(NULL), m_projection(NULL) {} @@ -5295,6 +5307,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, long long stereo_mode = 0; double rate = 0.0; + char* colour_space = NULL; IMkvReader* const pReader = pSegment->m_pReader; @@ -5307,8 +5320,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, const long long stop = pos + s.size; - Colour* colour = NULL; - Projection* projection = NULL; + std::unique_ptr<Colour> colour_ptr; + std::unique_ptr<Projection> projection_ptr; while (pos < stop) { long long id, size; @@ -5357,11 +5370,23 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, if (rate <= 0) return E_FILE_FORMAT_INVALID; } else if (id == libwebm::kMkvColour) { - if (!Colour::Parse(pReader, pos, size, &colour)) + Colour* colour = NULL; + if (!Colour::Parse(pReader, pos, size, &colour)) { return E_FILE_FORMAT_INVALID; + } else { + colour_ptr.reset(colour); + } } else if (id == libwebm::kMkvProjection) { - if (!Projection::Parse(pReader, pos, size, &projection)) + Projection* projection = NULL; + if (!Projection::Parse(pReader, pos, size, &projection)) { return E_FILE_FORMAT_INVALID; + } else { + projection_ptr.reset(projection); + } + } else if (id == libwebm::kMkvColourSpace) { + const long status = UnserializeString(pReader, pos, size, colour_space); + if (status < 0) + return status; } pos += size; // consume payload @@ -5392,8 +5417,9 @@
pTrack->m_display_unit = display_unit; pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; - pTrack->m_colour = colour; - pTrack->m_projection = projection; + pTrack->m_colour = colour_ptr.release(); + pTrack->m_colour_space = colour_space; + pTrack->m_projection = projection_ptr.release(); pResult = pTrack; return 0; // success @@ -7903,6 +7929,10 @@ long Block::Parse(const Cluster* pCluster) { return E_FILE_FORMAT_INVALID; curr.len = static_cast<long>(frame_size); + // Check if size + curr.len could overflow. + if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } size += curr.len; // contribution of this frame --frame_count; @@ -7964,6 +7994,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const { const long long tc0 = pCluster->GetTimeCode(); assert(tc0 >= 0); + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + const long long tc = tc0 + m_timecode; return tc; // unscaled timecode units @@ -7981,6 +8016,10 @@ long long Block::GetTime(const Cluster* pCluster) const { const long long scale = pInfo->GetTimeCodeScale(); assert(scale >= 1); + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } const long long ns = tc * scale; return ns; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h index 26c2b7e5eb..848d01f03e 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h @@ -22,7 +22,7 @@ class IMkvReader { virtual int Length(long long* total, long long* available) = 0; protected: - virtual ~IMkvReader(); + virtual ~IMkvReader() {} }; template <typename Type> @@ -527,6 +527,8 @@ class VideoTrack : public Track { Projection* GetProjection() const; + const char* GetColourSpace() const { return m_colour_space; } + private: long long m_width; long long m_height; @@ -534,7 +536,7 @@ long long m_display_height; long long m_display_unit; long long m_stereo_mode; - + char* m_colour_space; double m_rate; Colour* m_colour; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index 23d68f5089..9d19c1be56 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -118,6 +118,8 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error +#elif defined(_WIN32) + fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET); #else fseeko(m_file, static_cast<off_t>(offset), SEEK_SET); #endif diff --git a/libs/libvpx/third_party/libyuv/LICENSE b/libs/libvpx/third_party/libyuv/LICENSE new file mode 100644 index 0000000000..c911747a6b --- /dev/null +++ b/libs/libvpx/third_party/libyuv/LICENSE @@ -0,0 +1,29 @@ +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution.
+ + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libs/libvpx/third_party/libyuv/README.libvpx b/libs/libvpx/third_party/libyuv/README.libvpx index 485f79c0ff..9519dc4bee 100644 --- a/libs/libvpx/third_party/libyuv/README.libvpx +++ b/libs/libvpx/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv -Version: de944ed8c74909ea6fbd743a22efe1e55e851b83 +Version: a37e7bfece9e0676ae90a1700b0ec85b0f4f22a1 License: BSD License File: LICENSE @@ -8,15 +8,16 @@ Description: libyuv is an open source project that includes YUV conversion and scaling functionality. -The optimized scaler in libyuv is used in multiple resolution encoder example, -which down-samples the original input video (f.g. 1280x720) a number of times -in order to encode multiple resolution bit streams. +The optimized scaler in libyuv is used in the multiple resolution encoder +example which down-samples the original input video (f.g. 1280x720) a number of +times in order to encode multiple resolution bit streams. Local Modifications: -rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \ - LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \ - all.gyp build_overrides/ chromium/ codereview.settings docs/ \ - download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \ - include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \ - libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \ - third_party/ tools/ unit_test/ util/ winarm.mk +Disable ARGBToRGB24Row_AVX512VBMI due to build failure on Mac. +rm libyuv/include/libyuv.h libyuv/include/libyuv/compare_row.h +mv libyuv/include tmp/ +mv libyuv/source tmp/ +mv libyuv/LICENSE tmp/ +rm -rf libyuv + +mv tmp/* third_party/libyuv/ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h b/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h index 54a2181430..01d9dfc773 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h @@ -8,82 +8,36 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ #define INCLUDE_LIBYUV_BASIC_TYPES_H_ -#include <stddef.h> // for NULL, size_t +#include <stddef.h> // For size_t and NULL + +#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) +#define INT_TYPES_DEFINED #if defined(_MSC_VER) && (_MSC_VER < 1600) #include <sys/types.h> // for uintptr_t on x86 +typedef unsigned __int64 uint64_t; +typedef __int64 int64_t; +typedef unsigned int uint32_t; +typedef int int32_t; +typedef unsigned short uint16_t; +typedef short int16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; #else -#include <stdint.h> // for uintptr_t -#endif - -#ifndef GG_LONGLONG -#ifndef INT_TYPES_DEFINED -#define INT_TYPES_DEFINED -#ifdef COMPILER_MSVC -typedef unsigned __int64 uint64; -typedef __int64 int64; -#ifndef INT64_C -#define INT64_C(x) x ## I64 -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## UI64 -#endif -#define INT64_F "I64" -#else // COMPILER_MSVC -#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) -typedef unsigned long uint64; // NOLINT -typedef long int64; // NOLINT -#ifndef INT64_C -#define INT64_C(x) x ## L -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## UL -#endif -#define INT64_F "l" -#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) -typedef unsigned long long uint64; // NOLINT -typedef long long int64; // NOLINT -#ifndef INT64_C -#define INT64_C(x) x ## LL -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## ULL -#endif -#define INT64_F "ll" -#endif // __LP64__ -#endif // COMPILER_MSVC -typedef unsigned int uint32; -typedef int int32; -typedef unsigned short uint16; // NOLINT -typedef short int16; // NOLINT -typedef unsigned char uint8; -typedef signed char int8; +#include <stdint.h> // for uintptr_t and C99 types +#endif // defined(_MSC_VER) && (_MSC_VER < 1600) +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint16_t uint16; +typedef int16_t int16; +typedef uint8_t uint8; +typedef int8_t int8; #endif // INT_TYPES_DEFINED -#endif // GG_LONGLONG - -// Detect compiler is for x86 or x64. -#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) -#define CPU_X86 1 -#endif -// Detect compiler is for ARM. -#if defined(__arm__) || defined(_M_ARM) -#define CPU_ARM 1 -#endif - -#ifndef ALIGNP -#ifdef __cplusplus -#define ALIGNP(p, t) \ - (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ - ((t) - 1)) & ~((t) - 1)))) -#else -#define ALIGNP(p, t) \ - ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ -#endif -#endif #if !defined(LIBYUV_API) #if defined(_WIN32) || defined(__CYGWIN__) @@ -95,24 +49,17 @@ typedef signed char int8; #define LIBYUV_API #endif // LIBYUV_BUILDING_SHARED_LIBRARY #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ - (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ - defined(LIBYUV_USING_SHARED_LIBRARY)) -#define LIBYUV_API __attribute__ ((visibility ("default"))) + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__((visibility("default"))) #else #define LIBYUV_API #endif // __GNUC__ #endif // LIBYUV_API +// TODO(fbarchard): Remove bool macros. #define LIBYUV_BOOL int #define LIBYUV_FALSE 0 #define LIBYUV_TRUE 1 -// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) || \ - defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define LIBYUV_LITTLE_ENDIAN -#endif - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/compare.h b/libs/libvpx/third_party/libyuv/include/libyuv/compare.h index 08b2bb2ecf..3353ad71c6 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/compare.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/compare.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_COMPARE_H_ #define INCLUDE_LIBYUV_COMPARE_H_ #include "libyuv/basic_types.h" @@ -20,59 +20,92 @@ extern "C" { // Compute a hash for specified memory. Seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); + +// Hamming Distance +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count); // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height); // Sum Square Error - used to compute Mean Square Error or PSNR. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, - const uint8* src_b, int count); +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count); LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); static const int kMaxPsnr = 128; LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count); +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int 
stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT +#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert.h index fcfcf544e1..d12ef24f79 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" @@ -16,8 +16,8 @@ #include "libyuv/rotate.h" // For enum RotationMode. // TODO(fbarchard): fix WebRTC source to include following libyuv headers: -#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 -#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 +#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 +#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 #include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 #ifdef __cplusplus @@ -27,195 +27,335 @@ extern "C" { // Convert I444 to I420. LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert I422 to I420. LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert I411 to I420. -LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Copy I420 to I420. 
#define I420ToI420 I420Copy LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I010 to I010 +#define I010ToI010 I010Copy +#define H010ToH010 I010Copy +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert 10 bit YUV to 8 bit +#define H010ToH420 I010ToI420 +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert I400 (grey) to I420. LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); #define J400ToJ420 I400ToI420 // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert NV21 to I420. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert UYVY to I420. 
LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert Android420 to I420. LIBYUV_API -int Android420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int pixel_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // ARGB little endian (bgra in memory) to I420. LIBYUV_API -int ARGBToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // BGRA little endian (argb in memory) to I420. LIBYUV_API -int BGRAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // ABGR little endian (rgba in memory) to I420. LIBYUV_API -int ABGRToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGBA little endian (abgr in memory) to I420. LIBYUV_API -int RGBAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB little endian (bgr in memory) to I420. 
LIBYUV_API -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB big endian (rgb in memory) to I420. LIBYUV_API -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB16 (RGBP fourcc) little endian to I420. LIBYUV_API -int RGB565ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB15 (RGBO fourcc) little endian to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB12 (R444 fourcc) little endian to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture. // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToI420(const uint8* sample, size_t sample_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height); // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height); +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height); #endif // Convert camera sample to I420 with cropping, rotation and vertical flip. @@ -238,22 +378,29 @@ int MJPGSize(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
LIBYUV_API -int ConvertToI420(const uint8* src_frame, size_t src_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 format); + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h index 19672f3269..ab772b6c32 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #include "libyuv/basic_types.h" @@ -30,258 +30,621 @@ extern "C" { // Copy ARGB to ARGB. LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Duplicate prototype for function in convert_from.h for remoting. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. 
+LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I420 with Alpha to preattenuated ARGB. LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate); +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate); // Convert I420 with Alpha to preattenuated ABGR. 
LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate); +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J400 (jpeg grey) to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Alias. #define YToARGB I400ToARGB // Convert NV12 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert NV21 to ARGB. LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to ABGR. +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV21 to ABGR. +LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert UYVY to ARGB. 
LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ABGR. LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H422 to ABGR. 
LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); // BGRA little endian (argb in memory) to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // ABGR little endian (rgba in memory) to ARGB. LIBYUV_API -int ABGRToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGBA little endian (abgr in memory) to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Deprecated function name. #define BG24ToARGB RGB24ToARGB // RGB little endian (bgr in memory) to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB big endian (rgb in memory) to ARGB. LIBYUV_API -int RAWToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB16 (RGBP fourcc) little endian to ARGB. 
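The new 10-bit entry points take uint16_t planes (one sample per element, low 10 bits significant) and can emit AR30/AB30, packed 2:10:10:10 little-endian formats. A minimal sketch of I010ToAR30, on the assumption that strides for the 16-bit planes are counted in uint16_t elements (the plane pointers are uint16_t*, so row stepping is element-wise) and that the frame has even dimensions:

#include <stdint.h>
#include "libyuv/convert_argb.h"

/* Illustrative: 10-bit 4:2:0 (lower 10 bits of each uint16_t) to AR30. */
int i010_to_ar30(const uint16_t* y, const uint16_t* u, const uint16_t* v,
                 uint8_t* ar30, int width, int height) {
  int half_w = (width + 1) / 2;
  return I010ToAR30(y, width,          /* luma stride, in elements    */
                    u, half_w,         /* half-resolution chroma      */
                    v, half_w,
                    ar30, width * 4,   /* AR30 is 4 bytes per pixel   */
                    width, height);
}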
LIBYUV_API -int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB15 (RGBO fourcc) little endian to ARGB. LIBYUV_API -int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB12 (R444 fourcc) little endian to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Aliases +#define AB30ToARGB AR30ToABGR +#define AB30ToABGR AR30ToARGB +#define AB30ToAR30 AR30ToAB30 + +// Convert AR30 To ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR30 To ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert AR30 To AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToARGB(const uint8* sample, size_t sample_size, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height); #endif +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + // Convert camera sample to ARGB with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. +// "sample_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. // Normally this would be the same as dst_width, with recommended alignment // to 16 bytes for better efficiency. @@ -300,20 +663,25 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
LIBYUV_API -int ConvertToARGB(const uint8* src_frame, size_t src_size, - uint8* dst_argb, int dst_stride_argb, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 format); + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h index 39e1578a0e..5cd8a4bfc0 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_H_ #include "libyuv/basic_types.h" @@ -21,159 +21,322 @@ extern "C" { // See Also convert.h for conversions from formats to I420. -// I420Copy in convert to I420ToI420. +// Convert 8 bit YUV to 10 bit. +#define H420ToH010 I420ToI010 +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
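ConvertToARGB above is the catch-all entry point: it parses the fourcc, crops pre-rotation, then rotates into the ARGB destination. A sketch that crops a centered 640x360 window out of a 1280x720 YUY2 capture and rotates it 90 degrees; the rotate.h and video_common.h includes for RotationMode and FOURCC_YUY2 are assumed to be the usual companion headers in this tree:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert_argb.h"
#include "libyuv/rotate.h"        /* enum RotationMode */
#include "libyuv/video_common.h"  /* FOURCC_YUY2       */

/* Illustrative: because the crop is applied before rotation, the rotated
   output is 360x640, hence the 360 * 4 byte destination stride. */
int crop_rotate_yuy2(const uint8_t* sample, size_t sample_size,
                     uint8_t* dst_argb) {
  return ConvertToARGB(sample, sample_size, dst_argb, 360 * 4,
                       320, 180,   /* crop_x, crop_y          */
                       1280, 720,  /* full source dimensions  */
                       640, 360,   /* crop_width, crop_height */
                       kRotate90, FOURCC_YUY2);
}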
LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int 
dst_stride_rgba, - int width, int height); +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. 
LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - const uint8* dither4x4, int width, int height); +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, - uint32 format); +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h index 1df53200dd..05c815a093 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,170 +21,267 @@ extern "C" { // Copy ARGB to ARGB. 
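ConvertFromI420 is the symmetric catch-all on the output side. A sketch emitting packed YUY2 through it, using the contiguous-rows shortcut documented above (dst_sample_stride of 0); FOURCC_YUY2 is assumed to come from video_common.h:

#include <stdint.h>
#include "libyuv/convert_from.h"
#include "libyuv/video_common.h"  /* FOURCC_YUY2 */

/* Illustrative: one packed YUY2 frame via the generic dispatcher. */
int i420_to_yuy2_sample(const uint8_t* y, int sy, const uint8_t* u, int su,
                        const uint8_t* v, int sv, uint8_t* dst,
                        int width, int height) {
  return ConvertFromI420(y, sy, u, su, v, sv, dst, 0 /* contiguous rows */,
                         width, height, FOURCC_YUY2);
}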
#define ARGBToARGB ARGBCopy LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert ARGB To BGRA. LIBYUV_API -int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); +int ARGBToBGRA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); // Convert ARGB To ABGR. LIBYUV_API -int ARGBToABGR(const uint8* src_argb, int src_stride_argb, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int ARGBToABGR(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert ARGB To RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Aliases +#define ARGBToAB30 ABGRToAR30 +#define ABGRToAB30 ARGBToAR30 + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height); +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); // Convert ARGB To RGB565. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. // TODO(fbarchard): Consider pointer to 2d array for dither4x4. -// const uint8(*dither)[4][4]; +// const uint8_t(*dither)[4][4]; LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height); +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height); +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); // Convert ARGB To ARGB4444. 
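The dither variant takes a 16-byte matrix, upper-left byte first, with entries in the recommended 0..7 range. A sketch using one common 4x4 ordered-dither (Bayer) pattern halved into that range; the matrix choice and helper name are illustrative, not canonical:

#include <stdint.h>
#include "libyuv/convert_from_argb.h"

/* A 4x4 Bayer pattern scaled into the 0..7 range recommended above;
   the first byte is the upper-left cell. */
static const uint8_t kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};

/* Illustrative: packed ARGB in, dithered RGB565 out. */
int argb_to_rgb565_dithered(const uint8_t* argb, uint8_t* rgb565,
                            int width, int height) {
  return ARGBToRGB565Dither(argb, width * 4, rgb565, width * 2,
                            kDither4x4, width, height);
}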
LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height); +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); // Convert ARGB To I444. LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I422. LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I420. (also in convert.h) LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J422. LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert ARGB To I411. -LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J400. (JPeg full range). LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height); +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Convert ARGB to G. 
(Reverse of J400toARGB, which replicates G back to ARGB) LIBYUV_API -int ARGBToG(const uint8* src_argb, int src_stride_argb, - uint8* dst_g, int dst_stride_g, - int width, int height); +int ARGBToG(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_g, + int dst_stride_g, + int width, + int height); // Convert ARGB To NV12. LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height); +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); // Convert ARGB To UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height); +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h b/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h index dfb7445e2f..0229cb5e73 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ #define INCLUDE_LIBYUV_CPU_ID_H_ #include "libyuv/basic_types.h" @@ -31,50 +31,89 @@ static const int kCpuHasX86 = 0x10; static const int kCpuHasSSE2 = 0x20; static const int kCpuHasSSSE3 = 0x40; static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasSSE42 = 0x100; // unused at this time. static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasAVX3 = 0x2000; -// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. 
+static const int kCpuHasF16C = 0x2000; +static const int kCpuHasGFNI = 0x4000; +static const int kCpuHasAVX512BW = 0x8000; +static const int kCpuHasAVX512VL = 0x10000; +static const int kCpuHasAVX512VBMI = 0x20000; +static const int kCpuHasAVX512VBMI2 = 0x40000; +static const int kCpuHasAVX512VBITALG = 0x80000; +static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x10000; -static const int kCpuHasDSPR2 = 0x20000; +static const int kCpuHasMIPS = 0x200000; +static const int kCpuHasMSA = 0x400000; -// Internal function used to auto-init. +// Optional init function. TestCpuFlag does an auto-init. +// Returns cpu_info flags. LIBYUV_API int InitCpuFlags(void); +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// Returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); +#else + int cpu_info = cpu_info_; +#endif + return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; +} + // Internal function for parsing /proc/cpuinfo. LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); -// Detect CPU has SSE2 etc. -// Test_flag parameter should be one of kCpuHas constants above. -// returns non-zero if instruction set is detected -static __inline int TestCpuFlag(int test_flag) { - LIBYUV_API extern int cpu_info_; - return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag; -} - // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. // MaskCpuFlags(-1) to enable all cpu specific optimizations. // MaskCpuFlags(1) to disable all cpu specific optimizations. +// MaskCpuFlags(0) to reset state so next call will auto init. +// Returns cpu_info flags. LIBYUV_API -void MaskCpuFlags(int enable_flags); +int MaskCpuFlags(int enable_flags); + +// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| +// should be a valid combination of the kCpuHas constants above and include +// kCpuInitialized. Use this method when running in a sandboxed process where +// the detection code might fail (as it might access /proc/cpuinfo). In such +// cases the cpu_info can be obtained from a non sandboxed process by calling +// InitCpuFlags() and passed to the sandboxed process (via command line +// parameters, IPC...) which can then call this method to initialize the CPU +// flags. +// Notes: +// - when specifying 0 for |cpu_flags|, the auto initialization is enabled +// again. +// - enabling CPU features that are not supported by the CPU will result in +// undefined behavior. +// TODO(fbarchard): consider writing a helper function that translates from +// other library CPU info to libyuv CPU info and add a .md doc that explains +// CPU detection. +static __inline void SetCpuFlags(int cpu_flags) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); +#else + cpu_info_ = cpu_flags; +#endif +} // Low level cpuid for X86. Returns zeros on other CPUs. // eax is the info type that you want. // ecx is typically the cpu number, and should normally be zero. 
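The reworked flag API is worth a compact usage sketch: TestCpuFlag auto-initializes on first use, MaskCpuFlags now returns the resulting flags, and SetCpuFlags covers sandboxed processes that cannot read /proc/cpuinfo. Here flags_from_broker stands in for a value a privileged process obtained from InitCpuFlags():

#include "libyuv/cpu_id.h"

/* Illustrative tour of the CPU-flag API. */
void cpu_flag_usage(int flags_from_broker) {
  if (TestCpuFlag(kCpuHasSSSE3)) {
    /* libyuv will pick SSSE3 fast paths internally. */
  }
  MaskCpuFlags(~kCpuHasAVX2);     /* e.g. rule out AVX2 paths for an A/B test */
  MaskCpuFlags(-1);               /* re-enable everything detected            */
  SetCpuFlags(flags_from_broker); /* sandboxed child: adopt injected flags    */
  SetCpuFlags(0);                 /* back to auto-init on next TestCpuFlag    */
}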
LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); +void CpuId(int info_eax, int info_ecx, int* cpu_info); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h b/libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h new file mode 100644 index 0000000000..bba0e8aeda --- /dev/null +++ b/libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h @@ -0,0 +1,233 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ +#define INCLUDE_LIBYUV_MACROS_MSA_H_ + +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include <msa.h> +#include <stdint.h> + +#if (__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ + uint32_t val_m; \ + asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint64_t val_m = 0; \ + asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#if (__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint64_t val_m = (val); \ + asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ + : [pdst_sd_m] "=m"(*pdst_sd_m) \ + : [val_m] "r"(val_m)); \ + }) +#else // !(__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // !(__mips == 64) +#else // !(__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ + uint32_t val_m; \ + asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint64_t val_m = 0; \ + asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint32_t val0_m, val1_m; \ +
uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // (__mips_isa_rev >= 6) + +// TODO(fbarchard): Consider removing __VAR_ARGS versions. +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ +#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) + +/* Description : Load two vectors with 16 'byte' sized elements + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) + +/* Description : Store two vectors with stride each having 16 'byte' sized + elements + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) + +// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. 
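The paired load/store macros above read and write two vectors at consecutive strided addresses. A MIPS-only sketch, guarded the same way as this header, that copies two 16-byte rows with LD_UB2/ST_UB2 (v16u8 is the MSA vector type from msa.h); untested outside an MSA toolchain:

#include <stdint.h>
#include "libyuv/macros_msa.h"

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
/* Illustrative: LD_UB2 loads from (src) and (src + stride);
   ST_UB2 stores to (dst) and (dst + stride). */
static void copy_two_rows_16(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride) {
  v16u8 row0, row1;
  LD_UB2(src, src_stride, row0, row1);
  ST_UB2(row0, row1, dst, dst_stride);
}
#endif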
+/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) + +#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ + +#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h index 8423121d11..275f8d4c18 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ #define INCLUDE_LIBYUV_MJPEG_DECODER_H_ #include "libyuv/basic_types.h" @@ -26,25 +26,24 @@ namespace libyuv { extern "C" { #endif -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); #ifdef __cplusplus } // extern "C" #endif -static const uint32 kUnknownDataSize = 0xFFFFFFFF; +static const uint32_t kUnknownDataSize = 0xFFFFFFFF; enum JpegSubsamplingType { kJpegYuv420, kJpegYuv422, - kJpegYuv411, kJpegYuv444, kJpegYuv400, kJpegUnknown }; struct Buffer { - const uint8* data; + const uint8_t* data; int len; }; @@ -66,7 +65,7 @@ struct SetJmpErrorMgr; class LIBYUV_API MJpegDecoder { public: typedef void (*CallbackFunction)(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows); @@ -86,7 +85,7 @@ class LIBYUV_API MJpegDecoder { // If return value is LIBYUV_TRUE, then the values for all the following // getters are populated. // src_len is the size of the compressed mjpeg frame in bytes. - LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); // Returns width of the last loaded frame in pixels. int GetWidth(); @@ -139,18 +138,22 @@ class LIBYUV_API MJpegDecoder { // at least GetComponentSize(i). The pointers in planes are incremented // to point to after the end of the written data. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); // Decodes the entire image and passes the data via repeated calls to a // callback function. Each call will get the data for a whole number of // image scanlines. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. 
- LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height); + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height); // The helper function which recognizes the jpeg sub-sampling type. static JpegSubsamplingType JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components); + int* subsample_x, + int* subsample_y, + int number_of_components); private: void AllocOutputBuffers(int num_outbufs); @@ -159,7 +162,7 @@ class LIBYUV_API MJpegDecoder { LIBYUV_BOOL StartDecode(); LIBYUV_BOOL FinishDecode(); - void SetScanlinePointers(uint8** data); + void SetScanlinePointers(uint8_t** data); LIBYUV_BOOL DecodeImcuRow(); int GetComponentScanlinePadding(int component); @@ -178,15 +181,15 @@ class LIBYUV_API MJpegDecoder { // Temporaries used to point to scanline outputs. int num_outbufs_; // Outermost size of all arrays below. - uint8*** scanlines_; + uint8_t*** scanlines_; int* scanlines_sizes_; // Temporary buffer used for decoding when we can't decode directly to the // output buffers. Large enough for just one iMCU row. - uint8** databuf_; + uint8_t** databuf_; int* databuf_strides_; }; } // namespace libyuv #endif // __cplusplus -#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h index 9662516c57..91137baba2 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #include "libyuv/basic_types.h" @@ -22,449 +22,10 @@ namespace libyuv { extern "C" { #endif -// Copy a plane of data. -LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); - -LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height); - -// Set a plane of data to a 32 bit value. -LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - uint32 value); - -// Split interleaved UV plane into separate U and V planes. -LIBYUV_API -void SplitUVPlane(const uint8* src_uv, int src_stride_uv, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Merge separate U and V planes into one interleaved UV plane. -LIBYUV_API -void MergeUVPlane(const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -// Copy I400. Supports inverting. -LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); - -#define J400ToJ400 I400ToI400 - -// Copy I422 to I422. -#define I422ToI422 I422Copy -LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Copy I444 to I444. 
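ValidateJpeg pairs naturally with the MJPGToARGB declaration in convert_argb.h above: probe the sample before decoding it. A sketch for HAVE_JPEG builds that keeps source and destination the same size, i.e. no clipping; in practice the source dimensions would come from the capture device, and the helper name is illustrative:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert_argb.h"   /* MJPGToARGB, HAVE_JPEG builds only */
#include "libyuv/mjpeg_decoder.h"  /* ValidateJpeg                      */

#ifdef HAVE_JPEG
/* Illustrative: reject truncated or non-JPEG samples before decoding. */
int mjpg_sample_to_argb(const uint8_t* sample, size_t sample_size,
                        uint8_t* dst_argb, int width, int height) {
  if (!ValidateJpeg(sample, sample_size)) return -1;
  return MJPGToARGB(sample, sample_size, dst_argb, width * 4,
                    width, height,   /* src_width, src_height */
                    width, height);  /* dst_width, dst_height */
}
#endif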
-#define I444ToI444 I444Copy -LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert YUY2 to I422. -LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert UYVY to I422. -LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -// Convert I420 to I400. (calls CopyPlane ignoring u/v). -LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height); - -// Alias -#define J420ToJ400 I420ToI400 -#define I420ToI420Mirror I420Mirror - -// I420 mirror. -LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Alias -#define I400ToI400Mirror I400Mirror - -// I400 mirror. A single plane is mirrored horizontally. -// Pass negative height to achieve 180 degree rotation. -LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); - -// Alias -#define ARGBToARGBMirror ARGBMirror - -// ARGB mirror. -LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Convert NV12 to RGB565. -LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); - -// I422ToARGB is in convert_argb.h -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); - -// Alias -#define RGB24ToRAW RAWToRGB24 - -LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); - -// Draw a rectangle into I420. 
-LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, int width, int height, - int value_y, int value_u, int value_v); - -// Draw a rectangle into ARGB. -LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height, uint32 value); - -// Convert ARGB to gray scale ARGB. -LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Make a rectangle of ARGB gray scale. -LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); - -// Make a rectangle of ARGB Sepia tone. -LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); - -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The next 4 coefficients apply to B, G, R, A and produce R of the output. -// The last 4 coefficients apply to B, G, R, A and produce A of the output. -LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int width, int height); - -// Deprecated. Use ARGBColorMatrix instead. -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The last 4 coefficients apply to B, G, R, A and produce R of the output. -LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, - int x, int y, int width, int height); - -// Apply a color table each ARGB pixel. -// Table contains 256 ARGB values. -LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int x, int y, int width, int height); - -// Apply a color table each ARGB pixel but preserve destination alpha. -// Table contains 256 ARGB values. -LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int x, int y, int width, int height); - -// Apply a luma/color table each ARGB pixel but preserve destination alpha. -// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from -// RGB (YJ style) and C is an 8 bit color component (R, G or B). -LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const uint8* luma_rgb_table, - int width, int height); - -// Apply a 3 term polynomial to ARGB values. -// poly points to a 4x4 matrix. The first row is constants. The 2nd row is -// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, -// g squared, r squared and a squared. The 4rd row is coefficients for b to -// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and -// result clamped to 0 to 255. -// A polynomial approximation can be dirived using software such as 'R'. - -LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const float* poly, - int width, int height); - -// Quantize a rectangle of ARGB. Alpha unaffected. 
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535. -// interval_size should be a value between 1 and 255. -// interval_offset should be a value between 0 and 255. -LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int x, int y, int width, int height); - -// Copy ARGB to ARGB. -LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Copy Alpha channel of ARGB to alpha of ARGB. -LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Extract the alpha channel from ARGB. -LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_a, int dst_stride_a, - int width, int height); - -// Copy Y channel to Alpha of ARGB. -LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); - -// Get function to Alpha Blend ARGB pixels and store to destination. -LIBYUV_API -ARGBBlendRow GetARGBBlend(); - -// Alpha Blend ARGB images and store to destination. -// Source is pre-multiplied by alpha using ARGBAttenuate. -// Alpha of destination is set to 255. -LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Alpha Blend plane and store to destination. -// Source is not pre-multiplied by alpha. -LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height); - -// Alpha Blend YUV images and store to destination. -// Source is not pre-multiplied by alpha. -// Alpha is full width x height and subsampled to half size to apply to UV. -LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. -LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Add ARGB image with ARGB image. Saturates to 255. -LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. -LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Convert I422 to YUY2. 
-LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -// Convert I422 to UYVY. -LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -// Convert unattentuated ARGB to preattenuated ARGB. -LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Convert preattentuated ARGB to unattenuated ARGB. -LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Internal function - do not call directly. -// Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. -LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height); - -// Blur ARGB image. -// dst_cumsum table of width * (height + 1) * 16 bytes aligned to -// 16 byte boundary. -// dst_stride32_cumsum is number of ints in a row (width * 4). -// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. -// Blur is optimized for radius of 5 (11x11) or less. -LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius); - -// Multiply ARGB image by ARGB value. -LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value); - -// Interpolate between two images using specified amount of interpolation -// (0 to 255) and store to destination. -// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 -// and 255 means 1% src0 and 99% src1. -LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation); - -// Interpolate between two ARGB images using specified amount of interpolation -// Internally calls InterpolatePlane with width * 4 (bpp). -LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation); - -// Interpolate between two YUV images using specified amount of interpolation -// Internally calls InterpolatePlane on each plane where the U and V planes -// are half width and half height. 
-LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation); - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +// TODO(fbarchard): Move cpu macros to row.h +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 @@ -479,43 +40,808 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y, #define HAS_ARGBAFFINEROW_SSE2 #endif +// Copy a plane of data. +LIBYUV_API +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height); + +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 1024 for 10 bits + int width, + int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value); + +// Split interleaved UV plane into separate U and V planes. +LIBYUV_API +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Merge separate U and V planes into one interleaved UV plane. +LIBYUV_API +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Split interleaved RGB plane into separate R, G and B planes. +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +// Merge separate R, G and B planes into one interleaved RGB plane. +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +#define J400ToJ400 I400ToI400 + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I444 to I444. 
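+// Usage sketch (editorial addition, not part of upstream libyuv): the scale
+// hints above assume the row math dst = (src * scale) >> 16 for
+// Convert16To8Plane, so scale = 16384 is a plain shift right by 2 that maps
+// the 10 bit range 0..1023 onto the 8 bit range 0..255:
+//
+//   Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y,
+//                     16384 /* 65536 >> 2 */, width, height);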
+#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alias +#define J420ToJ400 I420ToI400 +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I422 to RGBA. 
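+// Usage sketch (editorial addition, not part of upstream libyuv): the mirror
+// functions compose with the negative height convention noted above, so one
+// call both mirrors horizontally and flips vertically, i.e. a 180 degree
+// rotation of a single plane:
+//
+//   I400Mirror(src_y, src_stride_y, dst_y, dst_stride_y, width, -height);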
+LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Alias +#define RGB24ToRAW RAWToRGB24 + +LIBYUV_API +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a color table to each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a color table to each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a luma/color table to each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix.
The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4th row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be derived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const float* poly, + int width, + int height); + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height); + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Copy Alpha channel of ARGB to alpha of ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Extract the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); + +// Copy Y channel to Alpha of ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Source is pre-multiplied by alpha using ARGBAttenuate. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Alpha Blend plane and store to destination. +// Source is not pre-multiplied by alpha. +LIBYUV_API +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alpha Blend YUV images and store to destination. +// Source is not pre-multiplied by alpha. +// Alpha is full width x height and subsampled to half size to apply to UV.
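+// Usage sketch (editorial addition, not part of upstream libyuv): a 50/50
+// cross-fade of two w x h I420 frames, assuming alpha weights src0 so that
+// 255 keeps src0 and 0 keeps src1; alpha_plane is a hypothetical
+// caller-owned w x h buffer:
+//
+//   memset(alpha_plane, 128, w * h);
+//   I420Blend(y0, w, u0, w / 2, v0, w / 2, y1, w, u1, w / 2, v1, w / 2,
+//             alpha_plane, w, dst_y, w, dst_u, w / 2, dst_v, w / 2, w, h);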
+LIBYUV_API +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert I422 to UYVY. +LIBYUV_API +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); + +// Convert unattenuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert preattenuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value); + +// Interpolate between two images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 +// and 255 means 1% src0 and 99% src1.
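+// For example (editorial addition, not part of upstream libyuv),
+// interpolation = 128 gives an even blend, assuming the row math is roughly
+// dst = (src0 * (256 - f) + src1 * f) >> 8 with f = interpolation:
+//
+//   InterpolatePlane(src0, src_stride0, src1, src_stride1, dst, dst_stride,
+//                    width, height, 128);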
+LIBYUV_API +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation); + +// Interpolate between two ARGB images using specified amount of interpolation +// Internally calls InterpolatePlane with width * 4 (bpp). +LIBYUV_API +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation); + +// Interpolate between two YUV images using specified amount of interpolation +// Internally calls InterpolatePlane on each plane where the U and V planes +// are half width and half height. +LIBYUV_API +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation); + // Row function for copying pixels from a source with a slope to a row // of destination. Useful for scaling, rotation, mirror, texture mapping. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); +// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. // shuffler is 16 bytes and must be aligned. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height); +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height); // Sobel ARGB effect with planar output. LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. 
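// Usage sketch for ARGBShuffle above (editorial addition, not part of
// upstream libyuv): each shuffler byte is the index of the source byte to
// copy, repeated per 4 byte pixel, so swapping the R and B channels of each
// pixel would be:
//
//   static const uint8_t kShuffleSwapRB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
//                                              10, 9, 8, 11, 14, 13, 12, 15};
//   ARGBShuffle(src_argb, src_stride, dst_argb, dst_stride, kShuffleSwapRB,
//               width, height);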
LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h index 8af60b8955..76b692be8b 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_ #include "libyuv/basic_types.h" @@ -20,8 +20,8 @@ extern "C" { // Supported rotation. typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. kRotate180 = 180, // Rotate 180 degrees. kRotate270 = 270, // Rotate 270 degrees clockwise. @@ -33,85 +33,132 @@ typedef enum RotationMode { // Rotate I420 frame. LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); // Rotate NV12 input and store in I420. LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int src_width, int src_height, enum RotationMode mode); +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); // Rotate planes by 90, 180, 270. Deprecated. 
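// Note on the rotate API above (editorial addition, not part of upstream
// libyuv): width and height describe the source, and for kRotate90 or
// kRotate270 the destination is transposed, so its planes and strides are
// sized height x width:
//
//   I420Rotate(src_y, width, src_u, width / 2, src_v, width / 2,
//              dst_y, height, dst_u, height / 2, dst_v, height / 2,
//              width, height, kRotate90);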
LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); // Rotations for when U and V are interleaved. // These functions take one input pointer and // split the data into two buffers while // rotating them. Deprecated. LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); // The 90 and 270 functions are based on transposes. // Doing a transpose with reversing the read/write // order will result in a rotation by +- 90 degrees. // Deprecated. LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h index 660ff5573e..20432949ab 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ #define INCLUDE_LIBYUV_ROTATE_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,13 +21,17 @@ extern "C" { // Rotate ARGB frame LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, enum RotationMode mode); +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + enum RotationMode mode); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h index ebc487f9ab..5edc0fcf13 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ #define INCLUDE_LIBYUV_ROTATE_ROW_H_ #include "libyuv/basic_types.h" @@ -18,10 +18,14 @@ namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) @@ -29,93 +33,162 @@ extern "C" { #endif #endif // The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +// The following are available for GCC 32 or 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 #endif -// The following are available for 64 bit GCC but not NaCL: -#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ - defined(__x86_64__) +// The following are available for 64 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) #define HAS_TRANSPOSEWX8_FAST_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ +#if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON #endif -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_TRANSPOSEWX8_DSPR2 -#define HAS_TRANSPOSEUVWX8_DSPR2 -#endif // defined(__mips__) +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_TRANSPOSEWX16_MSA +#define HAS_TRANSPOSEUVWX16_MSA +#endif -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int 
height); +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); -void TransposeWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int 
dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); -void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_Any_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/row.h b/libs/libvpx/third_party/libyuv/include/libyuv/row.h index 013a7e53e3..65ef448b8c 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/row.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ #include // For malloc. @@ -20,41 +20,20 @@ namespace libyuv { extern "C" { #endif -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) - -#ifdef __cplusplus -#define align_buffer_64(var, size) \ - uint8* var##_mem = reinterpret_cast(malloc((size) + 63)); \ - uint8* var = reinterpret_cast \ - ((reinterpret_cast(var##_mem) + 63) & ~63) -#else -#define align_buffer_64(var, size) \ - uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ - uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ -#endif - -#define free_aligned_buffer_64(var) \ - free(var##_mem); \ - var = 0 - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif -// True if compiling for SSSE3 as a requirement. -#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) -#define LIBYUV_SSSE3_ONLY -#endif - -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif // clang >= 3.5.0 required for Arm64. 
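// Editorial note (not part of upstream libyuv): these guards strip the HAS_*
// fast-path declarations at compile time, and the #if blocks below all test
// !defined(LIBYUV_DISABLE_X86), so a build can force the portable C rows by
// predefining the macro, e.g.:
//
//   cc -DLIBYUV_DISABLE_X86 -c file.c   // hypothetical invocation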
#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) @@ -76,9 +55,19 @@ extern "C" { #endif // clang >= 3.4 #endif // __clang__ +// clang >= 6.0.0 required for AVX512. +// TODO(fbarchard): fix xcode 9 ios b/789. +#if 0 // Build fails in libvpx on Mac +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ >= 7) && !defined(__APPLE_EMBEDDED_SIMULATOR__) +#define CLANG_HAS_AVX512 1 +#endif // clang >= 7 +#endif // __clang__ +#endif // 0 + // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -90,8 +79,8 @@ extern "C" { #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 @@ -104,12 +93,12 @@ extern "C" { #define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 +#define HAS_HALFFLOATROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -126,8 +115,10 @@ extern "C" { #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB24ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 @@ -180,11 +171,8 @@ extern "C" { // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. -#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \ - !defined(__i386__) || defined(_MSC_VER) -// TODO(fbarchard): fix build error on x86 debug -// https://code.google.com/p/libyuv/issues/detail?id=524 -#define HAS_I411TOARGBROW_SSSE3 +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_SSSE3 @@ -193,11 +181,12 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. 
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBEXTRACTALPHAROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 @@ -208,13 +197,9 @@ extern "C" { #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 +#define HAS_HALFFLOATROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I400TOARGBROW_AVX2 -#if !(defined(_DEBUG) && defined(__i386__)) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#endif -#define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -227,8 +212,10 @@ extern "C" { #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB24ROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB24ROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -246,11 +233,18 @@ extern "C" { #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_AVX2 +#endif #endif // The following are available for AVX2 Visual C and clangcl 32 bit: // TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 @@ -268,6 +262,51 @@ extern "C" { #define HAS_I422TOARGBROW_SSSE3 #endif +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_CONVERT16TO8ROW_SSSE3 +#define HAS_CONVERT8TO16ROW_SSE2 +// I210 is for H010. 2 = 422. I for 601 vs H for 709. 
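+// (Editorial gloss, not part of upstream libyuv.) Reading that scheme: the
+// leading letter picks the color matrix (I = BT.601, H = BT.709), so an
+// H010 conversion presumably reuses the I210 kernels below with BT.709
+// YuvConstants rather than carrying its own rows.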
+#define HAS_I210TOAR30ROW_SSSE3 +#define HAS_I210TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 +#define HAS_MERGERGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 +#endif + +// The following are available for AVX2 gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ABGRTOAR30ROW_AVX2 +#define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTORAWROW_AVX2 +#define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_CONVERT16TO8ROW_AVX2 +#define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_I210TOAR30ROW_AVX2 +#define HAS_I210TOARGBROW_AVX2 +#define HAS_I422TOAR30ROW_AVX2 +#define HAS_I422TOUYVYROW_AVX2 +#define HAS_I422TOYUY2ROW_AVX2 +#define HAS_MERGEUVROW_16_AVX2 +#define HAS_MULTIPLYROW_16_AVX2 +#endif + +// The following are available for AVX512 clang x86 platforms: +// TODO(fbarchard): Port to GCC and Visual C +// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX512)) +#define HAS_ARGBTORGB24ROW_AVX512VBMI +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -279,6 +318,7 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON @@ -286,18 +326,17 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON -#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON +#define HAS_BYTETOFLOATROW_NEON #define HAS_COPYROW_NEON +#define HAS_HALFFLOATROW_NEON #define HAS_I400TOARGBROW_NEON -#define HAS_I411TOARGBROW_NEON #define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON @@ -313,8 +352,10 @@ extern "C" { #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON #define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB24ROW_NEON #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB24ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTORGB24ROW_NEON #define HAS_RAWTOUVROW_NEON @@ -328,6 +369,7 @@ extern "C" { #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON +#define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON @@ -359,17 +401,87 @@ extern "C" { #define HAS_SOBELYROW_NEON #endif -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) -#define HAS_COPYROW_MIPS -#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_I422TOARGBROW_DSPR2 -#define HAS_INTERPOLATEROW_DSPR2 -#define HAS_MIRRORROW_DSPR2 -#define HAS_MIRRORUVROW_DSPR2 -#define HAS_SPLITUVROW_DSPR2 +// The following are available on AArch64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SCALESUMSAMPLES_NEON 
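+// Editorial sketch (not part of upstream libyuv): HAS_* macros only make a
+// kernel available at compile time; selection happens at run time through
+// the cpu_id.h API, roughly:
+//
+//   #include "libyuv/cpu_id.h"
+//   if (TestCpuFlag(kCpuHasNEON)) {
+//     /* call the *_NEON row */
+//   } else {
+//     /* call the portable *_C row */
+//   }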
#endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVROW_MSA +#define HAS_ABGRTOYROW_MSA +#define HAS_ARGB1555TOARGBROW_MSA +#define HAS_ARGB1555TOUVROW_MSA +#define HAS_ARGB1555TOYROW_MSA +#define HAS_ARGB4444TOARGBROW_MSA +#define HAS_ARGBADDROW_MSA +#define HAS_ARGBATTENUATEROW_MSA +#define HAS_ARGBBLENDROW_MSA +#define HAS_ARGBCOLORMATRIXROW_MSA +#define HAS_ARGBEXTRACTALPHAROW_MSA +#define HAS_ARGBGRAYROW_MSA +#define HAS_ARGBMIRRORROW_MSA +#define HAS_ARGBMULTIPLYROW_MSA +#define HAS_ARGBQUANTIZEROW_MSA +#define HAS_ARGBSEPIAROW_MSA +#define HAS_ARGBSETROW_MSA +#define HAS_ARGBSHADEROW_MSA +#define HAS_ARGBSHUFFLEROW_MSA +#define HAS_ARGBSUBTRACTROW_MSA +#define HAS_ARGBTOARGB1555ROW_MSA +#define HAS_ARGBTOARGB4444ROW_MSA +#define HAS_ARGBTORAWROW_MSA +#define HAS_ARGBTORGB24ROW_MSA +#define HAS_ARGBTORGB565DITHERROW_MSA +#define HAS_ARGBTORGB565ROW_MSA +#define HAS_ARGBTOUV444ROW_MSA +#define HAS_ARGBTOUVJROW_MSA +#define HAS_ARGBTOUVROW_MSA +#define HAS_ARGBTOYJROW_MSA +#define HAS_ARGBTOYROW_MSA +#define HAS_BGRATOUVROW_MSA +#define HAS_BGRATOYROW_MSA +#define HAS_HALFFLOATROW_MSA +#define HAS_I400TOARGBROW_MSA +#define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGBROW_MSA +#define HAS_I422TORGB24ROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA +#define HAS_I444TOARGBROW_MSA +#define HAS_INTERPOLATEROW_MSA +#define HAS_J400TOARGBROW_MSA +#define HAS_MERGEUVROW_MSA +#define HAS_MIRRORROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA +#define HAS_RAWTOARGBROW_MSA +#define HAS_RAWTORGB24ROW_MSA +#define HAS_RAWTOUVROW_MSA +#define HAS_RAWTOYROW_MSA +#define HAS_RGB24TOARGBROW_MSA +#define HAS_RGB24TOUVROW_MSA +#define HAS_RGB24TOYROW_MSA +#define HAS_RGB565TOARGBROW_MSA +#define HAS_RGB565TOUVROW_MSA +#define HAS_RGB565TOYROW_MSA +#define HAS_RGBATOUVROW_MSA +#define HAS_RGBATOYROW_MSA +#define HAS_SETROW_MSA +#define HAS_SOBELROW_MSA +#define HAS_SOBELTOPLANEROW_MSA +#define HAS_SOBELXROW_MSA +#define HAS_SOBELXYROW_MSA +#define HAS_SOBELYROW_MSA +#define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA +#define HAS_UYVYTOUVROW_MSA +#define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOYROW_MSA #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -378,18 +490,18 @@ extern "C" { #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif -typedef __declspec(align(16)) int16 vec16[8]; -typedef __declspec(align(16)) int32 vec32[4]; -typedef __declspec(align(16)) int8 vec8[16]; -typedef __declspec(align(16)) uint16 uvec16[8]; -typedef __declspec(align(16)) uint32 uvec32[4]; -typedef __declspec(align(16)) uint8 uvec8[16]; -typedef __declspec(align(32)) int16 lvec16[16]; -typedef __declspec(align(32)) int32 lvec32[8]; -typedef __declspec(align(32)) int8 lvec8[32]; -typedef __declspec(align(32)) uint16 ulvec16[16]; -typedef __declspec(align(32)) uint32 ulvec32[8]; -typedef __declspec(align(32)) uint8 ulvec8[32]; +typedef __declspec(align(16)) int16_t vec16[8]; +typedef __declspec(align(16)) int32_t vec32[4]; +typedef __declspec(align(16)) int8_t vec8[16]; +typedef __declspec(align(16)) uint16_t uvec16[8]; +typedef __declspec(align(16)) uint32_t uvec32[4]; +typedef __declspec(align(16)) uint8_t uvec8[16]; +typedef __declspec(align(32)) int16_t lvec16[16]; +typedef __declspec(align(32)) int32_t lvec32[8]; 
+typedef __declspec(align(32)) int8_t lvec8[32]; +typedef __declspec(align(32)) uint16_t ulvec16[16]; +typedef __declspec(align(32)) uint32_t ulvec32[8]; +typedef __declspec(align(32)) uint8_t ulvec8[32]; #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. #if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) @@ -397,32 +509,32 @@ typedef __declspec(align(32)) uint8 ulvec8[32]; #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif -typedef int16 __attribute__((vector_size(16))) vec16; -typedef int32 __attribute__((vector_size(16))) vec32; -typedef int8 __attribute__((vector_size(16))) vec8; -typedef uint16 __attribute__((vector_size(16))) uvec16; -typedef uint32 __attribute__((vector_size(16))) uvec32; -typedef uint8 __attribute__((vector_size(16))) uvec8; -typedef int16 __attribute__((vector_size(32))) lvec16; -typedef int32 __attribute__((vector_size(32))) lvec32; -typedef int8 __attribute__((vector_size(32))) lvec8; -typedef uint16 __attribute__((vector_size(32))) ulvec16; -typedef uint32 __attribute__((vector_size(32))) ulvec32; -typedef uint8 __attribute__((vector_size(32))) ulvec8; +typedef int16_t __attribute__((vector_size(16))) vec16; +typedef int32_t __attribute__((vector_size(16))) vec32; +typedef int8_t __attribute__((vector_size(16))) vec8; +typedef uint16_t __attribute__((vector_size(16))) uvec16; +typedef uint32_t __attribute__((vector_size(16))) uvec32; +typedef uint8_t __attribute__((vector_size(16))) uvec8; +typedef int16_t __attribute__((vector_size(32))) lvec16; +typedef int32_t __attribute__((vector_size(32))) lvec32; +typedef int8_t __attribute__((vector_size(32))) lvec8; +typedef uint16_t __attribute__((vector_size(32))) ulvec16; +typedef uint32_t __attribute__((vector_size(32))) ulvec32; +typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var -typedef int16 vec16[8]; -typedef int32 vec32[4]; -typedef int8 vec8[16]; -typedef uint16 uvec16[8]; -typedef uint32 uvec32[4]; -typedef uint8 uvec8[16]; -typedef int16 lvec16[16]; -typedef int32 lvec32[8]; -typedef int8 lvec8[32]; -typedef uint16 ulvec16[16]; -typedef uint32 ulvec32[8]; -typedef uint8 ulvec8[32]; +typedef int16_t vec16[8]; +typedef int32_t vec32[4]; +typedef int8_t vec8[16]; +typedef uint16_t uvec16[8]; +typedef uint32_t uvec32[4]; +typedef uint8_t uvec8[16]; +typedef int16_t lvec16[16]; +typedef int32_t lvec32[8]; +typedef int8_t lvec8[32]; +typedef uint16_t ulvec16[16]; +typedef uint32_t ulvec32[8]; +typedef uint8_t ulvec8[32]; #endif #if defined(__aarch64__) @@ -446,23 +558,23 @@ struct YuvConstants { #else // This struct is for Intel color conversion. 
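// Layout note (editorial addition, not part of upstream libyuv): the
// KUVTOB..KYTORGB constants defined after this struct are the byte offsets
// of its members, letting assembly kernels address the table as base +
// offset rather than through the C struct:
//
//   kUVToB, kUVToG, kUVToR: 32 bytes each at offsets 0, 32, 64;
//   kUVBiasB at 96, kUVBiasG at 128, kUVBiasR at 160, kYToRgb at 192.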
struct YuvConstants { - int8 kUVToB[32]; - int8 kUVToG[32]; - int8 kUVToR[32]; - int16 kUVBiasB[16]; - int16 kUVBiasG[16]; - int16 kUVBiasR[16]; - int16 kYToRgb[16]; + int8_t kUVToB[32]; + int8_t kUVToG[32]; + int8_t kUVToR[32]; + int16_t kUVBiasB[16]; + int16_t kUVBiasG[16]; + int16_t kUVBiasR[16]; + int16_t kYToRgb[16]; }; // Offsets into YuvConstants structure -#define KUVTOB 0 -#define KUVTOG 32 -#define KUVTOR 64 +#define KUVTOB 0 +#define KUVTOG 32 +#define KUVTOR 64 #define KUVBIASB 96 #define KUVBIASG 128 #define KUVBIASR 160 -#define KYTORGB 192 +#define KYTORGB 192 #endif // Conversion matrix for YUV to RGB @@ -475,6 +587,16 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601 extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) + +#define align_buffer_64(var, size) \ + uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP #else @@ -487,1458 +609,2863 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 #else #define LABELALIGN #endif -#if defined(__native_client__) && defined(__x86_64__) -// r14 is used for MEMOP macros. -#define NACL_R14 "r14", -#define BUNDLELOCK ".bundle_lock\n" -#define BUNDLEUNLOCK ".bundle_unlock\n" -#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" -#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" -#define MEMLEA(offset, base) #offset "(%q" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%q" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%q" #base ",%q" #index "," #scale ")" -#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" -#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg "\n" \ - BUNDLEUNLOCK -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " %%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%" #arg "\n" \ - BUNDLEUNLOCK -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ - BUNDLEUNLOCK -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#else // defined(__native_client__) && defined(__x86_64__) -#define NACL_R14 -#define BUNDLEALIGN -#define MEMACCESS(base) "(%" #base ")" -#define MEMACCESS2(offset, base) #offset "(%" #base ")" -#define MEMLEA(offset, base) #offset "(%" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%" #base ",%" #index "," 
#scale ")" -#define MEMMOVESTRING(s, d) -#define MEMSTORESTRING(reg, d) -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ - #reg2 "\n" -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#endif // defined(__native_client__) && defined(__x86_64__) -#if defined(__arm__) || defined(__aarch64__) -#undef MEMACCESS -#if defined(__native_client__) -#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" -#else -#define MEMACCESS(base) -#endif +// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be +// measured and then run with iaca -64 libyuv_unittest. +// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within +// inline assembly blocks. +// example of iaca: +// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest + +#if defined(__x86_64__) || defined(__i386__) + +#define IACA_ASM_START \ + ".byte 0x0F, 0x0B\n" \ + " movl $111, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" + +#define IACA_ASM_END \ + " movl $222, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" \ + ".byte 0x0F, 0x0B\n" + +#define IACA_SSC_MARK(MARK_ID) \ + __asm__ __volatile__("\n\t movl $" #MARK_ID \ + ", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : \ + : \ + : "memory"); + +#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); + +#else /* Visual C */ +#define IACA_UD_BYTES \ + { __asm _emit 0x0F __asm _emit 0x0B } + +#define IACA_SSC_MARK(x) \ + { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); #endif -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +#define IACA_START \ + { \ + IACA_UD_BYTES \ + IACA_SSC_MARK(111) \ + } +#define IACA_END \ + { \ + IACA_SSC_MARK(222) \ + IACA_UD_BYTES \ + } + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const 
uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width); -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const 
uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width); -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width); -void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width); -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width); -void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width); -void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, 
int width); -void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, - int width); -void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, - int width); - -void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, - int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, - int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_C(const uint8* 
src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); - -void ARGBToUV444Row_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width); -void MirrorRow_C(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); - -void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* 
src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); - -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* 
dst_v, - int width); -void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, int width); +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width); +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* 
dst_y, int width); +void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int 
width); +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void 
ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void 
ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void SplitUVRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); +void SplitUVRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); -void CopyRow_SSE2(const uint8* src, uint8* dst, int count); -void CopyRow_AVX(const uint8* src, uint8* dst, int count); -void CopyRow_ERMS(const uint8* src, uint8* dst, int count); -void CopyRow_NEON(const uint8* src, uint8* dst, int count); -void CopyRow_MIPS(const uint8* src, uint8* dst, int count); -void CopyRow_C(const uint8* src, uint8* dst, int count); -void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count); -void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count); -void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count); +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); -void CopyRow_16_C(const uint16* src, uint16* dst, int count); +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* 
dst_b, + int width); -void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void MergeRGBRow_Any_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); + +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, /* 64 for 10 bit */ + int width); +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width); + +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); + +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); +void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); + +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, int width); -void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int width); + +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int 
width); -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a, +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a, +void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, - int width); -void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, - int width); +void SetRow_C(uint8_t* dst, uint8_t v8, int width); +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); +void SetRow_X86(uint8_t* dst, uint8_t v8, int width); +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width); +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width); +void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width); +void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width); -void SetRow_C(uint8* dst, uint8 v8, int count); -void SetRow_X86(uint8* dst, uint8 v8, int count); -void SetRow_ERMS(uint8* dst, uint8 v8, int count); -void SetRow_NEON(uint8* dst, uint8 v8, int count); -void SetRow_Any_X86(uint8* dst, uint8 v8, int count); -void SetRow_Any_NEON(uint8* dst, uint8 v8, int count); - -void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); -void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); -void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width); +void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width); +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width); // ARGBShufflers for BGRAToARGB etc. 
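// Editorial sketch, not part of the libyuv patch: the 'shuffler' argument to
// the ARGBShuffleRow family below is a byte-index table; each output byte of
// a 4-byte ARGB pixel is copied from the source byte the table selects, so a
// {3, 2, 1, 0, ...} mask reverses byte order (e.g. BGRA -> ARGB). The names
// kExampleShuffleBGRAToARGB and ExampleShuffleRow are hypothetical.
static const uint8_t kExampleShuffleBGRAToARGB[16] = {
    3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
static void ExampleShuffleRow(const uint8_t* src,
                              uint8_t* dst,
                              const uint8_t* shuffler,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[0] = src[shuffler[0] & 3];  // Low 2 bits select within one pixel;
    dst[1] = src[shuffler[1] & 3];  // SIMD versions consume all 16 entries.
    dst[2] = src[shuffler[2] & 3];
    dst[3] = src[shuffler[3] & 3];
    src += 4;
    dst += 4;
  }
}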
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); - -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width); - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, int width); -void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); - -void 
RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int width); -void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width); - -void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, +void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, int width); -void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, +void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); + +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); +void 
AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); + +void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, +void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); 
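// Editorial sketch, not part of the libyuv patch: the per-pixel packing the
// ARGBToRGB565Row family performs. Little-endian ARGB memory order is
// B, G, R, A; RGB565 keeps the top 5/6/5 bits of B/G/R and drops alpha.
// ExamplePackRGB565 is a hypothetical helper name for illustration only.
static inline uint16_t ExamplePackRGB565(const uint8_t* argb) {
  uint16_t b = argb[0] >> 3;                   /* 5 bits of blue  */
  uint16_t g = argb[1] >> 2;                   /* 6 bits of green */
  uint16_t r = argb[2] >> 3;                   /* 5 bits of red   */
  return (uint16_t)(b | (g << 5) | (r << 11)); /* argb[3] (alpha) dropped */
}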
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width); +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width); +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width); +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int 
width); +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); + +void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); + +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* 
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void I444ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToAR30Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422AlphaToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void I411ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToARGBRow_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToRGB565Row_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV21ToARGBRow_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void UYVYToARGBRow_C(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGBARow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGB24Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB4444Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB1555Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB565Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width);
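All of these converters share the same per-pixel core; only the U/V sampling differs (4:4:4, 4:2:2, interleaved NV12/NV21, or packed YUY2/UYVY). A minimal C sketch of the fixed-point math, assuming BT.601 limited-range coefficients (illustrative only; the real kernels read their coefficients from YuvConstants):

    #include <stdint.h>

    static uint8_t Clamp(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

    /* One pixel of YUV -> RGB, BT.601 limited range, 6-bit fixed point. */
    static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                         uint8_t* b, uint8_t* g, uint8_t* r) {
      int y1 = (y - 16) * 75;                          /* 1.164 * 64 ~= 75  */
      *b = Clamp((y1 + (u - 128) * 129 + 32) >> 6);    /* 2.018 * 64 ~= 129 */
      *g = Clamp((y1 - (u - 128) * 25 - (v - 128) * 52 + 32) >> 6);
      *r = Clamp((y1 + (v - 128) * 102 + 32) >> 6);    /* 1.596 * 64 ~= 102 */
    }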
-void I422ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGBARow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+
+void I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I411ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I411ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGBARow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB24Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB24Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGBARow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I444ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I411ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);

 // Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
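"Preattenuated" here means the foreground's RGB has already been multiplied by its alpha, so compositing needs no scaling of the foreground color. A minimal C sketch of the per-channel math (illustrative; not the patch's kernel, which uses a fixed-point shift rather than an exact divide):

    #include <stdint.h>

    /* Premultiplied "over": only the background is weighted by (255 - a). */
    static inline uint8_t BlendChannel(uint8_t fg, uint8_t bg, uint8_t a) {
      uint32_t v = fg + ((uint32_t)bg * (255 - a) + 127) / 255;
      return (uint8_t)(v > 255 ? 255 : v);
    }

BlendPlaneRow is the unattenuated single-plane counterpart: per byte, dst = (src0 * alpha + src1 * (255 - alpha) + 127) / 255, with the alpha read from its own plane.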
 // ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);

 // ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBAddRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);

 // ARGB subtract images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
+void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
- int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
- int width);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
-
-void I444ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
+void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I411ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGBARow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 const struct YuvConstants* yuvconstants,
 int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
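The _Any_ variants throughout this header wrap fixed-width SIMD kernels so callers can pass arbitrary widths. A simplified sketch of the dispatch idea, assuming a kernel that handles multiples of 8 pixels (libyuv's actual ANY macros route the ragged tail through a small temporary buffer; the names below are ours, purely illustrative):

    #include <stdint.h>

    /* Hypothetical example: run the SIMD kernel on the aligned span,
     * then let a C kernel finish the remainder. */
    static void GrayRow_Any_Example(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                    int width,
                                    void (*simd)(const uint8_t*, uint8_t*, int),
                                    void (*c)(const uint8_t*, uint8_t*, int)) {
      int n = width & ~7;                       /* largest multiple of 8 */
      if (n > 0) {
        simd(src_ptr, dst_ptr, n);
      }
      c(src_ptr + n * 4, dst_ptr + n * 4, width - n);  /* 4 bytes per ARGB pixel */
    }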
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
 int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
 int width);
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-
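Both packed 4:2:2 layouts store two pixels in four bytes and differ only in byte order: YUY2 is Y0 U Y1 V, UYVY is U Y0 V Y1. A reference C sketch of the extraction these rows vectorize (illustrative; equivalent in spirit to libyuv's C versions):

    #include <stdint.h>

    /* Copy luma from YUY2: Y sits at even byte offsets. */
    static void YUY2ToYRow_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                  int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_y[x] = src_yuy2[x * 2];
      }
    }

    /* One U/V pair per two pixels (4:2:2 output, no vertical averaging). */
    static void YUY2ToUV422Row_Sketch(const uint8_t* src_yuy2,
                                      uint8_t* dst_u, uint8_t* dst_v,
                                      int width) {
      int x;
      for (x = 0; x < width; x += 2) {
        *dst_u++ = src_yuy2[1];
        *dst_v++ = src_yuy2[3];
        src_yuy2 += 4;
      }
    }

The ToUVRow variants additionally average chroma with the following row (hence the stride parameter) to produce 4:2:0.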
-void I422ToYUY2Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width);
-void I422ToUYVYRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width);
-void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width);
-void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width);
-void I422ToYUY2Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width);
-void I422ToUYVYRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width);
+void I422ToYUY2Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width);
+void I422ToUYVYRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width);
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);

 // Effects related row functions.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
+void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
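Attenuate premultiplies each color channel by the pixel's alpha. A one-function reference of the idea (illustrative; the real rows use a fixed-point approximation rather than an exact divide):

    #include <stdint.h>

    /* Premultiply one channel by alpha. Sketch only. */
    static inline uint8_t Attenuate(uint8_t v, uint8_t a) {
      return (uint8_t)(((uint32_t)v * a + 127) / 255);
    }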
 // Inverse table for unattenuate, shared by C and SSE2.
-extern const uint32 fixed_invtbl8[256];
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+extern const uint32_t fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
 int width);
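Unattenuate reverses the premultiply; a per-pixel divide by alpha would be expensive, which is why a 256-entry reciprocal table is shared by the C and SSE2 paths. A sketch of the idea, assuming an 8.8 fixed-point table where inv[a] is roughly 65536 / a (the exact scaling and rounding of the real fixed_invtbl8 may differ):

    #include <stdint.h>

    /* Undo premultiplication via a reciprocal lookup: v * (256 / a). */
    static inline uint8_t Unattenuate(uint8_t v, uint8_t a, const uint32_t* inv) {
      uint32_t r = ((uint32_t)v * (inv[a] & 0xffff)) >> 8;
      return (uint8_t)(r > 255 ? 255 : r);
    }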
dst_argb, int scale, int interval_size, - int interval_offset, int width); +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); // Used for blur. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); -void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. 
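The InterpolateRow entry points declared below blend two adjacent source rows with an 8-bit fraction; that one kernel gives the scalers and ARGBInterpolate their vertical filtering. A minimal sketch of the C reference behavior, assuming the usual (256 - f) / f weighting with rounding (the real row also special-cases f == 0 as a plain copy and f == 128 as an average); the function name here is illustrative, not a libyuv entry point:

    void InterpolateRowSketch(uint8_t* dst, const uint8_t* src,
                              ptrdiff_t src_stride, int width,
                              int source_y_fraction) {
      const uint8_t* src1 = src + src_stride;  // next row down
      int f1 = source_y_fraction;
      int f0 = 256 - f1;
      for (int x = 0; x < width; ++x) {
        // weighted average of the two rows, rounded
        dst[x] = (uint8_t)((src[x] * f0 + src1[x] * f1 + 128) >> 8);
      }
    }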
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); -void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); +void InterpolateRow_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); // Sobel images. 
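The Sobel rows declared below split the operator into passes: SobelXRow and SobelYRow produce absolute per-pixel gradients from neighboring source rows, and SobelRow packs them into grey ARGB. A rough sketch of the combine step, assuming each channel is the saturated sum of the two gradients and alpha is forced opaque (SobelToPlaneRow would keep just the single grey byte):

    // Sketch of the combine pass; the name is illustrative.
    void SobelCombineSketch(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                            uint8_t* dst_argb, int width) {
      for (int i = 0; i < width; ++i) {
        int s = src_sobelx[i] + src_sobely[i];
        uint8_t g = (uint8_t)(s > 255 ? 255 : s);  // saturate to 8 bits
        dst_argb[4 * i + 0] = g;    // B
        dst_argb[4 * i + 1] = g;    // G
        dst_argb[4 * i + 2] = g;    // R
        dst_argb[4 * i + 3] = 255;  // A, assumed opaque
      }
    }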
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width); -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); 
+void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width); -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width); -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width); -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +// Scale and convert to half float. 
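For the HalfFloatRow entry points declared below, each 16-bit sample is multiplied by scale and stored as an IEEE half (binary16) in a uint16_t. One way to do that without hardware F16C support is the exponent-rebias trick sketched here; it assumes non-negative inputs that land in half range, and it truncates rather than rounds:

    // After multiplying by 2**-112, the top bits of the float
    // encoding line up with the half-float encoding.
    static uint16_t FloatToHalfSketch(float v) {
      union { float f; uint32_t u; } bits;
      bits.f = v * 1.9259299444e-34f;   // 2**-112
      return (uint16_t)(bits.u >> 13);  // drop 13 low mantissa bits
    }

    // Sketch of the row loop; names are illustrative.
    void HalfFloatRowSketch(const uint16_t* src, uint16_t* dst,
                            float scale, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = FloatToHalfSketch(src[i] * scale);
      }
    }

A scale of 1.0f / 65535.0f, for example, maps full-range 16-bit input onto [0.0, 1.0].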
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width); +void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, + float* dst_ptr, + float param, + int width); + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); + const uint8_t* luma, + uint32_t lumacoeff); + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width); +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width); +void ScaleSamples_C(const float* src, float* dst, float scale, int width); +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROW_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale.h index 102158d1ab..b937d348ca 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/scale.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_H_ #define INCLUDE_LIBYUV_SCALE_H_ #include "libyuv/basic_types.h" @@ -20,25 +20,33 @@ extern "C" { // Supported filtering. typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. 
+ kFilterBox = 3 // Highest quality. } FilterModeEnum; // Scale a YUV plane. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering); LIBYUV_API -void ScalePlane_16(const uint16* src, int src_stride, - int src_width, int src_height, - uint16* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering); // Scales a YUV 4:2:0 image from the src width and height to the @@ -52,44 +60,64 @@ void ScalePlane_16(const uint16* src, int src_stride, // Returns 0 if successful. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering); LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int src_height, - uint16* dst_y, int dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering); #ifdef __cplusplus // Legacy API. Deprecated. LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate); -// Legacy API. Deprecated. -LIBYUV_API -int ScaleOffset(const uint8* src_i420, int src_width, int src_height, - uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset, - LIBYUV_BOOL interpolate); - // For testing, allow disabling of specialized scalers. 
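Putting the API above together: a typical downscale of one I420 frame. Every buffer name and dimension here is a placeholder for illustration, not something taken from this header; chroma planes are half-size in I420, hence the (w + 1) / 2 strides.

    // Halve an I420 frame; assumes tightly packed planes (stride == width).
    int HalveI420(const uint8_t* sy, const uint8_t* su, const uint8_t* sv,
                  int sw, int sh,
                  uint8_t* dy, uint8_t* du, uint8_t* dv) {
      int dw = sw / 2, dh = sh / 2;
      return I420Scale(sy, sw, su, (sw + 1) / 2, sv, (sw + 1) / 2, sw, sh,
                       dy, dw, du, (dw + 1) / 2, dv, (dw + 1) / 2, dw, dh,
                       kFilterBox);  // highest quality per the enum above
    }

The return value follows the convention documented above: 0 if successful.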
LIBYUV_API void SetUseReferenceImpl(LIBYUV_BOOL use); @@ -100,4 +128,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use); } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h index b56cf52099..7641f18e34 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ #define INCLUDE_LIBYUV_SCALE_ARGB_H_ #include "libyuv/basic_types.h" @@ -20,32 +20,52 @@ extern "C" { #endif LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering); // Clipped scale takes destination rectangle coordinates for clip values. LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering); // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering); #ifdef __cplusplus @@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h index df699e6c22..7194ba09f8 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ #define INCLUDE_LIBYUV_SCALE_ROW_H_ #include "libyuv/basic_types.h" @@ -19,17 +19,20 @@ namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif - // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) @@ -45,8 +48,8 @@ extern "C" { #endif // __clang__ // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -72,15 +75,16 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_SCALEADDROW_AVX2 #define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN4_AVX2 #endif // The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ +#if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SCALEARGBCOLS_NEON #define HAS_SCALEARGBROWDOWN2_NEON @@ -93,33 +97,51 @@ extern "C" { #define HAS_SCALEARGBFILTERCOLS_NEON #endif -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_DSPR2 -#define HAS_SCALEROWDOWN4_DSPR2 -#define HAS_SCALEROWDOWN34_DSPR2 -#define HAS_SCALEROWDOWN38_DSPR2 +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_SCALEADDROW_MSA +#define HAS_SCALEARGBCOLS_MSA +#define HAS_SCALEARGBFILTERCOLS_MSA +#define HAS_SCALEARGBROWDOWN2_MSA +#define HAS_SCALEARGBROWDOWNEVEN_MSA +#define HAS_SCALEFILTERCOLS_MSA +#define HAS_SCALEROWDOWN2_MSA +#define HAS_SCALEROWDOWN34_MSA +#define HAS_SCALEROWDOWN38_MSA +#define HAS_SCALEROWDOWN4_MSA #endif // Scale ARGB vertically with bilinear interpolation. 
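ScalePlaneVertical below, like the column scalers, steps through the source in 16.16 fixed point: FixedDiv encodes a ratio as (num << 16) / div, x/y are starting offsets, and dx/dy are per-pixel steps. A small illustration of the convention; the center-biased start and the exact fraction extraction are assumptions for the sketch, not quotes from the implementation:

    // Walk dst rows, mapping each back to a fractional src row in 16.16.
    static void VerticalStepSketch(int src_height, int dst_height) {
      int dy = (int)(((int64_t)src_height << 16) / dst_height);  // FixedDiv
      int y = dy / 2;  // start near the first row center
      for (int j = 0; j < dst_height; ++j) {
        int yi = y >> 16;         // integer source row to read
        int yf = (y >> 8) & 255;  // 8-bit fraction an InterpolateRow could use
        (void)yi;
        (void)yf;  // a real scaler blends rows yi and yi + 1 here
        y += dy;
      }
    }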
void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering); // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering); // Divide num by div and return as 16.16 fixed point result. @@ -137,367 +159,786 @@ int FixedDiv1_X86(int num, int div); #endif // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy); + int* x, + int* y, + int* dx, + int* dy); -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int, int); -void ScaleColsUp2_16_C(uint16* 
dst_ptr, const uint16* src_ptr, - int dst_width, int, int); -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, - ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); -void ScaleARGBRowDown2_C(const uint8* src_argb, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void 
ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int, + int); +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int, + int); +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width); +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int, int); -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); + uint8_t* dst_argb, + int dst_width); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int, + int); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); // Specialized scalers for x86. 
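The x86 rows declared below vectorize the same arithmetic as the C kernels above. ScaleRowDown2Box, for instance, averages each 2x2 source block with rounding; a sketch of that reference behavior, with an illustrative name:

    void ScaleRowDown2BoxSketch(const uint8_t* src, ptrdiff_t src_stride,
                                uint8_t* dst, int dst_width) {
      const uint8_t* next = src + src_stride;  // second row of each block
      for (int x = 0; x < dst_width; ++x) {
        // +2 rounds the 4-pixel sum before the divide-by-4 shift
        dst[x] = (uint8_t)((src[0] + src[1] + next[0] + next[1] + 2) >> 2);
        src += 2;
        next += 2;
      }
    }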
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void 
ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t 
src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); - -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); +void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Column functions -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t 
src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width); -void 
ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. // Note - not static due to reuse in convert for 444 to 420. 
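A note on the _Any_ variants that fill the rest of this header: they exist so SIMD kernels, which process a fixed number of pixels per iteration, still handle arbitrary widths. The usual shape is sketched here with an assumed 16-pixel NEON kernel; the wrapper name is hypothetical (the real wrappers are generated in the source files), but the two callees are the rows declared in this header:

    // Hypothetical wrapper; illustrates the dispatch only.
    void ScaleRowDown2_Any_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                  uint8_t* dst_ptr, int dst_width) {
      int n = dst_width & ~15;  // largest multiple of 16 the kernel covers
      if (n > 0) {
        ScaleRowDown2_NEON(src_ptr, src_stride, dst_ptr, n);
      }
      // The C row finishes the remainder; 2 source pixels per output pixel.
      ScaleRowDown2_C(src_ptr + 2 * n, src_stride, dst_ptr + n, dst_width - n);
    }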
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); -void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* 
dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); -void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); +void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_MSA(const uint8_t* 
src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); + +void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/version.h b/libs/libvpx/third_party/libyuv/include/libyuv/version.h index 0fbdc022d5..7022785d8c 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/version.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/version.h @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1616 +#define LIBYUV_VERSION 1711 -#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h b/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h index ad934e4241..bcef378b5a 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h @@ -10,7 +10,7 @@ // Common definitions for video, including fourcc and VideoFormat. -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ #define INCLUDE_LIBYUV_VIDEO_COMMON_H_ #include "libyuv/basic_types.h" @@ -28,13 +28,13 @@ extern "C" { // Needs to be a macro otherwise the OS X compiler complains when the kFormat* // constants are used in a switch. #ifdef __cplusplus -#define FOURCC(a, b, c, d) ( \ - (static_cast(a)) | (static_cast(b) << 8) | \ - (static_cast(c) << 16) | (static_cast(d) << 24)) +#define FOURCC(a, b, c, d) \ + ((static_cast(a)) | (static_cast(b) << 8) | \ + (static_cast(c) << 16) | (static_cast(d) << 24)) #else -#define FOURCC(a, b, c, d) ( \ - ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ - ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ #endif // Some pages discussing FourCC codes: @@ -53,38 +53,33 @@ enum FourCC { FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I411 = FOURCC('I', '4', '1', '1'), FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb - // 2 Secondary YUV formats: row biplanar. + // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. - // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. + // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. + FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated. - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - // 1 Primary Compressed YUV format. FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + // 7 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. 
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), @@ -112,7 +107,13 @@ enum FourCC { FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - // 1 Auxiliary compressed YUV format set aside for capturer. + // deprecated formats. Not supported, but defined for backward compatibility. + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), FOURCC_H264 = FOURCC('H', '2', '6', '4'), // Match any fourcc. @@ -136,8 +137,10 @@ enum FourCCBpp { FOURCC_BPP_BGRA = 32, FOURCC_BPP_ABGR = 32, FOURCC_BPP_RGBA = 32, + FOURCC_BPP_AR30 = 32, + FOURCC_BPP_AB30 = 32, FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, + FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, FOURCC_BPP_RGBO = 16, FOURCC_BPP_R444 = 16, @@ -152,6 +155,7 @@ enum FourCCBpp { FOURCC_BPP_J420 = 12, FOURCC_BPP_J400 = 8, FOURCC_BPP_H420 = 12, + FOURCC_BPP_H010 = 24, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, @@ -170,15 +174,15 @@ enum FourCCBpp { FOURCC_BPP_CM24 = 24, // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. + FOURCC_BPP_ANY = 0, // 0 means unknown. }; // Converts fourcc aliases into canonical ones. -LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); +LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/libs/libvpx/third_party/libyuv/source/compare.cc b/libs/libvpx/third_party/libyuv/source/compare.cc index e3846bdfdd..50e3abd055 100644 --- a/libs/libvpx/third_party/libyuv/source/compare.cc +++ b/libs/libvpx/third_party/libyuv/source/compare.cc @@ -29,10 +29,10 @@ extern "C" { // hash seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { @@ -45,25 +45,25 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { } #endif - while (count >= (uint64)(kBlockSize)) { + while (count >= (uint64_t)(kBlockSize)) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; } - remainder = (int)(count) & ~15; + remainder = (int)count & ~15; if (remainder) { seed = HashDjb2_SSE(src, remainder, seed); src += remainder; count -= remainder; } - remainder = (int)(count) & 15; + remainder = (int)count & 15; if (remainder) { seed = HashDjb2_C(src, remainder, seed); } return seed; } -static uint32 ARGBDetectRow_C(const uint8* argb, int width) { +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. @@ -94,8 +94,11 @@ static uint32 ARGBDetectRow_C(const uint8* argb, int width) { // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. 
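// Editorial note, not part of the patch: ARGBDetect returns one of the packed
// codes built by the FOURCC macro in video_common.h above. The packing is
// little-endian, first character in the low byte, for example:
//   FOURCC('I', '4', '2', '0')
//     = 'I' | '4' << 8 | '2' << 16 | '0' << 24
//     = 0x49 | 0x3400 | 0x320000 | 0x30000000
//     = 0x30323449   // reads "I420" byte-by-byte in memory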
LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { - uint32 fourcc = 0; +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; int h; // Coalesce rows. @@ -111,19 +114,80 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. + +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + // TODO(fbarchard): Refactor into row function. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, - int count) { +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { // SumSquareError returns values 0 to 65535 for each squared difference. - // Up to 65536 of those can be summed and remain within a uint32. - // After each block of 65536 pixels, accumulate into a uint64. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. 
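// Editorial sketch of the overflow bounds behind these block sizes (not part
// of the patch): each squared byte difference is at most 255 * 255 = 65025,
// and 65536 of them sum to at most
//   65536 * 65025 = 4262068224 < 4294967296 = 2^32,
// so a uint32_t accumulator is safe within one 65536-pixel block. Similarly,
// the NEON Hamming kernel accumulates per-byte popcounts (at most 8) into
// 16-bit lanes, which caps a single call at 65536 - 32 = 65504 bytes and is
// why ComputeHammingDistance above splits the work into 32768-byte blocks.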
const int kBlockSize = 65536; int remainder = count & (kBlockSize - 1) & ~31; - uint64 sse = 0; + uint64_t sse = 0; int i; - uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = - SumSquareError_C; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON)) { SumSquareError = SumSquareError_NEON; @@ -141,8 +205,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, SumSquareError = SumSquareError_AVX2; } #endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif #ifdef _OPENMP -#pragma omp parallel for reduction(+: sse) +#pragma omp parallel for reduction(+ : sse) #endif for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { sse += SumSquareError(src_a + i, src_b + i, kBlockSize); @@ -162,14 +231,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { - uint64 sse = 0; +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; int h; // Coalesce rows. - if (stride_a == width && - stride_b == width) { + if (stride_a == width && stride_b == width) { width *= height; height = 1; stride_a = stride_b = 0; @@ -183,66 +254,76 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, } LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count) { +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { double psnr; if (sse > 0) { - double mse = (double)(count) / (double)(sse); + double mse = (double)count / (double)sse; psnr = 10.0 * log10(255.0 * 255.0 * mse); } else { - psnr = kMaxPsnr; // Limit to prevent divide by 0 + psnr = kMaxPsnr; // Limit to prevent divide by 0 } - if (psnr > kMaxPsnr) + if (psnr > kMaxPsnr) { psnr = kMaxPsnr; + } return psnr; } LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { - const uint64 samples = width * height; - const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, - src_b, stride_b, - width, height); +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); return SumSquareErrorToPsnr(sse, samples); } LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, - src_y_b, stride_y_b, - width, height); +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int 
width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, - src_u_b, stride_u_b, - width_uv, height_uv); - const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, - src_v_b, stride_v_b, - width_uv, height_uv); - const uint64 samples = width * height + 2 * (width_uv * height_uv); - const uint64 sse = sse_y + sse_u + sse_v; + const uint64_t sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64_t sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; return SumSquareErrorToPsnr(sse, samples); } -static const int64 cc1 = 26634; // (64^2*(.01*255)^2 -static const int64 cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double Ssim8x8_C(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) { - int64 sum_a = 0; - int64 sum_b = 0; - int64 sum_sq_a = 0; - int64 sum_sq_b = 0; - int64 sum_axb = 0; +static double Ssim8x8_C(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b) { + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; int i; for (i = 0; i < 8; ++i) { @@ -260,22 +341,22 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, } { - const int64 count = 64; + const int64_t count = 64; // scale the constants by number of pixels - const int64 c1 = (cc1 * count * count) >> 12; - const int64 c2 = (cc2 * count * count) >> 12; + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t c2 = (cc2 * count * count) >> 12; - const int64 sum_a_x_sum_b = sum_a * sum_b; + const int64_t sum_a_x_sum_b = sum_a * sum_b; - const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - const int64 sum_a_sq = sum_a*sum_a; - const int64 sum_b_sq = sum_b*sum_b; + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; - const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * - (count * sum_sq_a - sum_a_sq + - count * sum_sq_b - sum_b_sq + c2); + const int64_t ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); if (ssim_d == 0.0) { return DBL_MAX; @@ -288,13 +369,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
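// Editorial note on Ssim8x8_C above (not part of the patch): it evaluates the
// standard SSIM formula over an 8x8 window,
//   SSIM = (2*mu_a*mu_b + C1) * (2*cov_ab + C2)
//        / ((mu_a^2 + mu_b^2 + C1) * (var_a + var_b + C2)),
// but keeps raw integer sums S = N*mu with N = 64 instead of means. Clearing
// denominators scales each factor by N^2, so the constants are pre-scaled as
// cc1 = 4096 * (0.01*255)^2 and cc2 = 4096 * (0.03*255)^2, and
// c1 = (cc1 * N * N) >> 12 restores the N^2 factor (4096 = 2^12 = 64^2).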
LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { int samples = 0; double ssim_total = 0; - double (*Ssim8x8)(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) = Ssim8x8_C; + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, + int stride_b) = Ssim8x8_C; // sample point start with each 4x4 location int i; @@ -314,22 +398,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a, } LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, - src_y_b, stride_y_b, width, height); +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, - src_u_b, stride_u_b, + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, - src_v_b, stride_v_b, + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); } diff --git a/libs/libvpx/third_party/libyuv/source/compare_common.cc b/libs/libvpx/third_party/libyuv/source/compare_common.cc index 42fc589354..d4b170ad98 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_common.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_common.cc @@ -17,20 +17,80 @@ namespace libyuv { extern "C" { #endif -uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse = 0u; +#if ORIGINAL_OPT +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count; ++i) { + int x = src_a[i] ^ src_b[i]; + if (x & 1) + ++diff; + if (x & 2) + ++diff; + if (x & 4) + ++diff; + if (x & 8) + ++diff; + if (x & 16) + ++diff; + if (x & 32) + ++diff; + if (x & 64) + ++diff; + if (x & 128) + ++diff; + } + return diff; +} +#endif + +// Hakmem method for hamming distance. 
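// Editorial gloss on the SWAR bit trick used below (not part of the patch):
//   u = x - ((x >> 1) & 0x55555555);                 // 2-bit field counts
//   u = ((u >> 2) & 0x33333333) + (u & 0x33333333);  // 4-bit field counts
//   (((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24;  // sum the 4 bytes
// Worked example for x = 0x000000FF:
//   u = 0xFF - 0x55 = 0xAA; then 0x22 + 0x22 = 0x44;
//   (0x44 + 0x04) & 0x0f = 8, the popcount of 0xFF.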
+uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; int i; for (i = 0; i < count; ++i) { int diff = src_a[i] - src_b[i]; - sse += (uint32)(diff * diff); + sse += (uint32_t)(diff * diff); } return sse; } // hash seed of 5381 recommended. // Internal C version of HashDjb2 with int sized count for efficiency. -uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { - uint32 hash = seed; +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; int i; for (i = 0; i < count; ++i) { hash += (hash << 5) + src[i]; diff --git a/libs/libvpx/third_party/libyuv/source/compare_gcc.cc b/libs/libvpx/third_party/libyuv/source/compare_gcc.cc index 1b83edb166..676527c1b1 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_gcc.cc @@ -22,124 +22,334 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse; - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10, 1) ",%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // Process 32 bytes per loop. 
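      // Editorial note (not part of the patch): the four popcnt chains below
      // (rcx, rdx, rsi, rdi) feed four separate accumulators (%3, r8, r9,
      // r10); keeping the chains independent hides popcnt's multi-cycle
      // latency, and the partial sums are folded into %3 after the loop.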
+ LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw 
$0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } -static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 -static uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; -static uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; -static uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; -static uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { - uint32 hash; - asm volatile ( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 
\n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) @@ -148,4 +358,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { } // extern "C" } // namespace libyuv #endif - diff --git a/libs/libvpx/third_party/libyuv/source/compare_msa.cc b/libs/libvpx/third_party/libyuv/source/compare_msa.cc new file mode 100644 index 0000000000..0b807d37be --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/compare_msa.cc @@ -0,0 +1,97 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/compare_neon.cc b/libs/libvpx/third_party/libyuv/source/compare_neon.cc index 49aa3b4eef..2a2181e0cb 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_neon.cc @@ -21,40 +21,70 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! 
\n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); return sse; } diff --git a/libs/libvpx/third_party/libyuv/source/compare_neon64.cc b/libs/libvpx/third_party/libyuv/source/compare_neon64.cc index f9c7df98c8..6e8f672ab7 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_neon64.cc @@ -20,39 +20,65 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 
\n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } diff --git a/libs/libvpx/third_party/libyuv/source/compare_win.cc b/libs/libvpx/third_party/libyuv/source/compare_win.cc index dc86fe25b1..d57d3d9d1c 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_win.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_win.cc @@ -13,20 +13,39 @@ #include "libyuv/compare_row.h" #include "libyuv/row.h" +#if defined(_MSC_VER) +#include // For __popcnt +#endif + #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -__declspec(naked) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} + +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 @@ -61,13 +80,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
-#pragma warning(disable: 4752) -__declspec(naked) -uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { +#pragma warning(disable : 4752) +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count vpxor ymm0, ymm0, ymm0 // sum vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax @@ -101,65 +120,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { } #endif // _MSC_VER >= 1700 -uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -__declspec(naked) -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck + pxor xmm7, xmm7 // constant 0 for unpck movdqa xmm6, xmmword ptr kHash16x33 wloop: - movdqu xmm1, [eax] // src[0-15] + movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 + pmulld xmm0, xmm6 // hash *= 33 ^ 16 movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] + punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] + punpcklwd xmm3, xmm7 // src[0-3] pmulld xmm3, xmm5 movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] + punpckhwd xmm4, xmm7 // src[4-7] pmulld xmm4, xmm5 movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] + punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] + punpcklwd xmm2, xmm7 // src[8-11] pmulld xmm2, xmm5 movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] + punpckhwd xmm1, xmm7 // src[12-15] pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results + paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 paddd xmm1, xmm3 @@ -171,18 +190,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - movd eax, xmm0 // return hash + movd eax, xmm0 // return hash ret } } // Visual C 2012 required for AVX2. 
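// Editorial sketch of why the vectorized hash above matches plain djb2
// (Step16 is a hypothetical helper for illustration, not part of libyuv):
// unrolling hash = hash * 33 + c sixteen times yields
// hash * 33^16 + sum(src[i] * 33^(15 - i)), which is exactly what
// kHash16x33 and the kHashMul0..3 tables encode, 16 bytes per iteration.
#include <stdint.h>
static uint32_t Step16(uint32_t hash, const uint8_t* p) {
  uint32_t mul = 1;  // ends at 33^16 mod 2^32 (0x92d9e201, per kHash16x33)
  uint32_t sum = 0;
  int i;
  for (i = 15; i >= 0; --i) {
    sum += (uint32_t)p[i] * mul;  // contributes p[i] * 33^(15 - i)
    mul *= 33;
  }
  return hash * mul + sum;
}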
#if _MSC_VER >= 1700 -__declspec(naked) -uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count vmovd xmm0, [esp + 12] // seed wloop: @@ -196,7 +215,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm3, xmm3, xmm4 // add 16 results vpaddd xmm1, xmm1, xmm2 vpaddd xmm1, xmm1, xmm3 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords @@ -207,7 +226,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - vmovd eax, xmm0 // return hash + vmovd eax, xmm0 // return hash vzeroupper ret } diff --git a/libs/libvpx/third_party/libyuv/source/convert.cc b/libs/libvpx/third_party/libyuv/source/convert.cc index a33742d24d..375cc732c1 100644 --- a/libs/libvpx/third_party/libyuv/source/convert.cc +++ b/libs/libvpx/third_party/libyuv/source/convert.cc @@ -14,8 +14,8 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" -#include "libyuv/scale.h" // For ScalePlane() #include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() #ifdef __cplusplus namespace libyuv { @@ -28,14 +28,22 @@ static __inline int Abs(int v) { } // Any I4xx To I420 format with mirroring. -static int I4xxToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int src_uv_width, int src_uv_height) { +static int I4xxToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); @@ -44,35 +52,37 @@ static int I4xxToI420(const uint8* src_y, int src_stride_y, return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } -// Copy I420 with optional flipping +// Copy I420 with optional flipping. // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure // is does row coalescing. 
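// Editorial note on the converters that follow (not part of the patch):
// I4xxToI420 handles each plane independently; e.g. I422 chroma (half width,
// full height) is resampled to I420 chroma (half width, half height) by
// ScalePlane with kFilterBilinear, so only the vertical direction changes.
// SUBSAMPLE(width, 1, 1) is the rounded-up halving, (width + 1) >> 1 for
// positive widths, matching the halfwidth/halfheight computations below.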
LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -96,79 +106,152 @@ int I420Copy(const uint8* src_y, int src_stride_y, return 0; } +// Copy I010 with optional flipping. +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert 10 bit YUV to 8 bit. +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, + height); + // Convert UV planes. 
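  // Editorial note (not part of the patch): the scale argument 16384 is a
  // 16.16 fixed-point factor, so (v * 16384) >> 16 == v >> 2; this appears
  // to reduce 10-bit samples to 8 bits by dropping the two low bits.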
+ Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, + halfheight); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, + halfheight); + return 0; +} + // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int src_uv_width = SUBSAMPLE(width, 1, 1); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); } // 444 chroma is 1x width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - width, height); -} - -// 411 chroma is 1/4 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int src_uv_width = SUBSAMPLE(width, 3, 2); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); } // I400 is greyscale typically used in MJPG LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!dst_u || !dst_v || - width <= 0 || height == 0) { + if (!dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative 
height means invert the image. @@ -186,11 +269,15 @@ int I400ToI420(const uint8* src_y, int src_stride_y, return 0; } -static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, int dst_stride, - int width, int height) { +static void CopyPlane2(const uint8_t* src, + int src_stride_0, + int src_stride_1, + uint8_t* dst, + int dst_stride, + int width, + int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -211,11 +298,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height - 1; y += 2) { @@ -238,17 +320,22 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // src_stride_m420 is row planar. Normally this will be the width in pixels. // The UV plane is half width, but 2 values, so src_stride_m420 applies to // this as well as the two Y planes. -static int X420ToI420(const uint8* src_y, - int src_stride_y0, int src_stride_y1, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +static int X420ToI420(const uint8_t* src_y, + int src_stride_y0, + int src_stride_y1, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_uv || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -265,16 +352,14 @@ static int X420ToI420(const uint8* src_y, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_y0 == width && - src_stride_y1 == width && + if (src_stride_y0 == width && src_stride_y1 == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = dst_stride_y = 0; } // Coalesce rows. - if (src_stride_uv == halfwidth * 2 && - dst_stride_u == halfwidth && + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && dst_stride_v == halfwidth) { halfwidth *= halfheight; halfheight = 1; @@ -299,63 +384,78 @@ static int X420ToI420(const uint8* src_y, // Convert NV12 to I420. 
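A note on the new I010ToI420 above: it hands Convert16To8Plane a fixed-point scale of 16384. Assuming the usual convention for these helpers, that each output sample is (value * scale) >> 16, a scale of 16384 (1 << 14) reduces to a plain 2-bit right shift, which is exactly the 10-bit-to-8-bit narrowing this converter needs. A minimal, self-contained sketch of that arithmetic (a stand-in, not the library's row kernel):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a (value * scale) >> 16 row core. */
    static void scale_16_to_8(const uint16_t* src, uint8_t* dst, int width,
                              int scale) {
      int x;
      for (x = 0; x < width; ++x) {
        uint32_t v = ((uint32_t)src[x] * (uint32_t)scale) >> 16;
        dst[x] = v > 255 ? 255 : (uint8_t)v; /* clamp, like clamp255() */
      }
    }

    int main(void) {
      const uint16_t src[4] = {0, 512, 1020, 1023}; /* 10-bit samples */
      uint8_t dst[4];
      scale_16_to_8(src, dst, 4, 16384); /* 16384 == 1 << 14, i.e. v >> 2 */
      printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); /* 0 128 255 255 */
      return 0;
    }

Presumably a different scale would let the same core narrow 12- or 16-bit sources, which would explain why the parameter exists at all.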
LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); } // Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_vu, src_stride_vu, - dst_y, dst_stride_y, - dst_v, dst_stride_v, - dst_u, dst_stride_u, - width, height); +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, + dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, width, height); } // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); } // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, - uint8* dst_y, int width) = YUY2ToYRow_C; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -392,6 +492,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -411,16 +521,22 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I420. LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -457,6 +573,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -476,19 +602,23 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, // Convert ARGB to I420. LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -533,6 +663,22 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -552,19 +698,23 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, // Convert BGRA to I420. LIBYUV_API -int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) = + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = BGRAToYRow_C; - if (!src_bgra || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -592,12 +742,28 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, } #endif #if defined(HAS_BGRATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToUVRow = BGRAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } #endif for (y = 0; y < height - 1; y += 2) { @@ -618,19 +784,23 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, // Convert ABGR to I420. 
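Every MSA hunk in this file follows the dispatch shape visible above: start from the portable _C row function, switch to the _Any_ variant once TestCpuFlag reports the feature, and only take the fully vectorized variant when the width satisfies the kernel's alignment requirement (the _Any_ form runs SIMD over the bulk and finishes the ragged tail in scalar code). A compressed sketch of the pattern, with hypothetical names (FooRow_*, HasFastPath standing in for TestCpuFlag):

    #include <stdint.h>

    #define IS_ALIGNED(v, a) (((v) & ((a)-1)) == 0)

    /* Hypothetical row variants, like ARGBToYRow_C/_Any_MSA/_MSA. */
    static void FooRow_C(const uint8_t* src, uint8_t* dst, int width) {
      while (width-- > 0) *dst++ = *src++; /* scalar reference path */
    }
    static void FooRow_Any_SIMD(const uint8_t* src, uint8_t* dst, int width) {
      FooRow_C(src, dst, width); /* pretend: SIMD bulk + scalar tail */
    }
    static void FooRow_SIMD(const uint8_t* src, uint8_t* dst, int width) {
      FooRow_C(src, dst, width); /* pretend: width is whole vectors */
    }

    static int HasFastPath(void) { return 1; } /* stand-in for TestCpuFlag */

    void Convert(const uint8_t* src, uint8_t* dst, int width, int height) {
      void (*FooRow)(const uint8_t*, uint8_t*, int) = FooRow_C;
      int y;
      if (HasFastPath()) {
        FooRow = FooRow_Any_SIMD;   /* any width: vector bulk, scalar tail */
        if (IS_ALIGNED(width, 16)) {
          FooRow = FooRow_SIMD;     /* rows divide evenly into vectors */
        }
      }
      for (y = 0; y < height; ++y) {
        FooRow(src, dst, width);
        src += width;
        dst += width;
      }
    }

Resolving the function pointer once per call keeps the per-row loop itself branch-free.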
LIBYUV_API -int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) = + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; - if (!src_abgr || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -665,6 +835,22 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -684,19 +870,23 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, // Convert RGBA to I420. LIBYUV_API -int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) = + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = RGBAToYRow_C; - if (!src_rgba || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -731,6 +921,22 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, } } #endif +#if defined(HAS_RGBATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToUVRow = RGBAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -750,27 +956,33 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, // Convert RGB24 to I420. 
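The *ToI420 converters above and below all share one loop shape: each iteration consumes two source rows, because the UV row function averages chroma vertically across the pair, and a trailing `height & 1` check handles the last row of odd-height images by passing a source stride of 0 so the same row is read twice. A stripped-down sketch of just that structure (the row helpers here are deliberately dummy bodies; the point is the loop):

    #include <stdint.h>
    #include <string.h>

    /* Dummy row helpers; the real ones do color math. ToUVRow reads the
       row at src and the row at src + src_stride to average vertically. */
    static void ToYRow(const uint8_t* src, uint8_t* dst_y, int width) {
      memcpy(dst_y, src, (size_t)width);
    }
    static void ToUVRow(const uint8_t* src, int src_stride, uint8_t* dst_u,
                        uint8_t* dst_v, int width) {
      int x;
      for (x = 0; x < width / 2; ++x) {
        dst_u[x] = (uint8_t)((src[2 * x] + src[2 * x + src_stride] + 1) / 2);
        dst_v[x] = dst_u[x];
      }
    }

    void ToI420Loop(const uint8_t* src, int src_stride, uint8_t* dst_y,
                    int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
                    uint8_t* dst_v, int dst_stride_v, int width, int height) {
      int y;
      for (y = 0; y < height - 1; y += 2) {
        ToUVRow(src, src_stride, dst_u, dst_v, width);
        ToYRow(src, dst_y, width);
        ToYRow(src + src_stride, dst_y + dst_stride_y, width);
        src += src_stride * 2;
        dst_y += dst_stride_y * 2;
        dst_u += dst_stride_u;
        dst_v += dst_stride_v;
      }
      if (height & 1) {
        /* Stride 0 makes the UV helper read the same row twice. */
        ToUVRow(src, 0, dst_u, dst_v, width);
        ToYRow(src, dst_y, width);
      }
    }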
LIBYUV_API -int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB24TOYROW_NEON) - void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) = +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYRow_C; #else - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -792,6 +1004,15 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } } +#elif defined(HAS_RGB24TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -822,14 +1043,17 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#endif + { +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -846,7 +1070,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else @@ -855,36 +1079,41 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RGB24TOYROW_NEON) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert RAW to I420. 
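When no direct RGB24 row kernel exists, the function above converts through a two-row ARGB scratch buffer, and the restructured braces now scope that buffer so it is only allocated on the fallback path. The size math deserves a note: (width * 4 + 31) & ~31 rounds one ARGB row up to a multiple of 32 bytes so both scratch rows stay aligned for the SIMD ARGBToY/ARGBToUV kernels. A sketch of the same computation, emulating the library's align_buffer_64 macro with a plain malloc:

    #include <stdint.h>
    #include <stdlib.h>

    int main(void) {
      int width = 100; /* arbitrary example width */
      /* Round one ARGB row (width * 4 bytes) up to a multiple of 32. */
      const int kRowSize = (width * 4 + 31) & ~31; /* 400 -> 416 */
      /* Two rows, because chroma is averaged over row pairs. Over-allocate
         by 63 bytes and round the pointer up to a 64-byte boundary. */
      uint8_t* mem = (uint8_t*)malloc((size_t)kRowSize * 2 + 63);
      uint8_t* row;
      if (!mem) return 1;
      row = (uint8_t*)(((uintptr_t)mem + 63) & ~(uintptr_t)63);
      {
        uint8_t* row0 = row;            /* ARGB for the even source row */
        uint8_t* row1 = row + kRowSize; /* ARGB for the odd source row  */
        (void)row0;
        (void)row1;
      }
      free(mem);
      return 0;
    }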
LIBYUV_API -int RAWToI420(const uint8* src_raw, int src_stride_raw, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RAWTOYROW_NEON) - void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) = +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYRow_C; #else - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_raw || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -906,6 +1135,15 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } } +#elif defined(HAS_RAWTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -936,14 +1174,17 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } #endif +#endif + { +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RAWTOYROW_NEON) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); @@ -960,7 +1201,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RAWTOYROW_NEON) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else @@ -969,36 +1210,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RAWTOYROW_NEON) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert RGB565 to I420. 
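The RGB565 converter that follows takes the same two routes: a direct NEON/MSA row kernel, or widening each 16-bit pixel to ARGB first. The conventional widening replicates each channel's top bits into its bottom bits so that full-scale 5- and 6-bit values land exactly on 255; a sketch of that standard expansion (the idea, not code copied from the library's RGB565ToARGBRow_C):

    #include <stdint.h>
    #include <stdio.h>

    /* Expand one little-endian RGB565 pixel to 8-bit B, G, R. */
    static void rgb565_to_888(uint16_t p, uint8_t* b, uint8_t* g, uint8_t* r) {
      uint8_t b5 = (uint8_t)(p & 0x1f);
      uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
      uint8_t r5 = (uint8_t)((p >> 11) & 0x1f);
      *b = (uint8_t)((b5 << 3) | (b5 >> 2)); /* 31 -> 255 */
      *g = (uint8_t)((g6 << 2) | (g6 >> 4)); /* 63 -> 255 */
      *r = (uint8_t)((r5 << 3) | (r5 >> 2));
    }

    int main(void) {
      uint8_t b, g, r;
      rgb565_to_888(0xffff, &b, &g, &r);
      printf("%d %d %d\n", b, g, r); /* 255 255 255 */
      return 0;
    }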
LIBYUV_API -int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB565TOYROW_NEON) - void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) = +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = RGB565ToYRow_C; #else - void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1020,6 +1267,15 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } } } +#elif defined(HAS_RGB565TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -1057,15 +1313,16 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, ARGBToYRow = ARGBToYRow_AVX2; } } +#endif #endif { +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -1082,7 +1339,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -1091,36 +1348,43 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RGB565TOYROW_NEON) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert ARGB1555 to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_ARGB1555TOYROW_NEON) - void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) = - ARGB1555ToYRow_C; +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; #else - void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb1555 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -1142,6 +1406,15 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } } +#elif defined(HAS_ARGB1555TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from ARGB1555 to ARGB. 
#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -1179,15 +1452,17 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGBToYRow = ARGBToYRow_AVX2; } } +#endif #endif { +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -1206,7 +1481,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -1215,36 +1490,43 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_ARGB1555TOYROW_NEON) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert ARGB4444 to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; #if defined(HAS_ARGB4444TOYROW_NEON) - void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) = - ARGB4444ToYRow_C; + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; #else - void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb4444 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
@@ -1284,6 +1566,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1304,7 +1594,22 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif +#endif + { +#if !defined(HAS_ARGB4444TOYROW_NEON) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); @@ -1341,13 +1646,15 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #if !defined(HAS_ARGB4444TOYROW_NEON) free_aligned_buffer_64(row); - } #endif + } return 0; } -static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv, - uint8* dst_u, int width) { +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { int i; for (i = 0; i < width; ++i) { *dst_u = *src_u; @@ -1358,21 +1665,26 @@ static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv, // Convert Android420 to I420. LIBYUV_API -int Android420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, int src_pixel_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - const int vu_off = src_v - src_u; + const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -1396,15 +1708,16 @@ int Android420ToI420(const uint8* src_y, int src_stride_y, CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; - // Split UV planes - NV21 - } else if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { + // Split UV planes - NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, halfwidth, halfheight); return 0; - // Split UV planes - NV12 - } else if (src_pixel_stride_uv == 2 && vu_off == 1 && - src_stride_u == src_stride_v) { + // Split UV planes - NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; diff --git a/libs/libvpx/third_party/libyuv/source/convert_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_argb.cc index fb9582d627..f2fe474f70 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_argb.cc @@ -26,11 +26,13 @@ extern "C" { // Copy ARGB with optional flipping LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || - width <= 0 || height == 0) { +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -40,27 +42,29 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } - CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width * 4, height); + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); return 0; } -// Convert I422 to ARGB with matrix -static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +// Convert I420 to ARGB with matrix +static int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
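The Android420ToI420 logic above (now restructured from else-if chains into early returns) classifies the Android YUV_420_888 layout from two facts: the UV pixel stride, and the signed distance between the V and U pointers, vu_off, which this patch widens from int to ptrdiff_t since the planes can sit arbitrarily far apart. Pixel stride 1 means genuinely planar I420; pixel stride 2 with the planes one byte apart means interleaved chroma, NV12 when vu_off == 1 and NV21 when vu_off == -1; anything else falls back to the per-pixel SplitPixels copy. A sketch of that classification (it also assumes, as the real checks do, equal U and V strides):

    #include <stddef.h>
    #include <stdint.h>

    enum Layout { kI420, kNV12, kNV21, kOther };

    static enum Layout Classify(const uint8_t* u, const uint8_t* v,
                                int pixel_stride_uv) {
      /* Well-defined only when u and v point into the same buffer. */
      ptrdiff_t vu_off = v - u;
      if (pixel_stride_uv == 1) return kI420;                  /* planar  */
      if (pixel_stride_uv == 2 && vu_off == 1) return kNV12;   /* UVUV... */
      if (pixel_stride_uv == 2 && vu_off == -1) return kNV21;  /* VUVU... */
      return kOther; /* odd strides: copy sample by sample */
    }

    int main(void) {
      uint8_t uv[4] = {0, 0, 0, 0};
      return Classify(uv, uv + 1, 2) == kNV12 ? 0 : 1;
    }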
@@ -93,13 +97,12 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif @@ -117,111 +120,130 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I420 to ABGR. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J420 to ABGR. 
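The wrappers above illustrate the scheme this file uses everywhere: one static *Matrix core does the work, and the public entry points differ only in which YuvConstants table they pass (kYuvI601Constants for plain I420/I422, kYuvJPEGConstants for the J* variants, kYuvH709Constants for the H* variants), while the ABGR entry points reuse the ARGB core by swapping the U and V arguments together with the mirrored kYvu* table, since exchanging the chroma planes exchanges the R and B outputs. The shape, reduced to a sketch with hypothetical names:

    /* Hypothetical stand-ins for the kYuv / kYvu tables and a Matrix core. */
    struct YuvConstants { int id; };
    static const struct YuvConstants kBt601 = {0};
    static const struct YuvConstants kBt601Vu = {1}; /* mirrored, U/V swap */

    static int ToARGBMatrix(const unsigned char* y, const unsigned char* u,
                            const unsigned char* v, unsigned char* dst,
                            const struct YuvConstants* c, int w, int h) {
      (void)y; (void)u; (void)v; (void)dst; (void)c; (void)w; (void)h;
      return 0; /* conversion loop elided */
    }

    int ToARGB(const unsigned char* y, const unsigned char* u,
               const unsigned char* v, unsigned char* dst, int w, int h) {
      return ToARGBMatrix(y, u, v, dst, &kBt601, w, h);
    }

    int ToABGR(const unsigned char* y, const unsigned char* u,
               const unsigned char* v, unsigned char* dst, int w, int h) {
      /* Swapping the chroma planes swaps R and B in the output, so the
         ARGB core plus the mirrored matrix yields ABGR. */
      return ToARGBMatrix(y, v, u, dst, &kBt601Vu, w, h);
    }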
LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -231,10 +253,8 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; @@ -263,13 +283,12 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif @@ -285,111 +304,380 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J422 to ARGB. 
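I422ToARGBMatrix above keeps the usual coalescing optimization: when every plane is contiguous (luma stride equals width, each I422 chroma stride times 2 equals width, destination stride equals width * 4), the image can be treated as a single row of width * height pixels, so the row kernel runs once instead of once per row; the strides are then zeroed because the loop still adds them after that single pass. The trick in isolation, applied to a plain plane copy:

    #include <stdint.h>
    #include <string.h>

    /* Copy a plane; if rows are contiguous, collapse to one long row. */
    void CopyPlaneCoalesced(const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride,
                            int width, int height) {
      int y;
      if (src_stride == width && dst_stride == width) {
        width *= height; /* whole image becomes one row */
        height = 1;
        src_stride = dst_stride = 0;
      }
      for (y = 0; y < height; ++y) {
        memcpy(dst, src, (size_t)width);
        src += src_stride;
        dst += dst_stride;
      }
    }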
LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H422 to ABGR. LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. 
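Before I010ToAR30Matrix below, it is worth pinning down the destination: AR30 packs a pixel into 32 bits as 2 bits of alpha plus 10 bits per color channel, so 10-bit YUV reaches RGB without first being rounded down to 8 bits. A sketch of packing one pixel, assuming alpha occupies the top two bits and blue the lowest ten (this channel order is the sketch's assumption, not taken from the headers):

    #include <stdint.h>
    #include <stdio.h>

    /* Pack 10-bit r, g, b (0..1023) and 2-bit a (0..3) into an AR30 word. */
    static uint32_t PackAR30(uint32_t r, uint32_t g, uint32_t b, uint32_t a) {
      return (a << 30) | (r << 20) | (g << 10) | b;
    }

    int main(void) {
      /* Opaque full-scale white: all channels 1023, alpha 3. */
      printf("0x%08x\n", (unsigned)PackAR30(1023, 1023, 1023, 3)); /* 0xffffffff */
      return 0;
    }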
+static int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. 
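Note the chroma stepping inside I010ToAR30Matrix above: the loop emits one destination row per iteration, but the source is 4:2:0, so src_u and src_v advance only after odd rows via the `y & 1` test. A height-h image therefore consumes ceil(h / 2) chroma rows, as this tiny check illustrates:

    #include <stdio.h>

    int main(void) {
      int height = 7; /* odd height, to show the rounding */
      int y, y_rows = 0, uv_rows = 1; /* chroma row 0 is in use from row 0 */
      for (y = 0; y < height; ++y) {
        ++y_rows;        /* src_y += src_stride_y on every row */
        if (y & 1) {
          ++uv_rows;     /* src_u/src_v advance after every odd row */
        }
      }
      /* 7 luma rows consume ceil(7 / 2) = 4 chroma rows. */
      printf("%d %d\n", y_rows, uv_rows); /* 7 4 */
      return 0;
    }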
+LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +static int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H010 to ABGR. 
+LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I444ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -399,9 +687,7 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u == width && - src_stride_v == width && + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -431,6 +717,14 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -444,138 +738,81 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I444 to ABGR. 
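I444ToARGBMatrix above differs from the 4:2:0 loops in exactly one structural way: chroma is full resolution, so the U and V pointers advance on every row, and the coalescing test compares the chroma strides directly against width rather than width / 2. The difference in row indexing, spelled out:

    /* Per-row chroma indexing, 4:4:4 vs 4:2:0 (offsets, not pointers). */
    #include <stdio.h>

    int main(void) {
      int height = 4;
      int y;
      for (y = 0; y < height; ++y) {
        int u_row_444 = y;      /* full-res chroma: one U row per Y row */
        int u_row_420 = y >> 1; /* subsampled: one U row per 2 Y rows  */
        printf("y=%d  444 U row=%d  420 U row=%d\n", y, u_row_444, u_row_420);
      }
      return 0;
    }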
LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); -} - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - int y; - void (*I411ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I411ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 4 == width && - src_stride_v * 4 == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I411TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I411ToARGBRow = I411ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I411TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I411ToARGBRow = I411ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I411ToARGBRow = I411ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I411TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I411ToARGBRow = I411ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert I420 with Alpha to preattenuated ARGB. 
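The attenuate flag handled below selects premultiplied output: after the YUV-to-RGB step each color channel is scaled by the pixel's alpha, the form most blending pipelines expect. An idealized scalar model of what the ARGBAttenuateRow kernels vectorize (the real kernels use shift-based approximations of the divide, so low-bit rounding can differ):

    // One ARGB pixel in memory order B,G,R,A; scale color by alpha.
    static void AttenuatePixel(uint8_t* argb) {
      uint32_t a = argb[3];
      argb[0] = (uint8_t)((argb[0] * a) / 255);  // B
      argb[1] = (uint8_t)((argb[1] * a) / 255);  // G
      argb[2] = (uint8_t)((argb[2] * a) / 255);  // R
    }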
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, +static int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height, int attenuate) { + int width, + int height, + int attenuate) { int y; - void (*I422AlphaToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -608,13 +845,12 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422ALPHATOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) @@ -641,6 +877,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -661,49 +905,59 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 with Alpha to ARGB. 
LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_a, src_stride_a, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height, attenuate); +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); } // Convert I420 with Alpha to ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - src_a, src_stride_a, - dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); } // Convert I400 to ARGB. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*I400ToARGBRow)(const uint8* y_buf, - uint8* rgb_buf, - int width) = I400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -713,8 +967,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -743,6 +996,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); @@ -754,14 +1015,16 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, // Convert J400 to ARGB. 
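The two grayscale paths differ only in range handling: I400ToARGB above treats the source as video-range luma and runs it through the usual Y expansion, while J400ToARGB below assumes full-range (JPEG) gray and replicates the byte directly. A scalar sketch of that replication, on the assumption that J400ToARGBRow_C does no range scaling:

    static void GrayRowToARGB(const uint8_t* src_y, uint8_t* dst_argb,
                              int width) {
      for (int x = 0; x < width; ++x) {
        uint8_t g = src_y[x];
        dst_argb[4 * x + 0] = g;    // B
        dst_argb[4 * x + 1] = g;    // G
        dst_argb[4 * x + 2] = g;    // R
        dst_argb[4 * x + 3] = 255;  // A, opaque
      }
    }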
LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) = + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = J400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -771,8 +1034,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -800,6 +1062,14 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, J400ToARGBRow = J400ToARGBRow_NEON; } } +#endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); @@ -810,85 +1080,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, } // Shuffle table for converting BGRA to ARGB. -static uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; // Shuffle table for converting ABGR to ARGB. -static uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; // Shuffle table for converting RGBA to ARGB. -static uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u -}; +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; // Convert BGRA to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). LIBYUV_API -int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int ARGBToBGRA(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ABGR to ARGB. 
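Marking the shuffle tables const, as the hunk above does, lets them live in read-only data. Each table maps destination byte i of a 16-byte group to source byte mask[i], so one PSHUFB-style pass converts between any two 32-bit channel orders. A scalar model of the shuffle ARGBShuffle applies (width is assumed a multiple of 4 here; the real code covers remainders with the _Any_ row variants):

    static void ShuffleRow(const uint8_t* src, uint8_t* dst,
                           const uint8_t mask[16], int width) {
      for (int x = 0; x < width; x += 4) {  // 4 pixels = 16 bytes
        for (int i = 0; i < 16; ++i) {
          dst[4 * x + i] = src[4 * x + (mask[i] & 15)];
        }
      }
    }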
LIBYUV_API -int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). LIBYUV_API -int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ARGBToABGR(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert RGBA to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_rgba, src_stride_rgba, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskRGBAToARGB), - width, height); +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); } // Convert RGB24 to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - if (!src_rgb24 || !dst_argb || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -898,8 +1172,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_stride_rgb24 = -src_stride_rgb24; } // Coalesce rows. - if (src_stride_rgb24 == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb24 = dst_stride_argb = 0; @@ -920,6 +1193,14 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -931,14 +1212,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, // Convert RAW to ARGB. 
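The two 3-bytes-per-pixel converters (RGB24ToARGB above, RAWToARGB below) differ only by a channel swap: in libyuv naming, RGB24 is stored B,G,R in memory and RAW is R,G,B. A scalar sketch of the RAW row under that assumption:

    static void RawRowToARGB(const uint8_t* src_raw, uint8_t* dst_argb,
                             int width) {
      for (int x = 0; x < width; ++x) {
        dst_argb[4 * x + 0] = src_raw[3 * x + 2];  // B
        dst_argb[4 * x + 1] = src_raw[3 * x + 1];  // G
        dst_argb[4 * x + 2] = src_raw[3 * x + 0];  // R
        dst_argb[4 * x + 3] = 255;                 // A
      }
    }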
LIBYUV_API -int RAWToARGB(const uint8* src_raw, int src_stride_raw, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - if (!src_raw || !dst_argb || - width <= 0 || height == 0) { + if (!src_raw || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -948,8 +1231,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. - if (src_stride_raw == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_raw = dst_stride_argb = 0; @@ -970,6 +1252,14 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -981,14 +1271,16 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, // Convert RGB565 to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - if (!src_rgb565 || !dst_argb || - width <= 0 || height == 0) { + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -998,8 +1290,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_stride_rgb565 = -src_stride_rgb565; } // Coalesce rows. - if (src_stride_rgb565 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb565 = dst_stride_argb = 0; @@ -1028,6 +1319,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -1039,14 +1338,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, // Convert ARGB1555 to ARGB. 
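The 16-bit packed formats in this stretch (RGB565 above, then ARGB1555 and ARGB4444) widen each narrow channel by replicating its top bits into the low bits, which maps the reduced range onto 0..255 without a multiply. A scalar sketch for one ARGB1555 pixel, assuming the usual bit layout (1-bit alpha on top, then 5 bits each of R, G, B):

    static void Unpack1555(uint16_t px, uint8_t argb[4]) {
      uint8_t b = px & 0x1f;
      uint8_t g = (px >> 5) & 0x1f;
      uint8_t r = (px >> 10) & 0x1f;
      argb[0] = (uint8_t)((b << 3) | (b >> 2));  // widen 5 -> 8 bits
      argb[1] = (uint8_t)((g << 3) | (g >> 2));
      argb[2] = (uint8_t)((r << 3) | (r >> 2));
      argb[3] = (px & 0x8000) ? 255 : 0;         // 1-bit alpha
    }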
LIBYUV_API -int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, - int width) = ARGB1555ToARGBRow_C; - if (!src_argb1555 || !dst_argb || - width <= 0 || height == 0) { + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1056,8 +1357,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_stride_argb1555 = -src_stride_argb1555; } // Coalesce rows. - if (src_stride_argb1555 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb1555 = dst_stride_argb = 0; @@ -1086,6 +1386,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -1097,14 +1405,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, // Convert ARGB4444 to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, - int width) = ARGB4444ToARGBRow_C; - if (!src_argb4444 || !dst_argb || - width <= 0 || height == 0) { + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1114,8 +1424,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_stride_argb4444 = -src_stride_argb4444; } // Coalesce rows. - if (src_stride_argb4444 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb4444 = dst_stride_argb = 0; @@ -1144,6 +1453,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -1153,20 +1470,117 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, return 0; } -// Convert NV12 to ARGB. +// Convert AR30 to ARGB. 
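AR30, new in this update, is a 2:10:10:10 little-endian format. Assuming the layout used by the row kernels (blue in bits 0-9, green in 10-19, red in 20-29, 2-bit alpha on top; AB30 swaps the red and blue fields), a packing sketch looks like:

    // Pack one AR30 pixel; channel values are already 0..1023, a2 is 0..3.
    static uint32_t PackAR30(uint32_t r, uint32_t g, uint32_t b, uint32_t a2) {
      return (a2 << 30) | (r << 20) | (g << 10) | b;
    }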
LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + +// Convert NV12 to ARGB with matrix +static int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -1199,9 +1613,17 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); + NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1211,20 +1633,21 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, return 0; } -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Convert NV21 to ARGB with matrix +static int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*NV21ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV21ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1257,11 +1680,136 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV21TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV21ToARGBRow = NV21ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); + NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV12 to ABGR. +// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. +// To swap the UV use NV12 instead of NV21. +LIBYUV_API +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// Convert NV21 to ABGR.
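The wrappers above show the swap trick concretely: NV12ToABGR simply feeds the interleaved plane to the NV21 worker with the mirrored kYvuI601Constants table, so no dedicated ABGR kernels are needed. Since libyuv's ABGR byte order is R,G,B,A, this is also a cheap route to RGBA8 texture data; a usage sketch (names illustrative, strides assume a packed, even-width frame):

    // NV12 camera frame -> RGBA8 bytes suitable for a GL texture upload.
    static int Nv12FrameToRGBA(const uint8_t* y, const uint8_t* uv,
                               uint8_t* rgba, int width, int height) {
      return NV12ToABGR(y, width, uv, width, rgba, width * 4, width, height);
    }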
+LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// TODO(fbarchard): Consider SSSE3 2 step conversion. +// Convert NV12 to RGB24 with matrix +static int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV12TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB24Row = NV12ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV12ToRGB24Row = NV12ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } @@ -1269,19 +1817,109 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert NV21 to RGB24 with matrix +static int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV21TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB24Row = NV21ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToRGB24Row = NV21ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix. +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_m420 || !dst_argb || - width <= 0 || height == 0) { + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1314,6 +1952,14 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, @@ -1332,17 +1978,17 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, // Convert YUY2 to ARGB. 
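YUY2 (below) and UYVY are packed 4:2:2: every 4 bytes carry two pixels sharing one U/V pair, which is why these converters need only a single source pointer and stride. A sketch of the byte order the row kernels unpack:

    // YUY2: [Y0 U Y1 V]   UYVY: [U Y0 V Y1]   (two pixels per 4 bytes)
    static void Yuy2Unpack(const uint8_t s[4], uint8_t* y0, uint8_t* u,
                           uint8_t* y1, uint8_t* v) {
      *y0 = s[0];
      *u = s[1];
      *y1 = s[2];
      *v = s[3];
    }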
LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*YUY2ToARGBRow)(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = YUY2ToARGBRow_C; - if (!src_yuy2 || !dst_argb || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1352,8 +1998,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_yuy2 = dst_stride_argb = 0; @@ -1381,6 +2026,14 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToARGBRow = YUY2ToARGBRow_NEON; } } +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); @@ -1392,17 +2045,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to ARGB. LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*UYVYToARGBRow)(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = UYVYToARGBRow_C; - if (!src_uyvy || !dst_argb || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1412,8 +2065,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. - if (src_stride_uyvy == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_uyvy = dst_stride_argb = 0; @@ -1441,6 +2093,14 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, UYVYToARGBRow = UYVYToARGBRow_NEON; } } +#endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); @@ -1449,6 +2109,121 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, } return 0; } +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, + int src_pixel_stride_uv, + uint8_t* dst_uv, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_uv[0] = *src_u; + dst_uv[1] = *src_v; + dst_uv += 2; + src_u += src_pixel_stride_uv; + src_v += src_pixel_stride_uv; + } +} + +// Convert Android420 to ARGB. 
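WeavePixels above exists for Android's flexible YUV_420_888 layout, where the U and V planes carry both a row stride and a pixel stride. Android420ToARGBMatrix below first recognizes the cheap cases (pixel stride 1 is plain I420; pixel stride 2 with the planes one byte apart is NV12 or NV21) and only otherwise weaves a temporary NV12 chroma plane. A usage sketch with values as they come from the Image API (parameter names are illustrative):

    static int ImageToARGB(const uint8_t* y, int y_row_stride,
                           const uint8_t* u, const uint8_t* v,
                           int uv_row_stride, int uv_pixel_stride,
                           uint8_t* argb, int width, int height) {
      return Android420ToARGB(y, y_row_stride, u, uv_row_stride,
                              v, uv_row_stride, uv_pixel_stride,
                              argb, width * 4, width, height);
    }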
+LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + uint8_t* dst_uv; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + // I420 + if (src_pixel_stride_uv == 1) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + // NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, + dst_stride_argb, yuvconstants, width, height); + // NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, + dst_stride_argb, yuvconstants, width, height); + } + + // General case fallback creates NV12 + align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + dst_uv = plane_uv; + for (y = 0; y < halfheight; ++y) { + WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += halfwidth * 2; + } + NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, + dst_stride_argb, yuvconstants, width, height); + free_aligned_buffer_64(plane_uv); + return 0; +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height); +} + +// Convert Android420 to ABGR. 
+LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, src_pixel_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, + height); +} #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/third_party/libyuv/source/convert_from.cc b/libs/libvpx/third_party/libyuv/source/convert_from.cc index 3b2dca8163..6fa253237e 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_from.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_from.cc @@ -15,9 +15,9 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() #include "libyuv/video_common.h" -#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -30,109 +30,144 @@ static __inline int Abs(int v) { } // I420 To any I4xx YUV format with mirroring. -static int I420ToI4xx(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int dst_uv_width, int dst_uv_height) { +static int I420ToI4xx(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || - dst_uv_width <= 0 || dst_uv_height <= 0) { + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Convert 8 bit YUV to 10 bit. 
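I420ToI010 below widens each plane with Convert8To16Plane and a scale of 1024. Judging from the C row kernel elsewhere in this import (not shown in these hunks), the scale is applied after replicating the byte, so 0..255 maps onto the full 0..1023 range rather than a plain left shift by two; a scalar model of that reading:

    // 8-bit to 10-bit sample widening with scale 1024: 0 -> 0, 255 -> 1023.
    static uint16_t Widen8To10(uint8_t v) {
      return (uint16_t)(((uint32_t)v * 0x0101u * 1024u) >> 16);
    }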
+LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); return 0; } // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = (Abs(width) + 1) >> 1; const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height // 444 chroma is 1x width, 1x height LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = Abs(width); const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); -} - -// 420 chroma is 1/2 width, 1/2 height -// 411 chroma is 1/4 width, 1x height -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, 
- int width, int height) { - const int dst_uv_width = (Abs(width) + 3) >> 2; - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -146,17 +181,21 @@ int I400Copy(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -166,10 +205,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. 
- if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_yuy2 == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; @@ -182,6 +219,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -202,17 +247,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -229,6 +278,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -237,6 +294,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -254,17 +319,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -274,10 +343,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_uyvy == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; @@ -290,6 +357,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -298,6 +373,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -310,17 +393,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -337,6 +424,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -345,6 +440,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -363,14 +466,20 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, // TODO(fbarchard): test negative height for invert. 
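The AVX2 and MSA additions in these hunks slot into the same runtime dispatch ladder every converter uses, and all of them benefit from the recurring "Coalesce rows" test: when each stride equals its packed row size, the planes are contiguous and the image can be handed to the row kernel as one long row. The transform in isolation:

    // If rows are packed back to back, fold the image into a single row so
    // the row kernel runs once and per-row loop overhead disappears.
    static void CoalesceRows(int* width, int* height,
                             int* src_stride, int* dst_stride,
                             int src_bpp, int dst_bpp) {
      if (*src_stride == *width * src_bpp && *dst_stride == *width * dst_bpp) {
        *width *= *height;
        *height = 1;
        *src_stride = 0;
        *dst_stride = 0;
      }
    }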
LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || - width <= 0 || height == 0) { +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { return -1; } int halfwidth = (width + 1) / 2; @@ -378,44 +487,47 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } - MergeUVPlane(src_u, src_stride_u, - src_v, src_stride_v, - dst_uv, dst_stride_uv, + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; } LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height) { - return I420ToNV12(src_y, src_stride_y, - src_v, src_stride_v, - src_u, src_stride_u, - dst_y, dst_stride_y, - dst_vu, dst_stride_vu, +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -448,13 +560,12 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } } #endif @@ -472,50 +583,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGBA. LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I420 to BGRA. LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, +static int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGB24Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || - width <= 0 || height == 0) { + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
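The DSPR2-to-MSA change above follows the dispatch idiom used throughout these files: a row-function pointer starts at the portable _C kernel, is promoted to the _Any_ variant when the CPU flag is set (SIMD body plus a scalar tail, so any width works), and to the full-vector kernel when the width is a multiple of the vector step. A self-contained sketch with stand-in names (RowFn_*, cpu_has_msa); the real code uses TestCpuFlag(kCpuHasMSA) and kernels such as I422ToRGBARow_MSA:

#include <stdbool.h>
#include <string.h>

typedef void (*RowFn)(const unsigned char* src, unsigned char* dst, int width);

/* Trivial stand-ins so the sketch compiles on its own. */
static void RowFn_C(const unsigned char* s, unsigned char* d, int w)       { memcpy(d, s, (size_t)w); }
static void RowFn_Any_MSA(const unsigned char* s, unsigned char* d, int w) { memcpy(d, s, (size_t)w); }
static void RowFn_MSA(const unsigned char* s, unsigned char* d, int w)     { memcpy(d, s, (size_t)w); }

static bool cpu_has_msa(void) { return false; }  /* stand-in for TestCpuFlag */

#define IS_ALIGNED(v, a) (((v) & ((a)-1)) == 0)

static RowFn select_row_fn(int width) {
  RowFn fn = RowFn_C;            /* portable fallback, always available */
  if (cpu_has_msa()) {
    fn = RowFn_Any_MSA;          /* vector body + scalar tail, any width */
    if (IS_ALIGNED(width, 8)) {
      fn = RowFn_MSA;            /* pure vector kernel when width % 8 == 0 */
    }
  }
  return fn;
}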
@@ -548,6 +667,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -563,50 +690,95 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGB24. LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, - width, height); +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); } // Convert I420 to RAW. LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_raw, dst_stride_raw, +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, &kYvuI601Constants, // Use Yvu matrix width, height); } +// Convert H420 to RGB24. +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + // Convert I420 to ARGB1555. 
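For the ARGB1555 conversion that follows: each output pixel is 16 little-endian bits laid out as A:1 R:5 G:5 B:5, which is what I422ToARGB1555Row emits. An illustrative packing helper, not part of the patch:

#include <stdint.h>

/* Pack 8-bit channels into an ARGB1555 pixel: bit 15 alpha, bits 10..14
 * red, bits 5..9 green, bits 0..4 blue. */
static inline uint16_t pack_argb1555(uint8_t a, uint8_t r, uint8_t g,
                                     uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
                    (b >> 3));
}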
LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*I422ToARGB1555Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -639,6 +811,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, @@ -653,23 +833,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, return 0; } - // Convert I420 to ARGB4444. LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*I422ToARGB4444Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -702,6 +884,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, @@ -718,20 +908,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, // Convert I420 to RGB565. 
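For the RGB565 conversions that follow (including the new I422ToRGB565 entry point), each output pixel is 16 bits, R:5 G:6 B:5 with blue in the low bits. An illustrative packing helper:

#include <stdint.h>

/* Pack 8-bit channels into an RGB565 pixel by truncating to 5/6/5 bits. */
static inline uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}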
LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*I422ToRGB565Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -764,6 +956,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); @@ -777,32 +977,102 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, return 0; } +// Convert I422 to RGB565. +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. 
-static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert I420 to RGB565 with dithering. LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -838,12 +1108,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) @@ -869,6 +1139,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; } } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } #endif { // Allocate a row of argb. 
@@ -876,7 +1154,8 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -889,220 +1168,254 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, return 0; } +// Convert I420 to AR30 with matrix +static int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. 
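The H-prefixed entry points added here (H420ToRGB24, H420ToRAW, H420ToAR30) differ from their I420 counterparts only in selecting the BT.709 constant tables (kYuvH709Constants / kYvuH709Constants) instead of BT.601. AR30 itself is a 32-bit 2:10:10:10 pixel, alpha in the top two bits and blue in the low ten; a sketch of the packing, illustrative only:

#include <stdint.h>

/* Pack 2-bit alpha and 10-bit channels into a little-endian AR30 word:
 * bits 30..31 alpha, 20..29 red, 10..19 green, 0..9 blue. */
static inline uint32_t pack_ar30(uint32_t a2, uint32_t r10, uint32_t g10,
                                 uint32_t b10) {
  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;
}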
+LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + // Convert I420 to specified format LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int r = 0; - if (!y || !u|| !v || !dst_sample || - width <= 0 || height == 0) { + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { return -1; } switch (format) { // Single plane formats case FOURCC_YUY2: - r = I420ToYUY2(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_UYVY: - r = I420ToUYVY(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBP: - r = I420ToRGB565(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBO: - r = I420ToARGB1555(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_R444: - r = I420ToARGB4444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_24BG: - r = I420ToRGB24(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_RAW: - r = I420ToRAW(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_ARGB: - r = I420ToARGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 4, width, + height); break; case FOURCC_BGRA: - r = I420ToBGRA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_ABGR: - r = I420ToABGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_RGBA: - r = I420ToRGBA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_I400: - r = I400Copy(y, y_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; case FOURCC_NV12: { - uint8* dst_uv = dst_sample + width * height; - r = I420ToNV12(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_uv, - dst_sample_stride ? dst_sample_stride : width, - width, height); + uint8_t* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } case FOURCC_NV21: { - uint8* dst_vu = dst_sample + width * height; - r = I420ToNV21(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_vu, - dst_sample_stride ? dst_sample_stride : width, - width, height); + uint8_t* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } // TODO(fbarchard): Add M420. // Triplanar formats - // TODO(fbarchard): halfstride instead of halfwidth case FOURCC_I420: case FOURCC_YV12: { - int halfwidth = (width + 1) / 2; + dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; int halfheight = (height + 1) / 2; - uint8* dst_u; - uint8* dst_v; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV12) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * halfheight; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * halfheight; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; } - r = I420Copy(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I422: case FOURCC_YV16: { - int halfwidth = (width + 1) / 2; - uint8* dst_u; - uint8* dst_v; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV16) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; } - r = I420ToI422(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I444: case FOURCC_YV24: { - uint8* dst_u; - uint8* dst_v; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV24) { - dst_v = dst_sample + width * height; - dst_u = dst_v + width * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + width * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; } - r = I420ToI444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, width, - dst_v, width, - width, height); + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); break; } - case FOURCC_I411: { - int quarterwidth = (width + 3) / 4; - uint8* dst_u = dst_sample + width * height; - uint8* dst_v = dst_u + quarterwidth * height; - r = I420ToI411(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, quarterwidth, - dst_v, quarterwidth, - width, height); - break; - } - // Formats not supported - MJPG, biplanar, some rgb formats. default: return -1; // unknown fourcc - return failure code. 
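A minimal caller sketch for the rewritten ConvertFromI420 dispatcher, assuming even dimensions and using an illustrative wrapper name; passing 0 for dst_sample_stride selects the packed width-derived default, as the switch above shows:

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/convert_from.h"
#include "libyuv/video_common.h"  /* FOURCC_NV12 */

/* Illustrative wrapper: convert a packed I420 frame to a freshly
 * allocated NV12 buffer. Assumes even width and height. */
int i420_to_nv12_packed(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        int width, int height, uint8_t** out) {
  size_t size = (size_t)width * height * 3 / 2;  /* Y plane + UV plane */
  *out = (uint8_t*)malloc(size);
  if (!*out) {
    return -1;
  }
  /* dst_sample_stride == 0 picks the default stride (width) per the
   * FOURCC_NV12 case in the switch above. */
  return ConvertFromI420(y, width, u, width / 2, v, width / 2, *out, 0, width,
                         height, FOURCC_NV12);
}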
diff --git a/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc index 2a8682b7eb..c8d91252e9 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc @@ -22,16 +22,21 @@ extern "C" { // ARGB little endian (bgra in memory) to I444 LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV444Row_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u == width && - dst_stride_v == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } } #endif #if defined(HAS_ARGBTOUV444ROW_NEON) @@ -65,6 +68,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -89,6 +100,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -103,19 +122,23 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, // ARGB little endian (bgra in memory) to I422 LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void 
(*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -125,10 +148,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -170,6 +191,23 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); @@ -181,95 +219,25 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, return 0; } -// ARGB little endian (bgra in memory) to I411 LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - int y; - void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV411Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 4 == width && - dst_stride_v * 4 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUV411ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV411Row = ARGBToUV411Row_Any_NEON; - if (IS_ALIGNED(width, 32)) { - ARGBToUV411Row = ARGBToUV411Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUV411Row(src_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -314,6 +282,22 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -337,11 +321,19 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, MergeUVRow_ = MergeUVRow_NEON; } } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } #endif { // Allocate a rows of uv. 
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -364,21 +356,24 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, // Same as NV12 but U and V swapped. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -423,6 +418,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -446,24 +457,32 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, MergeUVRow_ = MergeUVRow_NEON; } } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } #endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; + dst_vu += dst_stride_vu; } if (height & 1) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ARGBToYRow(src_argb, dst_y, width); } free_aligned_buffer_64(row_u); @@ -473,19 +492,23 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, // Convert ARGB to YUY2. 
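ARGBToYUY2, below, converts one row of ARGB into I422 scratch rows (ARGBToUVRow plus ARGBToYRow) and then packs them with I422ToYUY2Row. YUY2 stores 4 bytes per 2 pixels in the order Y0 U Y1 V, chroma shared across the pair; a sketch of the per-pair packing, illustrative only:

#include <stdint.h>

/* Pack two luma samples and one shared chroma pair into 4 YUY2 bytes. */
static void pack_yuy2_pair(uint8_t y0, uint8_t y1, uint8_t u, uint8_t v,
                           uint8_t out[4]) {
  out[0] = y0;
  out[1] = u;
  out[2] = y1;
  out[3] = v;
}

UYVY, handled next, is the same sampling with the byte order rotated to U Y0 V Y1.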
LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; - if (!src_argb || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -495,8 +518,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yuy2 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_yuy2 = 0; @@ -537,6 +559,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -545,6 +583,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -553,12 +599,20 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -575,19 +629,23 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, // Convert ARGB to UYVY. 
LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; - if (!src_argb || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -597,8 +655,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_uyvy == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_uyvy = 0; @@ -639,6 +696,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -647,6 +720,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -655,12 +736,20 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -677,11 +766,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, // Convert ARGB to I400. 
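I400 is a bare luma plane, so the function below needs only ARGBToYRow. A sketch of the BT.601 studio-swing luma that the C reference kernel computes, in 8.8 fixed point with rounding and the +16 offset folded into the constant; the coefficients are stated from the libyuv reference code, so treat this as illustrative:

#include <stdint.h>

/* Y = 0.257*R + 0.504*G + 0.098*B + 16. Note libyuv's "ARGB" is stored
 * B,G,R,A in memory (little-endian ARGB word). */
static uint8_t argb_pixel_to_y(const uint8_t* argb) {
  return (uint8_t)((66 * argb[2] + 129 * argb[1] + 25 * argb[0] + 0x1080) >> 8);
}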
LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || width <= 0 || height == 0) { return -1; @@ -692,8 +784,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = 0; @@ -722,6 +813,14 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -732,28 +831,31 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } // Shuffle table for converting ARGB to RGBA. -static uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u -}; +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; // Convert ARGB to RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return ARGBShuffle(src_argb, src_stride_argb, - dst_rgba, dst_stride_rgba, - (const uint8*)(&kShuffleMaskARGBToRGBA), - width, height); +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, + (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); } // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; - void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRGB24Row_C; if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { return -1; @@ -764,8 +866,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_rgb24 == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_rgb24 = 0; @@ -778,6 +879,22 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; @@ -786,6 +903,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -797,11 +922,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { int y; - void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRAWRow_C; if (!src_argb || !dst_raw || width <= 0 || height == 0) { return -1; @@ -812,8 +940,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_raw == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_raw = 0; @@ -826,6 +953,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRAWRow = ARGBToRAWRow_Any_NEON; @@ -834,6 +969,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -844,21 +987,23 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
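The dither row kernel receives one row of the 4x4 table at a time, selected by the caller as *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), and adds the per-column byte to each channel before truncating to 5:6:5. A sketch of the per-pixel step, with clamp255 as a stand-in for libyuv's clamping helper:

#include <stdint.h>

static uint8_t clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

/* Apply the column's 0..7 dither bias, then truncate to RGB565. dither4
 * holds the four bytes of one table row, one per column. */
static uint16_t argb_to_rgb565_dither(const uint8_t* argb, uint32_t dither4,
                                      int x) {
  int d = (int)((dither4 >> ((x & 3) * 8)) & 0xff);
  uint8_t b = clamp255(argb[0] + d) >> 3;
  uint8_t g = clamp255(argb[1] + d) >> 2;
  uint8_t r = clamp255(argb[2] + d) >> 3;
  return (uint16_t)((r << 11) | (g << 5) | b);
}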
LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { int y; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -894,9 +1039,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } @@ -906,12 +1061,15 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RGB565. // TODO(fbarchard): Consider using dither function low level with zeros. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToRGB565Row_C; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -921,8 +1079,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_rgb565 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_rgb565 = 0; @@ -951,6 +1108,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -962,12 +1127,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB1555. 
LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB1555Row_C; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { return -1; } @@ -977,8 +1145,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb1555 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb1555 = 0; @@ -1007,6 +1174,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1018,12 +1193,15 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB4444Row_C; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { return -1; } @@ -1033,8 +1211,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb4444 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb4444 = 0; @@ -1063,6 +1240,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1072,21 +1257,123 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. 
+ if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
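The "negative height means invert the image" convention at the end of this hunk is shared by every function in the patch: point the source at the last row and negate the stride, so the usual top-to-bottom loop reads bottom-up. The same three lines recur in ABGRToAR30 and ARGBToAR30 above; isolated, the trick looks like this (a sketch, assuming a packed top-down buffer):

    #include <stdint.h>

    static void ApplyInversion(const uint8_t** src, int* stride, int* height) {
      if (*height < 0) {
        *height = -*height;
        *src += (*height - 1) * (*stride);  // start at the last row
        *stride = -(*stride);               // walk upward from there
      }
    }

Callers therefore get a vertical flip for free by passing a negative height to any of these conversions.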
@@ -1129,6 +1416,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -1148,19 +1451,23 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J422. (JPeg full range I422). LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1170,10 +1477,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yj == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; @@ -1212,6 +1517,22 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -1226,11 +1547,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J400. LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height) { +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; @@ -1241,8 +1565,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
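The coalescing check just below (reflowed by this patch, unchanged in behavior) relies on a contiguity argument: when a stride equals the row width in bytes, row r starts exactly where row r-1 ends, so H rows of W pixels occupy the same bytes as one row of W*H pixels, amortizing per-row call and tail overhead. A hedged illustration of the precondition (this helper is illustrative only, not a libyuv function):

    static int CanCoalesce(int src_stride, int dst_stride, int width,
                           int src_bpp, int dst_bpp) {
      // Both planes must be packed for the single-long-row rewrite to hold.
      return src_stride == width * src_bpp && dst_stride == width * dst_bpp;
    }

For ARGBToJ400 that means src_bpp 4 and dst_bpp 1, matching the condition below; once coalesced, both strides are set to 0 because the single-row loop never advances.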
- if (src_stride_argb == width * 4 && - dst_stride_yj == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = 0; @@ -1271,6 +1594,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); diff --git a/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc b/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc index 90f550a26a..ae3cc18cd2 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc @@ -22,28 +22,24 @@ extern "C" { #ifdef HAVE_JPEG struct I420Buffers { - uint8* y; + uint8_t* y; int y_stride; - uint8* u; + uint8_t* u; int u_stride; - uint8* v; + uint8_t* v; int v_stride; int w; int h; }; static void JpegCopyI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I420Copy(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -51,17 +47,13 @@ static void JpegCopyI420(void* opaque, } static void JpegI422ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I422ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -69,35 +61,13 @@ static void JpegI422ToI420(void* opaque, } static void JpegI444ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I444ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI411ToI420(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I411ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; 
dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -105,15 +75,12 @@ static void JpegI411ToI420(void* opaque, } static void JpegI400ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I400ToI420(data[0], strides[0], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -122,8 +89,10 @@ static void JpegI400ToI420(void* opaque, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height) { +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height) { MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { @@ -135,15 +104,21 @@ int MJPGSize(const uint8* sample, size_t sample_size, } // MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToI420(const uint8* sample, +int MJPGToI420(const uint8_t* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int w, int h, - int dw, int dh) { + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -152,17 +127,17 @@ int MJPGToI420(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. 
MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -170,8 +145,9 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); - // YUV422 + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -181,8 +157,9 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); - // YUV444 + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -192,28 +169,19 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); - // YUV400 + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. 
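The branch chain above selects a decode callback purely from libjpeg's per-component sampling factors. A hypothetical condensation of that dispatch, keyed on the luma component (the real checks also require each chroma plane to be sampled at 1x1; this is not a libyuv API):

    enum JpegLayout { kJpeg420, kJpeg422, kJpeg444, kJpeg400, kJpegOther };

    static JpegLayout Classify(int components, int vert0, int horiz0) {
      if (components == 3 && vert0 == 2 && horiz0 == 2) return kJpeg420;
      if (components == 3 && vert0 == 1 && horiz0 == 2) return kJpeg422;
      if (components == 3 && vert0 == 1 && horiz0 == 1) return kJpeg444;
      if (components == 1 && vert0 == 1 && horiz0 == 1) return kJpeg400;
      return kJpegOther;  // e.g. 4:1:1, whose branch this patch deletes
    }

Anything that lands in kJpegOther now falls through to the unsupported-format error below; 4:1:1 input is among them, consistent with I411 support being removed throughout this patch.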
// ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; @@ -224,88 +192,67 @@ int MJPGToI420(const uint8* sample, #ifdef HAVE_JPEG struct ARGBBuffers { - uint8* argb; + uint8_t* argb; int argb_stride; int w; int h; }; static void JpegI420ToARGB(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { + const uint8_t* const* data, + const int* strides, + int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I420ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI422ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I422ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI444ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I444ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI411ToARGB(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I411ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI400ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I400ToARGB(data[0], strides[0], - dest->argb, dest->argb_stride, - dest->w, rows); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } // MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToARGB(const uint8* sample, +int MJPGToARGB(const uint8_t* sample, size_t sample_size, - uint8* argb, int argb_stride, - int w, int h, - int dw, int dh) { + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -314,17 +261,16 @@ int MJPGToARGB(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. 
MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -332,8 +278,9 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); - // YUV422 + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -343,8 +290,9 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); - // YUV444 + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -354,28 +302,19 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); - // YUV400 + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. 
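With the renamed src_*/dst_* parameters, a typical MJPGToARGB call site pairs MJPGSize with a full-size decode. A minimal usage sketch, assuming a build with HAVE_JPEG; the libyuv.h umbrella include and the malloc-based buffer are assumptions for the example, not part of this patch:

    #include <stdlib.h>
    #include "libyuv.h"

    int DecodeMjpegToArgb(const uint8_t* jpg, size_t jpg_size) {
      int w = 0, h = 0;
      if (MJPGSize(jpg, jpg_size, &w, &h) != 0) {
        return -1;  // header not parseable
      }
      uint8_t* argb = (uint8_t*)malloc((size_t)w * 4 * h);
      if (!argb) {
        return -1;
      }
      // Decode at full size: src and dst dimensions match, no scaling.
      int r = MJPGToARGB(jpg, jpg_size, argb, w * 4, w, h, w, h);
      free(argb);  // a real caller would consume the pixels first
      return r;
    }

Frames that fail every sampling-factor check above return through the unsupported-format path below.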
// ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; diff --git a/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc index aecdc80fde..67484522c0 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc @@ -28,36 +28,50 @@ extern "C" { // src_height is used to compute location of planes, and indicate inversion // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. + +// TODO(fbarchard): Add the following: +// H010ToARGB +// H420ToARGB +// H422ToARGB +// I010ToARGB +// J400ToARGB +// J422ToARGB +// J444ToARGB + LIBYUV_API -int ConvertToARGB(const uint8* sample, size_t sample_size, - uint8* crop_argb, int argb_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; int abs_src_height = (src_height < 0) ? -src_height : src_height; int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination crop_argb is same as source sample, + // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, + // and then rotate the ARGB to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, // also enable temporary buffer. - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || - crop_argb == sample; - uint8* dest_argb = crop_argb; - int dest_argb_stride = argb_stride; - uint8* rotate_buffer = NULL; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (crop_argb == NULL || sample == NULL || - src_width <= 0 || crop_width <= 0 || + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } @@ -67,187 +81,174 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8*)malloc(argb_size); + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
} - crop_argb = rotate_buffer; - argb_stride = crop_width * 4; + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + if (!need_buf && !rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + } break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AR30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AB30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, - crop_argb, 
argb_stride, - crop_width, inv_crop_height); + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; // Call NV12 but with u and v parameters swapped. - r = NV21ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; + // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J420: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y 
= sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -255,32 +256,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToARGB(src_y, src_width, - src_u, src_width, - src_v, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToARGB(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, - crop_argb, argb_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -289,11 +272,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { if (!r) { - r = ARGBRotate(crop_argb, argb_stride, - dest_argb, dest_argb_stride, + r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, crop_width, abs_crop_height, rotation); } free(rotate_buffer); + } else if (rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height, rotation); } return r; diff --git a/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc b/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc index 
e5f307c446..df08309f9b 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc @@ -25,251 +25,216 @@ extern "C" { // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. LIBYUV_API -int ConvertToI420(const uint8* sample, +int ConvertToI420(const uint8_t* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; const int abs_src_height = (src_height < 0) ? -src_height : src_height; // TODO(nisse): Why allow crop_height < 0? const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && - format != FOURCC_NV12 && format != FOURCC_NV21 && - format != FOURCC_YV12) || y == sample; - uint8* tmp_y = y; - uint8* tmp_u = u; - uint8* tmp_v = v; - int tmp_y_stride = y_stride; - int tmp_u_stride = u_stride; - int tmp_v_stride = v_stride; - uint8* rotate_buffer = NULL; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + dst_y == sample; + uint8_t* tmp_y = dst_y; + uint8_t* tmp_u = dst_u; + uint8_t* tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t* rotate_buffer = NULL; const int inv_crop_height = (src_height < 0) ? -abs_crop_height : abs_crop_height; - if (!y || !u || !v || !sample || - src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination y is same as source sample, + // For in-place conversion, if destination dst_y is same as source sample, // also enable temporary buffer. if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); + rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
} - y = rotate_buffer; - u = y + y_size; - v = u + uv_size; - y_stride = crop_width; - u_stride = v_stride = ((crop_width + 1) / 2); + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ARGB: src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, 
dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; + // TODO(fbarchard): Add AR30 and AB30 case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - // Call NV12 but with u and v parameters swapped. - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - v, v_stride, - u, u_stride, - crop_width, inv_crop_height, rotation); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with dst_u and dst_v parameters swapped. 
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420Rotate(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToI420(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -277,38 +242,16 @@ int 
ConvertToI420(const uint8* sample, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToI420(src_y, src_width, - src_u, src_width, - src_v, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToI420(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, - y, y_stride, - u, u_stride, - v, v_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -317,13 +260,10 @@ int ConvertToI420(const uint8* sample, if (need_buf) { if (!r) { - r = I420Rotate(y, y_stride, - u, u_stride, - v, v_stride, - tmp_y, tmp_y_stride, - tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, - crop_width, abs_crop_height, rotation); + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); } free(rotate_buffer); } diff --git a/libs/libvpx/third_party/libyuv/source/cpu_id.cc b/libs/libvpx/third_party/libyuv/source/cpu_id.cc index 84927ebc3e..31e24b6739 100644 --- a/libs/libvpx/third_party/libyuv/source/cpu_id.cc +++ b/libs/libvpx/third_party/libyuv/source/cpu_id.cc @@ -13,22 +13,16 @@ #if defined(_MSC_VER) #include // For __cpuidex() #endif -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) #include // For _xgetbv() #endif -#if !defined(__native_client__) -#include // For getenv() -#endif - // For ArmCpuCaps() but unittested on all platforms #include #include -#include "libyuv/basic_types.h" // For CPU_X86 - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -43,16 +37,20 @@ extern "C" { #define SAFEBUFFERS #endif +// cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API -void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +void CpuId(int info_eax, int info_ecx, int* cpu_info) { #if defined(_MSC_VER) // Visual C version uses intrinsic or inline x86 assembly. 
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - __cpuidex((int*)(cpu_info), info_eax, info_ecx); + __cpuidex(cpu_info, info_eax, info_ecx); #elif defined(_M_IX86) __asm { mov eax, info_eax @@ -66,26 +64,26 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // Visual C but not x86 if (info_ecx == 0) { - __cpuid((int*)(cpu_info), info_eax); + __cpuid(cpu_info, info_eax); } else { - cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; } #endif // GCC version uses inline x86 assembly. #else // defined(_MSC_VER) - uint32 info_ebx, info_edx; - asm volatile ( -#if defined( __i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D" (info_ebx), + int info_ebx, info_edx; + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), #else - "cpuid \n" - : "=b" (info_ebx), + "cpuid \n" + : "=b"(info_ebx), #endif // defined( __i386__) && defined(__PIC__) - "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); cpu_info[0] = info_eax; cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; @@ -94,7 +92,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { +void CpuId(int eax, int ecx, int* cpu_info) { + (void)eax; + (void)ecx; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } #endif @@ -111,20 +111,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { #if defined(_M_IX86) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) -#define HAS_XGETBV // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int GetXCR0() { - uint32 xcr0 = 0u; + int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT #elif defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) return xcr0; } +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. #if defined(_M_IX86) && (_MSC_VER < 1900) @@ -133,8 +135,7 @@ int GetXCR0() { // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS -int ArmCpuCaps(const char* cpuinfo_name) { +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { @@ -151,7 +152,7 @@ int ArmCpuCaps(const char* cpuinfo_name) { } // aarch64 uses asimd for Neon. p = strstr(cpuinfo_line, " asimd"); - if (p && (p[6] == ' ' || p[6] == '\n')) { + if (p) { fclose(f); return kCpuHasNEON; } @@ -161,103 +162,78 @@ int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -// CPU detect function for SIMD instruction sets. 
-LIBYUV_API -int cpu_info_ = 0; // cpu_info is not initialized yet. - -// Test environment variable for disabling CPU features. Any non-zero value -// to disable. Zero ignored to make it easy to set the variable on/off. -#if !defined(__native_client__) && !defined(_M_ARM) - -static LIBYUV_BOOL TestEnv(const char* name) { - const char* var = getenv(name); - if (var) { - if (var[0] != '0') { - return LIBYUV_TRUE; +// TODO(fbarchard): Consider read_msa_ir(). +// TODO(fbarchard): Add unittest. +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, + const char ase[]) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // ase enabled if /proc/cpuinfo is unavailable. + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + char* p = strstr(cpuinfo_line, ase); + if (p) { + fclose(f); + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } } } - return LIBYUV_FALSE; + fclose(f); + return 0; } -#else // nacl does not support getenv(). -static LIBYUV_BOOL TestEnv(const char*) { - return LIBYUV_FALSE; -} -#endif -LIBYUV_API SAFEBUFFERS -int InitCpuFlags(void) { - // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized. +static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) - uint32 cpu_info0[4] = { 0, 0, 0, 0 }; - uint32 cpu_info1[4] = { 0, 0, 0, 0 }; - uint32 cpu_info7[4] = { 0, 0, 0, 0 }; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); } - cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - kCpuHasX86; + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); -#ifdef HAS_XGETBV - // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv + // AVX requires OS saves YMM registers. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; } } #endif - - // Environment variable overrides for testing. 
- if (TestEnv("LIBYUV_DISABLE_X86")) { - cpu_info &= ~kCpuHasX86; - } - if (TestEnv("LIBYUV_DISABLE_SSE2")) { - cpu_info &= ~kCpuHasSSE2; - } - if (TestEnv("LIBYUV_DISABLE_SSSE3")) { - cpu_info &= ~kCpuHasSSSE3; - } - if (TestEnv("LIBYUV_DISABLE_SSE41")) { - cpu_info &= ~kCpuHasSSE41; - } - if (TestEnv("LIBYUV_DISABLE_SSE42")) { - cpu_info &= ~kCpuHasSSE42; - } - if (TestEnv("LIBYUV_DISABLE_AVX")) { - cpu_info &= ~kCpuHasAVX; - } - if (TestEnv("LIBYUV_DISABLE_AVX2")) { - cpu_info &= ~kCpuHasAVX2; - } - if (TestEnv("LIBYUV_DISABLE_ERMS")) { - cpu_info &= ~kCpuHasERMS; - } - if (TestEnv("LIBYUV_DISABLE_FMA3")) { - cpu_info &= ~kCpuHasFMA3; - } - if (TestEnv("LIBYUV_DISABLE_AVX3")) { - cpu_info &= ~kCpuHasAVX3; - } -#endif #if defined(__mips__) && defined(__linux__) -#if defined(__mips_dspr2) - cpu_info |= kCpuHasDSPR2; +#if defined(__mips_msa) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); #endif cpu_info |= kCpuHasMIPS; - if (getenv("LIBYUV_DISABLE_DSPR2")) { - cpu_info &= ~kCpuHasDSPR2; - } #endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ @@ -276,22 +252,22 @@ int InitCpuFlags(void) { cpu_info = ArmCpuCaps("/proc/cpuinfo"); #endif cpu_info |= kCpuHasARM; - if (TestEnv("LIBYUV_DISABLE_NEON")) { - cpu_info &= ~kCpuHasNEON; - } #endif // __arm__ - if (TestEnv("LIBYUV_DISABLE_ASM")) { - cpu_info = 0; - } - cpu_info |= kCpuInitialized; - cpu_info_ = cpu_info; + cpu_info |= kCpuInitialized; return cpu_info; } // Note that use of this function is not thread safe. LIBYUV_API -void MaskCpuFlags(int enable_flags) { - cpu_info_ = InitCpuFlags() & enable_flags; +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; + SetCpuFlags(cpu_info); + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); } #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc index 22025ad04a..eaf2530130 100644 --- a/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc +++ b/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc @@ -21,7 +21,7 @@ #if defined(_MSC_VER) // disable warning 4324: structure was padded due to __declspec(align()) -#pragma warning(disable:4324) +#pragma warning(disable : 4324) #endif #endif @@ -102,7 +102,7 @@ MJpegDecoder::~MJpegDecoder() { DestroyOutputBuffers(); } -LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { if (!ValidateJpeg(src, src_len)) { return LIBYUV_FALSE; } @@ -129,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (scanlines_[i]) { delete scanlines_[i]; } - scanlines_[i] = new uint8* [scanlines_size]; + scanlines_[i] = new uint8_t*[scanlines_size]; scanlines_sizes_[i] = scanlines_size; } @@ -145,7 +145,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (databuf_[i]) { delete databuf_[i]; } - databuf_[i] = new uint8[databuf_size]; + databuf_[i] = new uint8_t[databuf_size]; databuf_strides_[i] = databuf_stride; } @@ -195,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) { } int MJpegDecoder::GetHorizSubSampFactor(int component) { - return decompress_struct_->max_h_samp_factor / - GetHorizSampFactor(component); + return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); } int MJpegDecoder::GetVertSubSampFactor(int component) { - return decompress_struct_->max_v_samp_factor / - 
GetVertSampFactor(component); + return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); } int MJpegDecoder::GetImageScanlinesPerImcuRow() { @@ -245,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() { } // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. -LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( - uint8** planes, int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -289,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { // TODO(fbarchard): Compute skip to avoid this assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = - DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - - rows_to_skip; + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; int data_to_skip = rows_to_skip * GetComponentStride(i); - CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } lines_left -= (GetImageScanlinesPerImcuRow() - skip); @@ -305,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( // Read full MCUs but cropped horizontally for (; lines_left > GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } @@ -328,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } return FinishDecode(); } -LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -395,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, } // Read full MCUs until we get to the crop point. 
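The cropped decode above first skips whole iMCU rows, then maps image rows to per-component rows with DivideAndRoundDown, and copies the final partial row group with DivideAndRoundUp. The two helpers (defined in mjpeg_decoder.h) are plain floor/ceil integer division:

static inline int DivideAndRoundUp(int numerator, int denominator) {
  return (numerator + denominator - 1) / denominator;
}

static inline int DivideAndRoundDown(int numerator, int denominator) {
  return numerator / denominator;
}

// e.g. skipping 5 image rows in a plane sub-sampled 2x vertically skips
// DivideAndRoundDown(5, 2) == 2 component rows, while a 5-row tail copies
// DivideAndRoundUp(5, 2) == 3 rows so odd heights are not truncated.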
for (; lines_left >= GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; @@ -435,22 +431,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT } void term_source(j_decompress_ptr cinfo) { - // Nothing to do. + (void)cinfo; // Nothing to do. } #ifdef HAVE_SETJMP void ErrorHandler(j_common_ptr cinfo) { - // This is called when a jpeglib command experiences an error. Unfortunately - // jpeglib's error handling model is not very flexible, because it expects the - // error handler to not return--i.e., it wants the program to terminate. To - // recover from errors we use setjmp() as shown in their example. setjmp() is - // C's implementation for the "call with current continuation" functionality - // seen in some functional programming languages. - // A formatted message can be output, but is unsafe for release. +// This is called when a jpeglib command experiences an error. Unfortunately +// jpeglib's error handling model is not very flexible, because it expects the +// error handler to not return--i.e., it wants the program to terminate. To +// recover from errors we use setjmp() as shown in their example. setjmp() is +// C's implementation for the "call with current continuation" functionality +// seen in some functional programming languages. +// A formatted message can be output, but is unsafe for release. #ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); - // ERROR: Error in jpeglib: buf +// ERROR: Error in jpeglib: buf #endif SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); @@ -459,8 +455,9 @@ void ErrorHandler(j_common_ptr cinfo) { longjmp(mgr->setjmp_buffer, 1); } +// Suppress fprintf warnings. void OutputHandler(j_common_ptr cinfo) { - // Suppress fprintf warnings. + (void)cinfo; } #endif // HAVE_SETJMP @@ -472,9 +469,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { // it. 
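The ErrorHandler comment block above describes the standard libjpeg recovery idiom: the library requires error handlers not to return, so the handler longjmp()s back to a setjmp() established before any decode call. A sketch of the caller side, assuming the SetJmpErrorMgr from this file and a member pointer named error_mgr_ (that member name is an assumption; only setjmp_buffer is visible in this hunk):

#include <setjmp.h>

// Before the first jpeg_read_*() call in a decode method:
//
//   if (setjmp(error_mgr_->setjmp_buffer)) {
//     // A jpeglib call failed and ErrorHandler() longjmp'ed back here;
//     // abort the decode and unwind instead of terminating the process.
//     jpeg_abort_decompress(decompress_struct_);
//     return LIBYUV_FALSE;
//   }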
DestroyOutputBuffers(); - scanlines_ = new uint8** [num_outbufs]; + scanlines_ = new uint8_t**[num_outbufs]; scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8* [num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; databuf_strides_ = new int[num_outbufs]; for (int i = 0; i < num_outbufs; ++i) { @@ -490,13 +487,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { void MJpegDecoder::DestroyOutputBuffers() { for (int i = 0; i < num_outbufs_; ++i) { - delete [] scanlines_[i]; - delete [] databuf_[i]; + delete[] scanlines_[i]; + delete[] databuf_[i]; } - delete [] scanlines_; - delete [] databuf_; - delete [] scanlines_sizes_; - delete [] databuf_strides_; + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; scanlines_ = NULL; databuf_ = NULL; scanlines_sizes_ = NULL; @@ -530,9 +527,9 @@ LIBYUV_BOOL MJpegDecoder::FinishDecode() { return LIBYUV_TRUE; } -void MJpegDecoder::SetScanlinePointers(uint8** data) { +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { for (int i = 0; i < num_outbufs_; ++i) { - uint8* data_i = data[i]; + uint8_t* data_i = data[i]; for (int j = 0; j < scanlines_sizes_[i]; ++j) { scanlines_[i][j] = data_i; data_i += GetComponentStride(i); @@ -542,26 +539,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) { inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { return (unsigned int)(GetImageScanlinesPerImcuRow()) == - jpeg_read_raw_data(decompress_struct_, - scanlines_, - GetImageScanlinesPerImcuRow()); + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); } // The helper function which recognizes the jpeg sub-sampling type. JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components) { + int* subsample_x, + int* subsample_y, + int number_of_components) { if (number_of_components == 3) { // Color images. - if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 2 && - subsample_x[2] == 2 && subsample_y[2] == 2) { + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { return kJpegYuv420; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 1 && - subsample_x[2] == 2 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { return kJpegYuv422; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 1 && subsample_y[1] == 1 && - subsample_x[2] == 1 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { return kJpegYuv444; } } else if (number_of_components == 1) { // Grey-scale images. @@ -574,4 +571,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( } // namespace libyuv #endif // HAVE_JPEG - diff --git a/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc index 9c48832045..80c2cc0cb9 100644 --- a/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc +++ b/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc @@ -18,13 +18,13 @@ extern "C" { #endif // Helper function to scan for EOI marker (0xff 0xd9). 
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { +static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) { if (sample_size >= 2) { - const uint8* end = sample + sample_size - 1; - const uint8* it = sample; + const uint8_t* end = sample + sample_size - 1; + const uint8_t* it = sample; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. - it = static_cast(memchr(it, 0xff, end - it)); + it = (const uint8_t*)(memchr(it, 0xff, end - it)); if (it == NULL) { break; } @@ -39,7 +39,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { } // Helper function to validate the jpeg appears intact. -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) { // Maximum size that ValidateJpeg will consider valid. const size_t kMaxJpegSize = 0x7fffffffull; const size_t kBackSearchSize = 1024; @@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { } // extern "C" } // namespace libyuv #endif - diff --git a/libs/libvpx/third_party/libyuv/source/planar_functions.cc b/libs/libvpx/third_party/libyuv/source/planar_functions.cc index a764f8da47..5eae3f763a 100644 --- a/libs/libvpx/third_party/libyuv/source/planar_functions.cc +++ b/libs/libvpx/third_party/libyuv/source/planar_functions.cc @@ -26,11 +26,14 @@ extern "C" { // Copy a plane of data LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -38,8 +41,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, dst_stride_y = -dst_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -48,6 +50,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } + #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -68,11 +71,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -83,15 +81,18 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } // TODO(fbarchard): Consider support for negative height. +// TODO(fbarchard): Consider stride measured in bytes. LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; // Coalesce rows. 
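ScanEOI/ValidateJpeg above change only the integer types; the logic stays: a buffer passes when it begins with an SOI marker and an EOI marker (0xff 0xd9) is found, with the trailing kBackSearchSize bytes searched first before a full scan. A standalone sketch of the two marker tests (the SOI check and small-size floor are inferred from context; this hunk elides the ValidateJpeg body):

#include <stddef.h>
#include <stdint.h>

static int LooksLikeJpeg(const uint8_t* buf, size_t len) {
  if (len < 4 || buf[0] != 0xff || buf[1] != 0xd8) {
    return 0;  // missing SOI marker at the start
  }
  for (size_t i = len - 2; i > 0; --i) {  // scan backwards for EOI
    if (buf[i] == 0xff && buf[i + 1] == 0xd9) {
      return 1;
    }
  }
  return 0;
}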
- if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -111,11 +112,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, CopyRow = CopyRow_16_NEON; } #endif -#if defined(HAS_COPYROW_16_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_16_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -125,19 +121,124 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, } } +// Convert a plane of 16 bit data to 8 bit +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Copy I422. 
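The scale argument of the new Convert16To8Plane/Convert8To16Plane folds a bit-shift into a multiply; the row kernels (only declared here, defined in the row_* files) compute roughly dst = clamp255((src * scale) >> 16) for the 16-to-8 direction. Worked through for the "16384 for 10 bits" comment:

  10-bit max 1023:  (1023 * 16384) >> 16 = 255   // i.e. v >> 2
  12-bit input uses scale 4096:  (4095 * 4096) >> 16 = 255

A one-pixel sketch of that assumed formula:

#include <stdint.h>

static inline uint8_t Convert16To8(uint16_t v, int scale) {
  int value = ((int)v * scale) >> 16;
  return (uint8_t)(value > 255 ? 255 : value);
}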
LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -161,16 +262,21 @@ int I422Copy(const uint8* src_y, int src_stride_y, // Copy I444. LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -194,9 +300,12 @@ int I444Copy(const uint8* src_y, int src_stride_y, // Copy I400. LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -212,11 +321,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y, // Convert I420 to I400. LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -234,12 +352,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y, // Support function for NV12 etc UV channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API -void SplitUVPlane(const uint8* src_uv, int src_stride_uv, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; // Negative height means invert the image. 
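The "negative height means invert the image" convention above recurs in every function in this file: passing -height requests a vertical flip, implemented by pointing at the last row and walking a negative stride. Worked example for an 8-row plane with stride 64:

  height = -8  ->  height = 8
  src     = base + (8 - 1) * 64 = base + 448   // start at the last row
  stride  = -64                                 // iterate upward through memory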
if (height < 0) { @@ -250,8 +372,7 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_uv == width * 2 && - dst_stride_u == width && + if (src_stride_uv == width * 2 && dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; @@ -281,13 +402,11 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, } } #endif -#if defined(HAS_SPLITUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && - IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_DSPR2; +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; } } #endif @@ -302,13 +421,17 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, } LIBYUV_API -void MergeUVPlane(const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; - void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; // Coalesce rows. // Negative height means invert the image. if (height < 0) { @@ -317,8 +440,7 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, dst_stride_uv = -dst_stride_uv; } // Coalesce rows. - if (src_stride_u == width && - src_stride_v == width && + if (src_stride_u == width && src_stride_v == width && dst_stride_uv == width * 2) { width *= height; height = 1; @@ -348,6 +470,14 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -358,12 +488,131 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, } } -// Mirror a plane of data. -void MirrorPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. 
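SplitUVPlane/MergeUVPlane above show the dispatch idiom used throughout this file: pick a row function once, coalesce contiguous rows into a single very wide row so the kernel runs with no per-row overhead, then loop. A skeleton of the pattern with placeholder names (Row_C, Row_SSE2, Row_Any_SSE2 and HAS_ROW_SSE2 are illustrative, not real libyuv symbols):

void ProcessPlane(const uint8_t* src, int src_stride,
                  uint8_t* dst, int dst_stride, int width, int height) {
  void (*Row)(const uint8_t*, uint8_t*, int) = Row_C;  // portable fallback
  // Coalesce rows: with no padding between rows, the whole plane is one row.
  if (src_stride == width && dst_stride == width) {
    width *= height;
    height = 1;
    src_stride = dst_stride = 0;
  }
#if defined(HAS_ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    // _Any_ variants handle widths that are not a vector multiple.
    Row = IS_ALIGNED(width, 16) ? Row_SSE2 : Row_Any_SSE2;
  }
#endif
  for (int y = 0; y < height; ++y) {
    Row(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}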
+ if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. + SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_rgb == width * 3) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; + } +#if defined(HAS_MERGERGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MergeRGBRow = MergeRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_MERGERGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeRGBRow = MergeRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of RGB. + MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + +// Mirror a plane of data. +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -394,12 +643,12 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { - MirrorRow = MirrorRow_DSPR2; +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } } #endif @@ -413,17 +662,24 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, // Convert YUY2 to I422. 
LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUV422Row)(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) = - YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -431,10 +687,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -462,15 +717,23 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width >= 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - } + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUV422Row = YUY2ToUV422Row_NEON; } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -485,17 +748,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I422. LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUV422Row)(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) = - UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -503,10 +773,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. 
- if (src_stride_uyvy == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -534,15 +803,23 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, #if defined(HAS_UYVYTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; - if (width >= 16) { - UYVYToUV422Row = UYVYToUV422Row_Any_NEON; - } + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUV422Row = UYVYToUV422Row_NEON; } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -555,13 +832,82 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, return 0; } +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + // Mirror I400 with optional flipping LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -577,17 +923,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y, // Mirror I420 with optional flipping LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -612,11 +965,14 @@ int I420Mirror(const uint8* src_y, int src_stride_y, // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = ARGBMirrorRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -651,6 +1007,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -666,8 +1030,8 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, // the same blend function for all pixels if possible. LIBYUV_API ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = ARGBBlendRow_C; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBBlendRow = ARGBBlendRow_SSSE3; @@ -678,19 +1042,28 @@ ARGBBlendRow GetARGBBlend() { if (TestCpuFlag(kCpuHasNEON)) { ARGBBlendRow = ARGBBlendRow_NEON; } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } #endif return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. 
LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = GetARGBBlend(); + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -701,8 +1074,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -720,14 +1092,20 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, // Alpha Blend plane and store to destination. LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { return -1; } @@ -739,10 +1117,8 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, } // Coalesce rows for Y plane. - if (src_stride_y0 == width && - src_stride_y1 == width && - alpha_stride == width && - dst_stride_y == width) { + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; @@ -750,7 +1126,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { BlendPlaneRow = BlendPlaneRow_SSSE3; } @@ -758,7 +1134,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #endif #if defined(HAS_BLENDPLANEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; + BlendPlaneRow = BlendPlaneRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { BlendPlaneRow = BlendPlaneRow_AVX2; } @@ -778,24 +1154,36 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #define MAXTWIDTH 2048 // Alpha Blend YUV images and store to destination. 
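GetARGBBlend() above exposes the row-function picker so a caller issuing many small blends can hoist the CPU dispatch out of its loop. Hypothetical usage, assuming the ARGBBlendRow function-pointer typedef from planar_functions.h:

ARGBBlendRow blend = GetARGBBlend();  // resolve SSSE3/NEON/MSA/C once
for (int y = 0; y < height; ++y) {
  blend(fg + y * fg_stride, bg + y * bg_stride, dst + y * dst_stride, width);
}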
LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; // Half width/height for UV. int halfwidth = (width + 1) >> 1; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -809,11 +1197,8 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, } // Blend Y plane. - BlendPlane(src_y0, src_stride_y0, - src_y1, src_stride_y1, - alpha, alpha_stride, - dst_y, dst_stride_y, - width, height); + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -893,13 +1278,17 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, // Multiply 2 ARGB images and store to destination. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBMultiplyRow_C; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -910,8 +1299,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
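I420Blend above resolves the plane-size mismatch of YUV 4:2:0: the Y plane is blended at full resolution, while the alpha plane is first reduced 2x2 with ScaleRowDown2Box (averaging each 2x2 quad with rounding) so one alpha value lines up with each half-resolution U/V sample. Worked example of the box reduction, assuming the usual libyuv rounding term:

  alpha quad  100 101   ->  (100 + 101 + 102 + 103 + 2) >> 2 = 102
              102 103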
- if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -941,6 +1329,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -954,12 +1350,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, // Add 2 ARGB images and store to destination. LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBAddRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -971,8 +1371,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -1007,6 +1406,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -1020,13 +1427,17 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, // Subtract 2 ARGB images and store to destination. LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBSubtractRow_C; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1037,8 +1448,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -1068,6 +1478,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -1079,21 +1497,23 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, return 0; } // Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1126,13 +1546,12 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } } #endif @@ -1148,48 +1567,55 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I422 to BGRA. 
LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert NV12 to RGB565. LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*NV12ToRGB565Row)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || - width <= 0 || height == 0) { + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1222,6 +1648,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); @@ -1236,14 +1670,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // Convert RAW to RGB24. LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; - void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) = + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = RAWToRGB24Row_C; - if (!src_raw || !dst_rgb24 || - width <= 0 || height == 0) { + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1253,8 +1689,7 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. 
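I422ToBGRA above reuses the RGBA writer by swapping the chroma planes and selecting kYvuI601Constants, whose coefficients are arranged for the swapped order, so a single row kernel serves both byte orders. In effect:

  I422ToBGRA(y, ys, u, us, v, vs, dst, ds, w, h)
      == I422ToRGBAMatrix(y, ys, v, vs, u, us, dst, ds,
                          &kYvuI601Constants, w, h);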
- if (src_stride_raw == width * 3 && - dst_stride_rgb24 == width * 3) { + if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_raw = dst_stride_rgb24 = 0; @@ -1275,6 +1710,14 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -1285,11 +1728,13 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - uint32 value) { +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value) { int y; - void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C; + void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1322,6 +1767,11 @@ void SetPlane(uint8* dst_y, int dst_stride_y, SetRow = SetRow_ERMS; } #endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1332,22 +1782,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y, // Draw a rectangle into I420 LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, - int width, int height, - int value_y, int value_u, int value_v) { +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - uint8* start_y = dst_y + y * dst_stride_y + x; - uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - if (!dst_y || !dst_u || !dst_v || - width <= 0 || height == 0 || - x < 0 || y < 0 || - value_y < 0 || value_y > 255 || - value_u < 0 || value_u > 255 || + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { return -1; } @@ -1360,15 +1814,17 @@ int I420Rect(uint8* dst_y, int dst_stride_y, // Draw a rectangle into ARGB LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height, - uint32 value) { +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value) { int y; - void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C; - if (!dst_argb || - width <= 0 || height == 0 || - dst_x < 0 || dst_y < 0) { + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; + if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } if (height < 0) { @@ -1397,6 +1853,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, ARGBSetRow = ARGBSetRow_X86; } #endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = 
ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1420,11 +1884,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, // f is foreground pixel premultiplied by alpha LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1435,8 +1902,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1465,6 +1931,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -1476,11 +1950,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBUnattenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1491,8 +1968,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1513,7 +1989,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, } } #endif -// TODO(fbarchard): Neon version. + // TODO(fbarchard): Neon version. for (y = 0; y < height; ++y) { ARGBUnattenuateRow(src_argb, dst_argb, width); @@ -1525,12 +2001,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, // Convert ARGB to Grayed ARGB. 
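Every MSA block introduced by this update follows the same runtime-dispatch shape as the NEON and SSE paths beside it: start from the portable C row function, switch to the _Any_ variant when TestCpuFlag reports MSA so arbitrary widths still work, then switch again to the full-vector kernel when the width divides evenly. A minimal sketch of that shape, with a hypothetical FooRow family standing in for any of the row functions touched in these hunks:

    int y;
    // FooRow is a hypothetical name; the pattern matches the MSA blocks above.
    void (*FooRow)(const uint8_t* src, uint8_t* dst, int width) = FooRow_C;
    #if defined(HAS_FOOROW_MSA)
      if (TestCpuFlag(kCpuHasMSA)) {
        FooRow = FooRow_Any_MSA;  // any width; the ragged tail falls back to C
        if (IS_ALIGNED(width, 16)) {
          FooRow = FooRow_MSA;    // rows are whole vectors, no tail handling
        }
      }
    #endif
    for (y = 0; y < height; ++y) {
      FooRow(src, dst, width);
      src += src_stride;
      dst += dst_stride;
    }

The alignment constant is per-kernel (4, 8, 16 or 32 in the hunks here), matching how many pixels one MSA iteration consumes.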
LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1540,8 +2019,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1556,6 +2034,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -1567,13 +2050,16 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height) { +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1593,6 +2079,12 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); dst += dst_stride_argb; @@ -1602,11 +2094,15 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, // Make a rectangle of ARGB Sepia tone. LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1626,6 +2122,12 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, ARGBSepiaRow = ARGBSepiaRow_NEON; } #endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); dst += dst_stride_argb; @@ -1636,13 +2138,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, // Apply a 4x4 matrix to each ARGB pixel. // Note: Normally for shading, but can be used to swizzle or invert. 
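The 4x4-matrix note above is the contract for ARGBColorMatrix, whose hunk follows, and the RGBColorMatrix wrapper after it shows the fixed-point convention: signed 8-bit coefficients with 64 meaning 1.0, one matrix row per output channel, and (0, 0, 0, 64) appended so alpha passes through. Under that convention the per-pixel work is a dot product and a 6-bit shift; a scalar sketch, where Clamp255 stands in for the library's saturating store and libyuv ARGB is B, G, R, A in memory:

    static int Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    // One pixel through a 4x4 matrix in 6-bit fixed point (64 == 1.0).
    void ColorMatrixPixel(const uint8_t src[4], uint8_t dst[4],
                          const int8_t m[16]) {
      int b = src[0], g = src[1], r = src[2], a = src[3];
      dst[0] = Clamp255((b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6);
      dst[1] = Clamp255((b * m[4] + g * m[5] + r * m[6] + a * m[7]) >> 6);
      dst[2] = Clamp255((b * m[8] + g * m[9] + r * m[10] + a * m[11]) >> 6);
      dst[3] = Clamp255((b * m[12] + g * m[13] + r * m[14] + a * m[15]) >> 6);
    }

An identity is {64,0,0,0, 0,64,0,0, 0,0,64,0, 0,0,0,64}; permuting rows swizzles channels and negative coefficients invert, which is what the swizzle-or-invert note means.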
LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int width, int height) { +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height) { int y; - void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = + ARGBColorMatrixRow_C; if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } @@ -1652,8 +2158,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1667,6 +2172,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } #endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); @@ -1679,13 +2189,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, // Apply a 4x3 matrix to each ARGB pixel. // Deprecated. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, - int dst_x, int dst_y, int width, int height) { - SIMD_ALIGNED(int8 matrix_argb[16]); - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height) { + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } @@ -1705,23 +2219,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; matrix_argb[15] = 64; // 1.0 - return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, - dst, dst_stride_argb, - &matrix_argb[0], width, height); + return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, + dst_stride_argb, &matrix_argb[0], width, height); } // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. 
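For the color-table functions that follow, the 256-entry ARGB table is 1024 bytes with one lane per channel, so each channel of a pixel indexes the table independently. A plausible scalar reading of that contract (the lane ordering is an assumption for illustration; RGBColorTable below is the same lookup minus the alpha write, which is how it preserves destination alpha):

    // One ARGB pixel through a 256-entry ARGB table, updated in place.
    // Hypothetical helper; lane layout assumed, not taken from this patch.
    void ColorTablePixel(uint8_t p[4], const uint8_t* table_argb) {
      p[0] = table_argb[p[0] * 4 + 0];  // B looks up the B lane
      p[1] = table_argb[p[1] * 4 + 1];  // G looks up the G lane
      p[2] = table_argb[p[2] * 4 + 2];  // R looks up the R lane
      p[3] = table_argb[p[3] * 4 + 3];  // A looks up the A lane
    }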
LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1745,15 +2262,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1784,13 +2305,19 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb, // Caveat - although SSE2 saturates, the C function does not and should be used // with care if doing anything but quantization. LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int dst_x, int dst_y, int width, int height) { +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || interval_size < 1 || interval_size > 255) { return -1; @@ -1810,6 +2337,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } +#endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } #endif for (y = 0; y < height; ++y) { ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); @@ -1821,13 +2353,17 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. 
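The cumulative-sum table described above is a summed-area table: per channel, entry (x, y) holds the sum of every sample above and to the left. That is what lets ARGBBlur, further down, average an arbitrary box in constant time per pixel, because any rectangle's sum is a four-corner combination of table entries. A one-channel sketch of the lookup (illustrative layout; the real table interleaves four int32_t values per pixel, and the blur loop special-cases the left- and right-clipped columns instead of padding the table):

    // Sum over the box [left, right) x [top, bot), where
    // cumsum[y * stride + x] is the sum of all samples above and left.
    int32_t BoxSum(const int32_t* cumsum, int stride,
                   int left, int top, int right, int bot) {
      return cumsum[bot * stride + right] - cumsum[top * stride + right] -
             cumsum[bot * stride + left] + cumsum[top * stride + left];
    }

The box average is then BoxSum / area, which CumulativeSumToAverageRow amortizes across a whole output row; the circular buffer mentioned before ARGBBlur keeps only radius * 2 + 2 rows of the table resident.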
LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height) { +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { int y; - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - int32* previous_cumsum = dst_cumsum; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + int32_t* previous_cumsum = dst_cumsum; if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } @@ -1851,18 +2387,25 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius) { +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { int y; - void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; - int32* cumsum_bot_row; - int32* max_cumsum_bot_row; - int32* cumsum_top_row; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1889,9 +2432,8 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, #endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. - ARGBComputeCumulativeSum(src_argb, src_stride_argb, - dst_cumsum, dst_stride32_cumsum, - width, radius); + ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); src_argb = src_argb + radius * src_stride_argb; cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; @@ -1917,7 +2459,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Increment cumsum_bot_row pointer with circular buffer wrap around and // then fill in a row of CumulativeSum. if ((y + radius) < height) { - const int32* prev_cumsum_bot_row = cumsum_bot_row; + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; cumsum_bot_row += dst_stride32_cumsum; if (cumsum_bot_row >= max_cumsum_bot_row) { cumsum_bot_row = dst_cumsum; @@ -1929,24 +2471,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Left clipped. 
for (x = 0; x < radius + 1; ++x) { - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. n = (width - 1) - radius - x + 1; - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } @@ -1955,12 +2497,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Multiply ARGB image by a specified ARGB value. LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value) { +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value) { int y; - void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, - int width, uint32 value) = ARGBShadeRow_C; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { return -1; } @@ -1970,8 +2516,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1986,6 +2531,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, ARGBShadeRow = ARGBShadeRow_NEON; } #endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -1997,12 +2547,17 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, // Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation) { +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { int y; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -2015,9 +2570,7 @@ int InterpolatePlane(const uint8* src0, int src_stride0, dst_stride = -dst_stride; } // Coalesce rows. 
- if (src_stride0 == width && - src_stride1 == width && - dst_stride == width) { + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { width *= height; height = 1; src_stride0 = src_stride1 = dst_stride = 0; @@ -2046,13 +2599,12 @@ int InterpolatePlane(const uint8* src0, int src_stride0, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) && - IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) && - IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif @@ -2067,61 +2619,71 @@ int InterpolatePlane(const uint8* src0, int src_stride0, // Interpolate 2 ARGB images by specified amount (0 to 255). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation) { - return InterpolatePlane(src_argb0, src_stride_argb0, - src_argb1, src_stride_argb1, - dst_argb, dst_stride_argb, +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, width * 4, height, interpolation); } // Interpolate 2 YUV images by specified amount (0 to 255). LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation) { +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src0_y || !src0_u || !src0_v || - !src1_y || !src1_u || !src1_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } - InterpolatePlane(src0_y, src0_stride_y, - src1_y, src1_stride_y, - dst_y, dst_stride_y, - width, height, interpolation); - InterpolatePlane(src0_u, src0_stride_u, - src1_u, src1_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight, interpolation); - InterpolatePlane(src0_v, src0_stride_v, - src1_v, src1_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight, interpolation); + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, 
halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); return 0; } // Shuffle ARGB channel order. e.g. BGRA to ARGB. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height) { +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height) { int y; - void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, - const uint8* shuffler, int width) = ARGBShuffleRow_C; - if (!src_bgra || !dst_argb || - width <= 0 || height == 0) { + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2131,20 +2693,11 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_stride_bgra = -src_stride_bgra; } // Coalesce rows. - if (src_stride_bgra == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_bgra = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHUFFLEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBShuffleRow = ARGBShuffleRow_SSE2; - } - } -#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; @@ -2169,6 +2722,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -2179,28 +2740,32 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. -static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, - void (*SobelRow)(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst, int width)) { +static int ARGBSobelize(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, + int width)) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = ARGBToYJRow_C; - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) = SobelYRow_C; - void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobely, int width) = + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = SobelXRow_C; const int kEdge = 16; // Extra pixels at start of row for extrude/align. - if (!src_argb || !dst_argb || width <= 0 || height == 0) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } @@ -2228,6 +2793,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2239,6 +2812,11 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, SobelYRow = SobelYRow_NEON; } #endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -2248,19 +2826,24 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasNEON)) { SobelXRow = SobelXRow_NEON; } +#endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } #endif { // 3 rows with edges before/after. const int kRowSize = (width + kEdge + 31) & ~31; align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); - uint8* row_sobelx = rows; - uint8* row_sobely = rows + kRowSize; - uint8* row_y = rows + kRowSize * 2; + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + kRowSize; + uint8_t* row_y = rows + kRowSize * 2; // Convert first row. - uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kRowSize; - uint8* row_y2 = row_y1 + kRowSize; + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + kRowSize; + uint8_t* row_y2 = row_y1 + kRowSize; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. @@ -2284,7 +2867,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Cycle thru circular queue of 3 row_y buffers. { - uint8* row_yt = row_y0; + uint8_t* row_yt = row_y0; row_y0 = row_y1; row_y1 = row_y2; row_y2 = row_yt; @@ -2299,11 +2882,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelRow_C; +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelRow = SobelRow_Any_SSE2; @@ -2319,6 +2905,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, SobelRow = SobelRow_NEON; } } +#endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); @@ -2326,11 +2920,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect with planar output. 
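ARGBSobelize above drives all three Sobel variants: each source row is reduced to luma with ARGBToYJRow, a circular queue of three luma rows slides down the image (row_y0/row_y1/row_y2, with edges extruded for the window), and SobelXRow/SobelYRow produce absolute gradients that the per-variant combine row packs into the output. Per pixel this is the classic 3x3 Sobel operator; a scalar sketch of the arithmetic (the library's row helpers use a shifted indexing convention, so treat this as illustrative rather than the exact row contract):

    // y0/y1/y2 are the luma rows above, at, and below the output row.
    uint8_t SobelPixel(const uint8_t* y0, const uint8_t* y1,
                       const uint8_t* y2, int x) {
      int gx = -y0[x - 1] + y0[x + 1] - 2 * y1[x - 1] + 2 * y1[x + 1] -
               y2[x - 1] + y2[x + 1];
      int gy = -y0[x - 1] - 2 * y0[x] - y0[x + 1] + y2[x - 1] + 2 * y2[x] +
               y2[x + 1];
      int sobel = (gx < 0 ? -gx : gx) + (gy < 0 ? -gy : gy);  // |gx| + |gy|
      return (uint8_t)(sobel > 255 ? 255 : sobel);
    }

ARGBSobel writes the clamped sum to B, G and R; the ToPlane and XY variants below reuse the same machinery and differ only in the combine row passed in.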
LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { - void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_, int width) = SobelToPlaneRow_C; +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; @@ -2347,18 +2944,29 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, } } #endif - return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, - width, height, SobelToPlaneRow); +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); } // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelXYRow_C; +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXYRow = SobelXYRow_Any_SSE2; @@ -2374,6 +2982,14 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, SobelXYRow = SobelXYRow_NEON; } } +#endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); @@ -2381,26 +2997,27 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, // Apply a 4x4 polynomial to each ARGB pixel. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, const float* poly, - int width, int height) { + int width, + int height) { int y; - void (*ARGBPolynomialRow)(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) = ARGBPolynomialRow_C; + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2425,28 +3042,132 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + // Apply a lumacolortable to each ARGB pixel. 
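HalfFloatPlane is one of the genuinely new entry points in this update: it multiplies each 16-bit sample by scale and stores IEEE half floats, with SSE2, AVX2, F16C, NEON and MSA paths and a dedicated scale == 1.0f kernel where one exists. Strides are in bytes, as elsewhere in the API; the function halves them internally to step in uint16_t units. A usage sketch for 10-bit samples, normalizing 0..1023 to 0.0..1.0 (the include path is an assumption; the declaration lives in libyuv's planar-functions header):

    #include <stdint.h>
    #include "libyuv/planar_functions.h"  // assumed location of the prototype

    int TenBitToHalf(const uint16_t* src, uint16_t* dst,
                     int width, int height) {
      return HalfFloatPlane(src, width * 2,  // src stride in bytes
                            dst, width * 2,  // dst stride in bytes
                            1.0f / 1023.0f,  // scale applied before conversion
                            width, height);  // 0 on success, -1 on bad args
    }

ByteToFloat, added alongside it, is the single-row 8-bit-to-float analogue and takes no stride at all.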
LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const uint8* luma, - int width, int height) { +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height) { int y; - void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, - int width, const uint8* luma, const uint32 lumacoeff) = - ARGBLumaColorTableRow_C; + void (*ARGBLumaColorTableRow)( + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2467,12 +3188,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, // Copy Alpha from one ARGB image to another. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBCopyAlphaRow_C; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2483,8 +3207,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2516,55 +3239,73 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, // Extract just the alpha channel from ARGB. LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride, - uint8* dst_a, int dst_stride, - int width, int height) { +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { if (!src_argb || !dst_a || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb += (height - 1) * src_stride; - src_stride = -src_stride; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride == width * 4 && dst_stride == width) { + if (src_stride_argb == width * 4 && dst_stride_a == width) { width *= height; height = 1; - src_stride = dst_stride = 0; + src_stride_argb = dst_stride_a = 0; } - void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) = - ARGBExtractAlphaRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? 
ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif #if defined(HAS_ARGBEXTRACTALPHAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON : ARGBExtractAlphaRow_Any_NEON; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); - src_argb += src_stride; - dst_a += dst_stride; + src_argb += src_stride_argb; + dst_a += dst_stride_a; } return 0; } // Copy a planar Y channel to the alpha channel of a destination ARGB image. LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2575,8 +3316,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -2610,20 +3350,22 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, // directly. A SplitUVRow_Odd function could copy the remaining chroma. LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_yuy2 || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
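The context lines in these hunks keep restating a library-wide convention: a negative height requests a vertically flipped result. Every entry point implements it the same way before doing any work, so callers get a flip for free. The idiom in isolation, with generic names:

    // Negative height means invert the image.
    if (height < 0) {
      height = -height;
      src = src + (height - 1) * src_stride;  // start at the last row
      src_stride = -src_stride;               // and walk upward
    }

After this the ordinary top-to-bottom row loop emits the flipped image with no extra copies, and the rotation code further down builds on the same trick.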
@@ -2656,6 +3398,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2680,6 +3430,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; @@ -2708,20 +3466,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_uyvy || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2754,6 +3514,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2778,6 +3546,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; diff --git a/libs/libvpx/third_party/libyuv/source/rotate.cc b/libs/libvpx/third_party/libyuv/source/rotate.cc index 01ea5c4074..f2bed85b75 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate.cc @@ -10,8 +10,8 @@ #include "libyuv/rotate.h" -#include "libyuv/cpu_id.h" #include "libyuv/convert.h" +#include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate_row.h" #include "libyuv/row.h" @@ -22,12 +22,20 @@ extern "C" { #endif LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { int i = height; - void (*TransposeWx8)(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX16_MSA) + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const 
uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; @@ -49,24 +57,32 @@ void TransposePlane(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeWx8 = TransposeWx8_Fast_DSPR2; - } else { - TransposeWx8 = TransposeWx8_DSPR2; +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; } } #endif +#if defined(HAS_TRANSPOSEWX16_MSA) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; // Go down 16 rows. + dst += 16; // Move over 16 columns. + i -= 16; + } +#else // Work across the source in 8x8 tiles while (i >= 8) { TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); @@ -74,9 +90,12 @@ void TransposePlane(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Rotate by 90 is a transpose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. @@ -86,9 +105,12 @@ void RotatePlane90(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Rotate by 270 is a transpose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. @@ -98,17 +120,20 @@ void RotatePlane270(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. 
align_buffer_64(row, width); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src + src_stride * (height - 1); + uint8_t* dst_bot = dst + dst_stride * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; @@ -133,12 +158,12 @@ void RotatePlane180(const uint8* src, int src_stride, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { - MirrorRow = MirrorRow_DSPR2; +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } } #endif #if defined(HAS_COPYROW_SSE2) @@ -161,11 +186,6 @@ void RotatePlane180(const uint8* src, int src_stride, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { @@ -181,15 +201,24 @@ void RotatePlane180(const uint8* src, int src_stride, } LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i = height; - void (*TransposeUVWx8)(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +#if defined(HAS_TRANSPOSEUVWX16_MSA) + void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, + int width) = TransposeUVWx16_C; +#else + void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; +#endif #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; @@ -203,72 +232,90 @@ void TransposeUV(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEUVWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeUVWx8 = TransposeUVWx8_DSPR2; +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } } #endif +#if defined(HAS_TRANSPOSEUVWX16_MSA) + // Work through the source in 8x8 tiles. + while (i >= 16) { + TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 16 * src_stride; // Go down 16 rows. + dst_a += 16; // Move over 8 columns. + dst_b += 16; // Move over 8 columns. + i -= 16; + } +#else // Work through the source in 8x8 tiles. 
while (i >= 8) { - TransposeUVWx8(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); - src += 8 * src_stride; // Go down 8 rows. - dst_a += 8; // Move over 8 columns. - dst_b += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { - TransposeUVWxH_C(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, i); } } LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { src += src_stride * (height - 1); src_stride = -src_stride; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { dst_a += dst_stride_a * (width - 1); dst_b += dst_stride_b * (width - 1); dst_stride_a = -dst_stride_a; dst_stride_b = -dst_stride_b; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } // Rotate 180 is a horizontal and vertical flip. 
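The rotate changes keep libyuv's original decomposition intact: rotate-90 is a transpose with the source read bottom to top, rotate-270 is a transpose with the destination written bottom to top, and rotate-180 is a per-row mirror plus a vertical flip, exactly as the comments in the hunks above state. The negative-stride idiom does all of the flipping; RotatePlane90 above amounts to:

    // Rotate by 90: flip the source vertically, then transpose.
    void Rotate90Sketch(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride,
                        int width, int height) {
      src += src_stride * (height - 1);  // point at the last source row
      TransposePlane(src, -src_stride, dst, dst_stride, width, height);
    }

RotateUV90 and RotateUV270 above apply the same trick through TransposeUV, which splits the interleaved UV source into separate U and V destinations as it transposes.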
LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; - void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = - MirrorUVRow_C; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_NEON; @@ -279,10 +326,9 @@ void RotateUV180(const uint8* src, int src_stride, MirrorUVRow = MirrorUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - MirrorUVRow = MirrorUVRow_DSPR2; +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_MSA; } #endif @@ -298,9 +344,12 @@ void RotateUV180(const uint8* src, int src_stride, } LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height, +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, enum RotationMode mode) { if (!src || width <= 0 || height == 0 || !dst) { return -1; @@ -316,24 +365,16 @@ int RotatePlane(const uint8* src, int src_stride, switch (mode) { case kRotate0: // copy frame - CopyPlane(src, src_stride, - dst, dst_stride, - width, height); + CopyPlane(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate90: - RotatePlane90(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane90(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate270: - RotatePlane270(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane270(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate180: - RotatePlane180(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane180(src, src_stride, dst, dst_stride, width, height); return 0; default: break; @@ -342,18 +383,25 @@ int RotatePlane(const uint8* src, int src_stride, } LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { return -1; } @@ -372,45 +420,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return I420Copy(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, 
dst_stride_u, + dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane90(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane90(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane270(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane270(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane180(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane180(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; default: break; @@ -419,17 +451,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y, } LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || + !dst_v) { return -1; } @@ -446,38 +484,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return NV12ToI420(src_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV90(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV270(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, 
width, height); + RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV180(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; default: break; diff --git a/libs/libvpx/third_party/libyuv/source/rotate_any.cc b/libs/libvpx/third_party/libyuv/source/rotate_any.cc index 31a74c3155..c2752e6222 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_any.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_any.cc @@ -18,16 +18,16 @@ namespace libyuv { extern "C" { #endif -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst, int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\ - } +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } #ifdef HAS_TRANSPOSEWX8_NEON TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) @@ -38,25 +38,23 @@ TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif -#ifdef HAS_TRANSPOSEWX8_DSPR2 -TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst_a, int dst_stride_a, \ - uint8* dst_b, int dst_stride_b, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \ - n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, \ - dst_a + n * dst_stride_a, dst_stride_a, \ - dst_b + n * dst_stride_b, dst_stride_b, r); \ - } + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) @@ -64,8 +62,8 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_DSPR2 -TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) #endif #undef TUVANY @@ -73,8 +71,3 @@ TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) } // extern "C" } // namespace libyuv #endif - - - - - diff --git 
a/libs/libvpx/third_party/libyuv/source/rotate_argb.cc b/libs/libvpx/third_party/libyuv/source/rotate_argb.cc index 787c0ad1be..5a6e05376f 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_argb.cc @@ -10,90 +10,106 @@ #include "libyuv/rotate.h" -#include "libyuv/cpu_id.h" #include "libyuv/convert.h" +#include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/row.h" +#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// ARGBScale has a function to copy pixels to a row, striding each source -// pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); -#endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); -#endif - -void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, - int src_stepx, uint8* dst_ptr, int dst_width); - -static void ARGBTranspose(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +static void ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int i; - int src_pixel_step = src_stride >> 2; - void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, - int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; + int src_pixel_step = src_stride_argb >> 2; + void (*ScaleARGBRowDownEven)( + const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; + } } #endif for (i = 0; i < width; ++i) { // column of source to row of dest. 
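    // [Editor's note, not part of the patch] ScaleARGBRowDownEven copies
    // every src_step-th ARGB pixel into a contiguous output row. Because
    // src_pixel_step is the stride measured in whole pixels
    // (src_stride_argb >> 2) and dst_width is passed as height, each call
    // below gathers one full source column into one destination row, so
    // this loop alone performs the transpose.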
-    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
-    dst += dst_stride;
-    src += 4;
+    ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+    dst_argb += dst_stride_argb;
+    src_argb += 4;
   }
 }
 
-void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate90(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   // Rotate by 90 is an ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  src_argb += src_stride_argb * (height - 1);
+  src_stride_argb = -src_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
 }
 
-void ARGBRotate270(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate270(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   // Rotate by 270 is an ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
-  dst += dst_stride * (width - 1);
-  dst_stride = -dst_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  dst_argb += dst_stride_argb * (width - 1);
+  dst_stride_argb = -dst_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
 }
 
-void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate180(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width * 4);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+  uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
       ARGBMirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      CopyRow_C;
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
@@ -118,6 +134,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -138,28 +162,27 @@ void ARGBRotate180(const uint8* src, int src_stride,
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Odd height will harmlessly mirror the middle row twice.
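  // [Editor's note, not part of the patch] Each pass of the loop below
  // handles one row pair (y, height - 1 - y), roughly:
  //   ARGBMirrorRow(src_row_top, row, width);          // save mirrored top
  //   ARGBMirrorRow(src_row_bot, dst_row_top, width);  // bottom -> top slot
  //   CopyRow(row, dst_row_bot, width * 4);            // saved top -> bottom
  // Mirroring every row while swapping it with its opposite row is exactly a
  // 180 degree rotation, and staging the top row in a temporary buffer
  // appears to keep the swap safe even when rotating in place (src == dst).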
for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src, row, width); // Mirror first row into a buffer - ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); } LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, int width, int height, +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, enum RotationMode mode) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) { return -1; @@ -175,23 +198,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb, switch (mode) { case kRotate0: // copy frame - return ARGBCopy(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: - ARGBRotate90(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate270: - ARGBRotate270(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate180: - ARGBRotate180(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; default: break; diff --git a/libs/libvpx/third_party/libyuv/source/rotate_common.cc b/libs/libvpx/third_party/libyuv/source/rotate_common.cc index b33a9a0c6e..ff212adebc 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_common.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_common.cc @@ -8,16 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { int i; for (i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; @@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride, } } -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { int i; for (i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; @@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride, } } -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { int i; for (i = 0; i < width; ++i) { int j; @@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride, } } -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; for (i = 0; i < width * 2; i += 2) { int j; diff --git a/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc b/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc index cbe870caa7..04e19e29ee 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -22,342 +22,348 @@ extern "C" { // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" - ); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. 
+ "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); } #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. #if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9" - ); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
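      // [Editor's note, not part of the patch] The source rows here are
      // interleaved U/V pairs, so the punpck ladder below transposes and
      // deinterleaves in one pass: after three rounds each xmm register
      // holds a transposed U row in its low qword and the matching V row in
      // its high qword, written out separately via movlpd (U) and movhpd (V).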
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) #endif // defined(__x86_64__) || defined(__i386__) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_mips.cc b/libs/libvpx/third_party/libyuv/source/rotate_mips.cc deleted file mode 100644 index 1e8ce25197..0000000000 --- a/libs/libvpx/third_party/libyuv/source/rotate_mips.cc +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/rotate_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "sw $s0, 0(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "sw $s1, 4(%[dst]) \n" - "bnez %[width], 1b \n" - " addu %[dst], %[dst], %[dst_stride] \n" - "b 2f \n" -//dst + dst_stride unaligned - "11: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "swr $s0, 0(%[dst]) \n" - "swl $s0, 3(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "swr $s1, 4(%[dst]) \n" - "swl $s1, 7(%[dst]) \n" - "bnez %[width], 11b \n" - "addu %[dst], %[dst], %[dst_stride] \n" - "2: \n" - ".set pop \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1" - ); -} - -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( - ".set noat \n" - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - - "srl $AT, %[width], 0x2 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 
11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "sw $s4, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $s6, 0($s0) \n" - "sw $t8, 4($s0) \n" - "sw $s5, 0($s1) \n" - "sw $t1, 4($s1) \n" - "sw $s7, 0($s2) \n" - "sw $t9, 4($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 1b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "b 2f \n" -//dst + dst_stride unaligned - "11: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "swr $s4, 0(%[dst]) \n" - "swl $s4, 3(%[dst]) \n" - 
"swr $t0, 4(%[dst]) \n" - "swl $t0, 7(%[dst]) \n" - "swr $s6, 0($s0) \n" - "swl $s6, 3($s0) \n" - "swr $t8, 4($s0) \n" - "swl $t8, 7($s0) \n" - "swr $s5, 0($s1) \n" - "swl $s5, 3($s1) \n" - "swr $t1, 4($s1) \n" - "swl $t1, 7($s1) \n" - "swr $s7, 0($s2) \n" - "swl $s7, 3($s2) \n" - "swr $t9, 4($s2) \n" - "swl $t9, 7($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 11b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "2: \n" - ".set pop \n" - ".set at \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" - ); -} - -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "subu $t7, $t9, %[src_stride] \n" - "srl $t1, %[width], 1 \n" - -// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b - "andi $t0, %[dst_a], 0x3 \n" - "andi $t8, %[dst_b], 0x3 \n" - "or $t0, $t0, $t8 \n" - "andi $t8, %[dst_stride_a], 0x3 \n" - "andi $s5, %[dst_stride_b], 0x3 \n" - "or $t8, $t8, $s5 \n" - "or $t0, $t0, $t8 \n" - "bnez $t0, 11f \n" - " nop \n" -// dst + dst_stride word aligned (both, a & b dst addresses) - "1: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "sw $s3, 0($s5) \n" - "sw $s4, 0($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "sw $s3, 0(%[dst_a]) \n" - "sw $s4, 0(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - "sw $s3, 4($s5) \n" - "sw $s4, 4($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "sw $s3, 4(%[dst_a]) \n" - "sw $s4, 4(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 1b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - "b 2f \n" - " nop \n" - -// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned - "11: \n" - "lw $t0, 
0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "swr $s3, 0($s5) \n" - "swl $s3, 3($s5) \n" - "swr $s4, 0($s6) \n" - "swl $s4, 3($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "swr $s3, 0(%[dst_a]) \n" - "swl $s3, 3(%[dst_a]) \n" - "swr $s4, 0(%[dst_b]) \n" - "swl $s4, 3(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - - "swr $s3, 4($s5) \n" - "swl $s3, 7($s5) \n" - "swr $s4, 4($s6) \n" - "swl $s4, 7($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "swr $s3, 4(%[dst_a]) \n" - "swl $s3, 7(%[dst_a]) \n" - "swr $s4, 4(%[dst_b]) \n" - "swl $s4, 7(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 11b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - - "2: \n" - ".set pop \n" - : [src] "+r" (src), - [dst_a] "+r" (dst_a), - [dst_b] "+r" (dst_b), - [width] "+r" (width), - [src_stride] "+r" (src_stride) - : [dst_stride_a] "r" (dst_stride_a), - [dst_stride_b] "r" (dst_stride_b) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libs/libvpx/third_party/libyuv/source/rotate_msa.cc b/libs/libvpx/third_party/libyuv/source/rotate_msa.cc new file mode 100644 index 0000000000..99bdca65b3 --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, 
src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = 
(v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_neon.cc b/libs/libvpx/third_party/libyuv/source/rotate_neon.cc index 1c22b472bc..fdc0dd476c 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_neon.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #include "libyuv/basic_types.h" @@ -21,38 +21,32 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %5, #8 \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld1.8 {d0}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d1}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d2}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d3}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d4}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d5}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d6}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d7}, [%0] \n" "vtrn.8 d1, d0 \n" @@ -77,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0] \n" "add %1, #8 \n" // src += 8 @@ -99,180 +85,138 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "subs %5, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %5, #8 \n" - "beq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %5, #2 \n" - "blt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" - "cmp %5, #4 \n" - "blt 2f \n" + "cmp %5, #4 \n" + "blt 2f \n" - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[1]}, [%0] \n" + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(6) - "vld1.8 {q3}, [%6] \n" + "vld1.8 {q3}, [%6] \n" - "vtbl.8 d4, {d0, d1}, d6 \n" - "vtbl.8 d5, {d0, d1}, d7 \n" - "vtbl.8 d0, {d2, d3}, d6 \n" - "vtbl.8 d1, {d2, d3}, d7 \n" + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. - MEMACCESS(0) - "vst1.32 {d4[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d4[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[1]}, [%0] \n" + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+ "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d0[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d0[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[1]}, [%0] \n" + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" - "add %1, #4 \n" // src += 4 - "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride - "subs %5, #4 \n" // w -= 4 - "beq 4f \n" + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %5, #2 \n" - "blt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.16 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[3]}, [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" - "vtrn.8 d0, d1 \n" + "vtrn.8 d0, d1 \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d1}, [%0] \n" + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" - "add %1, #2 \n" // src += 2 - "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride - "subs %5, #2 \n" // w -= 2 - "beq 4f \n" + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld1.8 {d0[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[7]}, [%1] \n" + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" + "vst1.64 {d0}, [%3] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst), // %3 - "+r"(dst_stride), // %4 - "+r"(width) // %5 - : "r"(&kVTbl4x4Transpose) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); } -static uvec8 kVTbl4x4TransposeDi = - { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %7, #8 \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld2.8 {d0, d1}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d2, d3}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d4, d5}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d6, d7}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d16, d17}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d18, d19}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d20, d21}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d22, d23}, [%0] \n" "vtrn.8 q1, q0 \n" @@ -301,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d18}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d16}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d22}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d20}, [%0] \n" "mov %0, %5 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d19}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d17}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d23}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d21}, [%0] \n" "add %1, #8*2 \n" // src += 8*2 @@ -343,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "subs %7, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %7, #8 \n" - "beq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %7, #8 \n" + "beq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %7, #2 \n" - "blt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" - "cmp %7, #4 \n" - "blt 2f \n" + "cmp %7, #4 \n" + "blt 2f \n" - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.64 {d0}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d1}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d2}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d3}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d4}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d5}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d6}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d7}, [%0] \n" + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" - MEMACCESS(8) - "vld1.8 {q15}, [%8] \n" + "vld1.8 {q15}, [%8] \n" - "vtrn.8 q0, q1 \n" - "vtrn.8 q2, q3 \n" + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" - "vtbl.8 d16, {d0, d1}, d30 \n" - "vtbl.8 d17, {d0, d1}, d31 \n" - "vtbl.8 d18, {d2, d3}, d30 \n" - "vtbl.8 d19, {d2, d3}, d31 \n" - "vtbl.8 d20, {d4, d5}, d30 \n" - "vtbl.8 d21, {d4, d5}, d31 \n" - "vtbl.8 d22, {d6, d7}, d30 \n" - "vtbl.8 d23, {d6, d7}, d31 \n" + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "vst1.32 {d16[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d16[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[1]}, [%0], %4 \n" + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d20[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d20[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[1]}, [%0] \n" + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" - "mov %0, %5 \n" + "mov %0, %5 \n" - MEMACCESS(0) - "vst1.32 {d18[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d18[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[1]}, [%0], %6 \n" + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" - "add %0, %5, #4 \n" - MEMACCESS(0) - "vst1.32 {d22[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d22[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[1]}, [%0] \n" + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" - "add %1, #4*2 \n" // src += 4 * 2 - "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %7, #4 \n" // w -= 4 - "beq 4f \n" + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 
4 + "beq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %7, #2 \n" - "blt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[3], d3[3]}, [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" - "vtrn.8 d0, d1 \n" - "vtrn.8 d2, d3 \n" + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d2}, [%0] \n" + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" - "mov %0, %5 \n" + "mov %0, %5 \n" - MEMACCESS(0) - "vst1.64 {d1}, [%0], %6 \n" - MEMACCESS(0) - "vst1.64 {d3}, [%0] \n" + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" - "add %1, #2*2 \n" // src += 2 * 2 - "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %7, #2 \n" // w -= 2 - "beq 4f \n" + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[7], d1[7]}, [%1] \n" + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" - MEMACCESS(5) - "vst1.64 {d1}, [%5] \n" + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst_a), // %3 - "+r"(dst_stride_a), // %4 - "+r"(dst_b), // %5 - "+r"(dst_stride_b), // %6 - "+r"(width) // %7 - : "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // defined(__ARM_NEON__) && !defined(__aarch64__) diff --git 
a/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc b/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc index 1ab448f3ab..f469baacf6 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #include "libyuv/basic_types.h" @@ -21,38 +21,32 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %3, %3, #8 \n" +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v7.8b}, [%0] \n" "trn2 v16.8b, v0.8b, v1.8b \n" @@ -84,456 +78,345 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "mov %0, %2 \n" - MEMACCESS(0) "st1 {v17.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v16.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v19.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v18.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v21.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v20.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v23.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %3, %3, #8 \n" // w -= 8 + "subs %w3, %w3, #8 \n" // w -= 8 "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %3, %3, #8 \n" - "b.eq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %w3, %w3, #8 \n" + "b.eq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %3, #2 \n" - "b.lt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" - "cmp %3, #4 \n" - "b.lt 2f \n" + "cmp %w3, #4 \n" + "b.lt 2f \n" - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[3], [%0] \n" + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(4) - "ld1 {v2.16b}, [%4] \n" + "ld1 {v2.16b}, [%4] \n" - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. - MEMACCESS(0) - "st1 {v3.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[3], [%0] \n" + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v0.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[3], [%0] \n" + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %3, %3, #4 \n" // w -= 4 - "b.eq 4f \n" + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %3, #2 \n" - "b.lt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[3], [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v3.8b}, [%0], %6 \n" - MEMACCESS(0) - "st1 {v2.8b}, [%0] \n" + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" - "add %1, %1, #2 \n" // 
src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %3, %3, #2 \n" // w -= 2 - "b.eq 4f \n" + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld1 {v0.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[7], [%1] \n" + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" - MEMACCESS(2) - "st1 {v0.8b}, [%2] \n" + "st1 {v0.8b}, [%2] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width64) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } -static uint8 kVTbl4x4TransposeDi[32] = - { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, - 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; +static const uint8_t kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, int width) { - const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %4, %4, #8 \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.16b}, [%0] \n" + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v16.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[1], [%0] \n" + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "st1 {v20.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v20.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[1], [%0] \n" + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 
{v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %4, %4, #8 \n" // w -= 8 - "b.ge 1b \n" + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %4, %4, #8 \n" - "b.eq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w4, %w4, #8 \n" + "b.eq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %4, #2 \n" - "b.lt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" - "cmp %4, #4 \n" - "b.lt 2f \n" + "cmp %w4, #4 \n" + "b.lt 2f \n" - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.8b}, [%0] \n" + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" - MEMACCESS(8) - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v16.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[3], [%0], %6 \n" + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v18.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[3], [%0] \n" + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "st1 {v17.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[3], [%0], %7 \n" + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" - "add %0, %3, #4 \n" - MEMACCESS(0) - "st1 {v19.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[2], [%0], %7 \n" - 
MEMACCESS(0) - "st1 {v19.s}[3], [%0] \n" + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %4, %4, #4 \n" // w -= 4 - "b.eq 4f \n" + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %4, #2 \n" - "b.lt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[3], [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v4.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v6.d}[0], [%0] \n" + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "st1 {v5.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v7.d}[0], [%0] \n" + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %4, %4, #2 \n" // w -= 2 - "b.eq 4f \n" + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[7], [%1] \n" + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" - MEMACCESS(2) - "st1 {v0.d}[0], [%2] \n" - MEMACCESS(3) - "st1 {v1.d}[0], [%3] \n" + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" - "4: \n" 
+ "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width64) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v30", "v31" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride_a)), // %6 + "r"(static_cast(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_win.cc b/libs/libvpx/third_party/libyuv/source/rotate_win.cc index 1300fc0feb..e887dd525c 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_win.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_win.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -17,17 +17,19 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -__declspec(naked) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { __asm { push edi push esi push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride mov edx, [esp + 12 + 12] // dst mov esi, [esp + 12 + 16] // dst_stride mov ecx, [esp + 12 + 20] // width @@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, } } -__declspec(naked) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w) { +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int w) { __asm { push ebx push esi push edi push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride mov edx, [esp + 16 + 12] // dst_a mov esi, [esp + 16 + 16] // dst_stride_a mov ebx, [esp + 16 + 20] // dst_b @@ -133,9 +137,9 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, mov ecx, [ecx + 16 + 28] // w align 4 - convertloop: // Read in the data from the source pointer. // First round of bit swap. + convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] @@ -162,13 +166,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea eax, [eax + 2 * edi] movdqu [esp], xmm5 // backup xmm5 neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. + movdqa xmm5, xmm6 // use xmm5 as temp register. punpcklbw xmm6, xmm7 punpckhbw xmm5, xmm7 movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi - // Second round of bit swap. + // Second round of bit swap. 
movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 @@ -183,12 +187,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, movdqa xmm6, xmm5 movdqu xmm5, [esp] // restore xmm5 movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. + movdqa xmm6, xmm5 // use xmm6 as temp register. punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. + + // Third round of bit swap. + // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 @@ -200,7 +205,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm4 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. + movdqa xmm0, xmm2 // use xmm0 as the temp register. punpckldq xmm2, xmm6 movlpd qword ptr [edx], xmm2 movhpd qword ptr [ebx], xmm2 @@ -209,7 +214,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. + movdqa xmm0, xmm1 // use xmm0 as the temp register. punpckldq xmm1, xmm5 movlpd qword ptr [edx], xmm1 movhpd qword ptr [ebx], xmm1 @@ -218,7 +223,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. + movdqa xmm0, xmm3 // use xmm0 as the temp register. punpckldq xmm3, xmm7 movlpd qword ptr [edx], xmm3 movhpd qword ptr [ebx], xmm3 diff --git a/libs/libvpx/third_party/libyuv/source/row_any.cc b/libs/libvpx/third_party/libyuv/source/row_any.cc index 494164fd02..e91560c44c 100644 --- a/libs/libvpx/third_party/libyuv/source/row_any.cc +++ b/libs/libvpx/third_party/libyuv/source/row_any.cc @@ -19,30 +19,38 @@ namespace libyuv { extern "C" { #endif +// memset for temp is meant to clear the source buffer (not dest) so that +// SIMD that reads a full multiple of 16 bytes will not trigger msan errors. +// memset is not needed for production, as the garbage values are processed but +// not used, although there may be edge cases for subsampling. +// The size of the buffer is based on the largest read, which can be inferred +// by the source type (e.g. ARGB) and the mask (last parameter), or by examining +// the source code for how much the source pointers are advanced. + // Subsampled source needs to be increased by 1 if not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) // Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - const uint8* a_buf, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422ALPHATOARGBROW_SSSE3 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) @@ -53,36 +61,57 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #ifdef HAS_I422ALPHATOARGBROW_NEON ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif #undef ANY41C // Any 3 planes to 1. 
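[Editor's note] The wrappers below all size their tail copies with the SS() helper defined just above, which is simply a round-up shift: SS(w, s) == ceil(w / 2^s). A few spot checks, illustrative and not part of the patch, make the intent concrete:

#include <assert.h>

#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

int main(void) {
  assert(SS(7, 1) == 4);  /* 7 luma pixels need 4 chroma samples in 4:2:2 */
  assert(SS(8, 1) == 4);  /* exact multiples are unchanged by rounding */
  assert(SS(7, 2) == 2);  /* quarter-width subsampling also rounds up */
  return 0;
}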
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif @@ -94,35 +123,38 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. 
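[Editor's note] Every ANY31-style wrapper in this file follows the same shape: run the SIMD kernel over the largest width that is a multiple of MASK + 1, then stage the remaining pixels in a zeroed buffer and run the kernel once more at full vector width. A condensed sketch of that pattern follows; Row3Fn and AnyWidth3 are illustrative names, not libyuv API, with UVSHIFT/DUVSHIFT fixed to 0 and the SIMD_ALIGNED attribute omitted for brevity:

#include <stdint.h>
#include <string.h>

typedef void (*Row3Fn)(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* dst, int width);

/* (mask + 1) * bpp must fit in one 64-byte lane, mirroring the macro's
 * buffer sizing. */
static void AnyWidth3(Row3Fn simd, const uint8_t* y, const uint8_t* u,
                      const uint8_t* v, uint8_t* dst, int width, int mask,
                      int bpp) {
  uint8_t temp[64 * 4];
  memset(temp, 0, 64 * 3);  /* zero the source lanes the kernel over-reads */
  int r = width & mask;     /* residual pixels */
  int n = width & ~mask;    /* largest multiple of (mask + 1) */
  if (n > 0) {
    simd(y, u, v, dst, n);  /* fast path over the aligned prefix */
  }
  memcpy(temp, y + n, r);   /* stage the tails in 64-byte lanes */
  memcpy(temp + 64, u + n, r);
  memcpy(temp + 128, v + n, r);
  simd(temp, temp + 64, temp + 128, temp + 192, mask + 1);
  memcpy(dst + n * bpp, temp + 192, r * bpp);  /* keep only r real pixels */
}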
// Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif -#ifdef HAS_I411TOARGBROW_SSSE3 -ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) @@ -130,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) #endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TORGB24ROW_AVX2 -ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif #ifdef HAS_I422TOARGBROW_AVX2 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) @@ -144,47 +176,87 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_I411TOARGBROW_AVX2 -ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) -#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 -ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 
1, 0, 2, 7) +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TOARGB1555ROW_AVX2 -ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TORGB565ROW_AVX2 -ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif #undef ANY31C +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + // Any 2 planes to 1. 
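[Editor's note] One aside on the ANY31CT variant just above, before the two-plane wrappers: it parameterizes the sample type T and bytes-per-sample SBPP so the tail copies scale correctly for the 10-bit I210 formats, whose samples occupy one uint16_t each. The residual arithmetic, shown concretely (illustrative only, not the libyuv API):

#include <stdint.h>
#include <string.h>

#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

/* Tail staging for I210 (10-bit 4:2:2, one uint16_t per sample):
 * r luma samples are r * sizeof(uint16_t) bytes, and the half-width
 * chroma tails are SS(r, 1) samples each, matching SS(r, UVSHIFT) * SBPP
 * in the macro. temp + 16 mirrors the 16-element lane spacing above. */
static void TailCopyI210(uint16_t temp[16 * 3], const uint16_t* y_buf,
                         const uint16_t* u_buf, const uint16_t* v_buf,
                         int n, int r) {
  memcpy(temp, y_buf + n, r * sizeof(uint16_t));
  memcpy(temp + 16, u_buf + (n >> 1), SS(r, 1) * sizeof(uint16_t));
  memcpy(temp + 32, v_buf + (n >> 1), SS(r, 1) * sizeof(uint16_t));
}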
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } // Merge functions. #ifdef HAS_MERGEUVROW_SSE2 @@ -196,6 +268,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif // Math functions. #ifdef HAS_ARGBMULTIPLYROW_SSE2 @@ -225,44 +300,61 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBSUBTRACTROW_NEON ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELROW_NEON ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) #endif #ifdef HAS_SOBELTOPLANEROW_NEON ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELXYROW_NEON ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif #undef ANY21 // Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - 
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + } // Biplanar to RGB. #ifdef HAS_NV12TOARGBROW_SSSE3 @@ -274,6 +366,9 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV12TOARGBROW_NEON ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif @@ -283,6 +378,27 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV21TOARGBROW_NEON ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_SSSE3 +ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TORGB24ROW_SSSE3 +ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_AVX2 +ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV21TORGB24ROW_AVX2 +ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif #ifdef HAS_NV12TORGB565ROW_SSSE3 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif @@ -292,22 +408,25 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #ifdef HAS_NV12TORGB565ROW_NEON ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) #endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif #undef ANY21C // Any 1 to 1. 
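The ANY11 wrapper that follows (and most wrappers below) sizes its tail copies with the SS() helper defined near the top of row_any.cc, which rounds a pixel count up after subsampling by a shift so the remainder copy always grabs enough chroma bytes. A few worked values, assuming the usual ((width + (1 << shift) - 1) >> shift) definition:

    #include <assert.h>

    #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

    int main(void) {
      assert(SS(5, 0) == 5); /* UVSHIFT 0: no subsampling */
      assert(SS(5, 1) == 3); /* UVSHIFT 1 (4:2:2): 5 pixels round up to 3 */
      assert(SS(5, 2) == 2); /* UVSHIFT 2 (4:1:1): 5 pixels round up to 2 */
      return 0;
    }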
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) @@ -325,6 +444,15 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) +ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) +ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) +#endif #if defined(HAS_ARGBTORGB565ROW_AVX2) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) #endif @@ -332,6 +460,18 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif #if defined(HAS_J400TOARGBROW_SSE2) ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #endif @@ -372,9 +512,21 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 
4, 1, 31) #endif @@ -403,30 +555,57 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_NEON ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif @@ -434,23 +613,44 @@ ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif #ifdef HAS_UYVYTOYROW_NEON -ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -466,29 +666,38 @@ 
ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_NEON ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_NEON ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. -#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) @@ -506,61 +715,184 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. 
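In ANY11P the extra argument is opaque to the wrapper: it is forwarded untouched to both the full-width call and the remainder call. For the ARGBShuffleRow entries it is a pointer to byte-lane indices; a scalar sketch of that idea (hypothetical helper, one 4-byte map applied per pixel):

    #include <stdint.h>

    static void ShuffleRow_Sketch(const uint8_t* src, uint8_t* dst,
                                  const uint8_t* shuffler, int width) {
      int i, b;
      for (i = 0; i < width; ++i) {
        for (b = 0; b < 4; ++b) {
          dst[4 * i + b] = src[4 * i + shuffler[b]]; /* pick a source lane */
        }
      }
    }

    int main(void) {
      static const uint8_t kSwapRB[4] = {2, 1, 0, 3}; /* swap bytes 0 and 2 */
      uint8_t px[4] = {10, 20, 30, 40};
      uint8_t out[4];
      ShuffleRow_Sketch(px, out, kSwapRB, 1);
      return (out[0] == 30 && out[2] == 10) ? 0 : 1;
    }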
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - T shuffler, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, - const uint32, 4, 2, 3) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, + ARGBToRGB565DitherRow_SSE2, + const uint32_t, + 4, + 2, + 3) #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) -ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, + ARGBToRGB565DitherRow_AVX2, + const uint32_t, + 4, + 2, + 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) -ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, + ARGBToRGB565DitherRow_NEON, + const uint32_t, + 4, + 2, + 7) #endif -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) +ANY11P(ARGBToRGB565DitherRow_Any_MSA, + ARGBToRGB565DitherRow_MSA, + const uint32_t, + 4, + 2, + 7) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 -ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif #ifdef HAS_ARGBSHUFFLEROW_NEON -ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_MSA +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif #undef ANY11P +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
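The ANY11C instantiations below wrap the 16-to-8 and 8-to-16 bit converters, whose integer scale argument selects the effective source depth. A worked sketch of the 16-to-8 convention, assuming the (v * scale) >> 16 form of the C reference rows, where 10-bit input uses scale 16384 so the net effect is v >> 2 with clamping:

    #include <assert.h>
    #include <stdint.h>

    static uint8_t Convert16To8(uint16_t v, int scale) {
      int32_t out = ((int32_t)v * scale) >> 16;
      return (uint8_t)(out > 255 ? 255 : out);
    }

    int main(void) {
      assert(Convert16To8(1023, 16384) == 255); /* full-scale 10-bit -> 255 */
      assert(Convert16To8(512, 16384) == 128);  /* mid-scale -> 128 */
      assert(Convert16To8(4095, 16384) == 255); /* out-of-range input clamps */
      return 0;
    }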
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE temp[32]); \ + SIMD_ALIGNED(DTYPE out[32]); \ + memset(temp, 0, 32 * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, scale, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. +#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST temp[32]); \ + SIMD_ALIGNED(T out[32]); \ + memset(temp, 0, SBPP * 32); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, param, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) +#endif +#undef ANY11P16 + // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, 
yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -573,25 +905,28 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, \ - int source_y_fraction) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_INTERPOLATEROW_AVX2 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) @@ -602,25 +937,25 @@ ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #ifdef HAS_INTERPOLATEROW_NEON ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_DSPR2 -ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3) +#ifdef HAS_INTERPOLATEROW_MSA +ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #undef ANY11T // Any 1 to 1 mirror. 
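Mirroring reverses pixel order, so ANY11M below flips the usual split: the kernel runs on the bulk starting at src_ptr + r * BPP (skipping the first r pixels), the first r pixels go through the scratch buffer, and their mirrored results land at the end of the destination. A standalone sketch with a scalar kernel and MASK = 3 (hypothetical names):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    static void MirrorKernel(const uint8_t* src, uint8_t* dst, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst[i] = src[width - 1 - i];
      }
    }

    static void MirrorRow_Any(const uint8_t* src, uint8_t* dst, int width) {
      uint8_t temp[64 * 2];
      int r = width & 3;
      int n = width & ~3;
      memset(temp, 0, 64);
      if (n > 0) {
        MirrorKernel(src + r, dst, n); /* bulk skips the first r pixels */
      }
      memcpy(temp, src, r);
      MirrorKernel(temp, temp + 64, 4);        /* mirror one full group */
      memcpy(dst + n, temp + 64 + (4 - r), r); /* keep the last r outputs */
    }

    int main(void) {
      uint8_t in[6] = {0, 1, 2, 3, 4, 5};
      uint8_t out[6];
      int i;
      MirrorRow_Any(in, out, 6);
      for (i = 0; i < 6; ++i) {
        assert(out[i] == 5 - i);
      }
      return 0;
    }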
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ - } +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r* BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) @@ -631,6 +966,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -640,67 +978,54 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) #endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif #undef ANY11M // Any 1 plane. (memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8 temp[64]); \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ - } +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } #ifdef HAS_SETROW_X86 -ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) #endif #ifdef HAS_SETROW_NEON -ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_ARGBSETROW_NEON -ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) #endif #undef ANY1 // Any 1 to 2. Outputs UV planes. 
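The ANY12 wrapper below fans one source row out to two destination planes; DUVSHIFT only comes into play for the packed-YUV sources, where each output sample covers two input pixels. The scalar meaning of its simplest instantiation, SplitUVRow (deinterleave an NV12-style UV plane), is roughly:

    #include <assert.h>
    #include <stdint.h>

    static void SplitUV_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                               uint8_t* dst_v, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst_u[i] = src_uv[2 * i + 0];
        dst_v[i] = src_uv[2 * i + 1];
      }
    }

    int main(void) {
      const uint8_t uv[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint8_t u[4], v[4];
      SplitUV_Sketch(uv, u, v, 4);
      assert(u[0] == 1 && v[0] == 2 && u[3] == 7 && v[3] == 8);
      return 0;
    }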
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ - SIMD_ALIGNED(uint8 temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - /* repeat last 4 bytes for 422 subsampler */ \ - if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - /* repeat last 4 - 12 bytes for 411 subsampler */ \ - if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \ - } \ - if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \ - } \ - if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ - } +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } #ifdef HAS_SPLITUVROW_SSE2 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) @@ -711,8 +1036,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #ifdef HAS_SPLITUVROW_NEON ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #endif -#ifdef HAS_SPLITUVROW_DSPR2 -ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) +#ifdef HAS_SPLITUVROW_MSA +ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) #endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) @@ -727,37 +1052,66 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_YUY2TOUV422ROW_MSA +ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif #undef ANY12 +// Any 1 to 3. Outputs RGB planes. 
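The new ANY13 macro gives the remainder path three 16-byte output slots: the source tail is staged at temp, the kernel writes its three planes to temp + 48, temp + 64 and temp + 80, and the valid r bytes of each are copied out. A scalar sketch of what its SplitRGBRow instantiations compute (hypothetical stand-in; channel order as suggested by the argument names):

    #include <stdint.h>

    /* Deinterleave packed 3-byte pixels into three planes (BPP == 3). */
    static void SplitRGB_Sketch(const uint8_t* src_rgb, uint8_t* dst_r,
                                uint8_t* dst_g, uint8_t* dst_b, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst_r[i] = src_rgb[3 * i + 0];
        dst_g[i] = src_rgb[3 * i + 1];
        dst_b[i] = src_rgb[3 * i + 2];
      }
    }

    int main(void) {
      const uint8_t rgb[6] = {1, 2, 3, 4, 5, 6};
      uint8_t r[2], g[2], b[2];
      SplitRGB_Sketch(rgb, r, g, b, 2);
      return (r[1] == 4 && g[1] == 5 && b[1] == 6) ? 0 : 1;
    }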
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif + // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. -#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ - uint8* dst_u, uint8* dst_v, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ - } +#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) @@ -783,30 +1137,57 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_BGRATOUVROW_MSA 
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif
@@ -816,6 +1197,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #ifdef HAS_UYVYTOUVROW_NEON ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif #undef ANY12S #ifdef __cplusplus
diff --git a/libs/libvpx/third_party/libyuv/source/row_common.cc b/libs/libvpx/third_party/libyuv/source/row_common.cc index aefa38c495..2bbc5adbf1 100644 --- a/libs/libvpx/third_party/libyuv/source/row_common.cc +++ b/libs/libvpx/third_party/libyuv/source/row_common.cc
@@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include <stdio.h> #include <string.h>  // For memcpy and memset. #include "libyuv/basic_types.h"
@@ -23,59 +24,69 @@ extern "C" { #define USE_BRANCHLESS 1 #if USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +static __inline int32_t clamp0(int32_t v) { return ((-(v) >> 31) & (v)); } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (((255 - (v)) >> 31) | (v)) & 255; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { int m = v >> 31; return (v + m) ^ m; } -#else // USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +#else   // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { return (v < 0) ? 0 : v; } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (v > 255) ? 255 : v; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; } -static __inline uint32 Abs(int32 v) { return (v < 0) ?
-v : v; } #endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} -#ifdef LIBYUV_LITTLE_ENDIAN -#define WRITEWORD(p, v) *(uint32*)(p) = v +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v #else -static inline void WRITEWORD(uint8* p, uint32 v) { - p[0] = (uint8)(v & 255); - p[1] = (uint8)((v >> 8) & 255); - p[2] = (uint8)((v >> 16) & 255); - p[3] = (uint8)((v >> 24) & 255); +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); } #endif -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb24[0]; - uint8 g = src_rgb24[1]; - uint8 r = src_rgb24[2]; + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -85,12 +96,12 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -100,12 +111,12 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { } } -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_rgb24[0] = b; dst_rgb24[1] = g; dst_rgb24[2] = r; @@ -114,12 +125,14 @@ void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { } } -void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 2) | (g >> 4); dst_argb[2] = (r << 3) | (r >> 2); @@ -129,14 +142,15 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { } } -void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; - uint8 a = src_argb1555[1] >> 7; + uint8_t b = 
src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t a = src_argb1555[1] >> 7; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 3) | (g >> 2); dst_argb[2] = (r << 3) | (r >> 2); @@ -146,14 +160,15 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, } } -void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; - uint8 a = src_argb4444[1] >> 4; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + uint8_t a = src_argb4444[1] >> 4; dst_argb[0] = (b << 4) | b; dst_argb[1] = (g << 4) | g; dst_argb[2] = (r << 4) | r; @@ -163,12 +178,53 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, } } -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); + dst_argb += 4; + src_ar30 += 4; + } +} + +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. 
+ *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); + dst_abgr += 4; + src_ar30 += 4; + } +} + +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = ar30 & 0x3ff; + uint32_t ga = ar30 & 0xc00ffc00; + uint32_t r = (ar30 >> 20) & 0x3ff; + *(uint32_t*)(dst_ab30) = r | ga | (b << 20); + dst_ab30 += 4; + src_ar30 += 4; + } +} + +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = b; dst_rgb[1] = g; dst_rgb[2] = r;
@@ -177,12 +233,12 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = r; dst_rgb[1] = g; dst_rgb[2] = b;
@@ -191,25 +247,25 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 2; - uint8 r1 = src_argb[6] >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 2; + uint8_t r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } }
@@ -221,132 +277,160 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { // endian will not affect order of the original matrix. But the dither4 // will contain the first pixel in the lower byte for little endian // or the upper byte for big endian.
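A worked instance of the dither step implemented by the function below (values chosen for illustration; the byte indexing assumes little endian, as the comment above notes): each pixel adds its dither byte before the channel is clamped and truncated, which is what nudges quantization boundaries around.

    #include <assert.h>
    #include <stdint.h>

    static int clamp255(int v) { return v > 255 ? 255 : v; }

    int main(void) {
      const uint32_t dither4 = 0x02060004u; /* little-endian bytes: 4, 0, 6, 2 */
      const uint8_t* d = (const uint8_t*)&dither4;
      int g = 250;
      assert((g >> 2) == 62);                  /* undithered 6-bit green */
      assert((clamp255(g + d[0]) >> 2) == 63); /* pixel 0 rounds up instead */
      return 0;
    }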
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 3; - uint8 r1 = src_argb[6] >> 3; - uint8 a1 = src_argb[7] >> 7; - *(uint32*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - *(uint16*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); } } -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - uint8 b1 = src_argb[4] >> 4; - uint8 g1 = src_argb[5] >> 4; - uint8 r1 = src_argb[6] >> 
4; - uint8 a1 = src_argb[7] >> 4; - *(uint32*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - *(uint16*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); } } -static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } } -static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_argb += 4; + } +} + +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } -static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } -#define MAKEROWY(NAME, R, G, B, BPP) \ -void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ - src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ - uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ - src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ - uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ - src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; 
\ - uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ -} +// ARGBToY_C and ARGBToUV_C +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -381,64 +465,65 @@ MAKEROWY(RAW, 0, 1, 2, 3) // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 -static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; } -static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #define AVGB(a, b) (((a) + (b) + 1) >> 1) -#define MAKEROWYJ(NAME, R, G, B, BPP) \ -void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ -} +// ARGBToYJ_C and ARGBToUVJ_C +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; 
++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; b = (b << 3) | (b >> 2); g = (g << 2) | (g >> 4); r = (r << 3) | (r >> 2); @@ -448,12 +533,12 @@ void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { } } -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; b = (b << 3) | (b >> 2); g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -463,12 +548,12 @@ void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { } } -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; b = (b << 4) | b; g = (g << 4) | g; r = (r << 4) | r; @@ -478,26 +563,29 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { } } -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b1 = 
src_rgb565[2] & 0x1f; - uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8 r1 = src_rgb565[3] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b3 = next_rgb565[2] & 0x1f; - uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8 r3 = next_rgb565[3] >> 3; - uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b1 = src_rgb565[2] & 0x1f; + uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8_t r1 = src_rgb565[3] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b3 = next_rgb565[2] & 0x1f; + uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8_t r3 = next_rgb565[3] >> 3; + uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 787 -> 888. r = (r << 1) | (r >> 6); dst_u[0] = RGBToU(r, g, b); @@ -508,15 +596,15 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, dst_v += 1; } if (width & 1) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b = (b0 + b2); // 565 * 2 = 676. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b = (b0 + b2); // 565 * 2 = 676. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 676 -> 888 g = (g << 1) | (g >> 6); r = (r << 2) | (r >> 4); @@ -525,26 +613,29 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, } } -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b1 = src_argb1555[2] & 0x1f; - uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8 b3 = next_argb1555[2] & 0x1f; - uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; - uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. 
- uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b1 = src_argb1555[2] & 0x1f; + uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8_t b3 = next_argb1555[2] & 0x1f; + uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 777 -> 888. g = (g << 1) | (g >> 6); r = (r << 1) | (r >> 6); @@ -556,15 +647,15 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = next_argb1555[1] >> 3; - uint8 b = (b0 + b2); // 555 * 2 = 666. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = next_argb1555[1] >> 3; + uint8_t b = (b0 + b2); // 555 * 2 = 666. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -573,26 +664,29 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, } } -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b1 = src_argb4444[2] & 0x0f; - uint8 g1 = src_argb4444[2] >> 4; - uint8 r1 = src_argb4444[3] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b3 = next_argb4444[2] & 0x0f; - uint8 g3 = next_argb4444[2] >> 4; - uint8 r3 = next_argb4444[3] & 0x0f; - uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 = next_argb4444[2] >> 4; + uint8_t r3 = next_argb4444[3] & 0x0f; + uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. 
+ uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -604,15 +698,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b = (b0 + b2); // 444 * 2 = 555. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b = (b0 + b2); // 444 * 2 = 555. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 3) | (b >> 2); // 555 -> 888. g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -621,13 +715,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, } } -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; + uint8_t ab = src_argb[0]; + uint8_t ag = src_argb[1]; + uint8_t ar = src_argb[2]; dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); src_argb += 4; @@ -636,45 +732,10 @@ void ARGBToUV444Row_C(const uint8* src_argb, } } -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 16; - dst_u += 1; - dst_v += 1; - } - // Odd width handling mimics 'any' function which replicates last pixel. - if ((width & 3) == 3) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 2) { - uint8 ab = (src_argb[0] + src_argb[4]) >> 1; - uint8 ag = (src_argb[1] + src_argb[5]) >> 1; - uint8 ar = (src_argb[2] + src_argb[6]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 1) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } -} - -void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = src_argb[3]; dst_argb += 4; @@ -683,7 +744,7 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } // Convert a row of image to Sepia tone. 
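The *ToUVRow_C functions above all share one pattern: average a 2x2 block of pixels across two source rows (via AVGB or plain sums), widen the packed 5- or 6-bit channels back to 8 bits by replicating the high bits into the vacated low bits, then feed the averages to RGBToU/RGBToV. A minimal standalone sketch of that widening step, with hypothetical helper names that are not part of this patch:

#include <stdint.h>

/* Widen a 5-bit channel (0..31) to 8 bits by replicating the top
   bits into the low bits, so 0 maps to 0 and 31 maps to 255 exactly. */
static uint8_t Expand5To8(uint8_t v5) {
  return (uint8_t)((v5 << 3) | (v5 >> 2));
}

/* Same idea for the 6-bit green channel of RGB565: 63 maps to 255. */
static uint8_t Expand6To8(uint8_t v6) {
  return (uint8_t)((v6 << 2) | (v6 >> 4));
}

The (x << n) | (x >> (bits - n)) form is what the rows above use instead of the more obvious x * 255 / 31: it needs no divide and is still exact at both endpoints.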
-void ARGBSepiaRow_C(uint8* dst_argb, int width) { +void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -702,22 +763,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = src_argb[0]; int g = src_argb[1]; int r = src_argb[2]; int a = src_argb[3]; - int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 6; - int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 6; - int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 6; - int sa = (b * matrix_argb[12] + g * matrix_argb[13] + - r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); @@ -728,7 +795,9 @@ void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, } // Apply color table to a row of image. -void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -744,7 +813,9 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } // Apply color table to a row of image. 
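ARGBColorMatrixRow_C above treats matrix_argb as a 4x4 matrix of signed 2.6 fixed-point coefficients: 64 represents 1.0, and the >> 6 after each dot product drops the fractional bits unrounded, per the TODO. As a hedged illustration (the include path is an assumption), an identity matrix passes pixels through unchanged, since b * 64 >> 6 == b:

#include <stdint.h>
#include "libyuv/row.h" /* declares ARGBColorMatrixRow_C; path assumed */

/* 2.6 fixed point: 64 == 1.0. Each row of four coefficients weights
   (B, G, R, A) for one output channel, in B, G, R, A output order. */
static const int8_t kIdentityMatrix[16] = {
    64, 0, 0, 0, /* B' = 1.0 * B */
    0, 64, 0, 0, /* G' = 1.0 * G */
    0, 0, 64, 0, /* R' = 1.0 * R */
    0, 0, 0, 64, /* A' = 1.0 * A */
};

void CopyRowViaColorMatrix(const uint8_t* src, uint8_t* dst, int width) {
  ARGBColorMatrixRow_C(src, dst, kIdentityMatrix, width);
}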
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -757,8 +828,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -772,21 +846,23 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, } #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 24 +#define SHADE(f, v) v* f >> 24 -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - const uint32 b_scale = REPEAT8(value & 0xff); - const uint32 g_scale = REPEAT8((value >> 8) & 0xff); - const uint32 r_scale = REPEAT8((value >> 16) & 0xff); - const uint32 a_scale = REPEAT8(value >> 24); +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb[0]); - const uint32 g = REPEAT8(src_argb[1]); - const uint32 r = REPEAT8(src_argb[2]); - const uint32 a = REPEAT8(src_argb[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -799,20 +875,22 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, #undef SHADE #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 16 +#define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb0[0]); - const uint32 g = REPEAT8(src_argb0[1]); - const uint32 r = REPEAT8(src_argb0[2]); - const uint32 a = REPEAT8(src_argb0[3]); - const uint32 b_scale = src_argb1[0]; - const uint32 g_scale = src_argb1[1]; - const uint32 r_scale = src_argb1[2]; - const uint32 a_scale = src_argb1[3]; + const uint32_t b = REPEAT8(src_argb0[0]); + const uint32_t g = REPEAT8(src_argb0[1]); + const uint32_t r = REPEAT8(src_argb0[2]); + const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -827,8 +905,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -852,8 +932,10 @@ void 
ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -876,8 +958,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, #undef SHADE // Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width) { +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i]; @@ -890,12 +975,14 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobelx[i] = (uint8)(clamp255(sobel)); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); } } -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i + 0]; @@ -908,56 +995,62 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobely[i] = (uint8)(clamp255(sobel)); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); } } -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_argb[0] = (uint8)(s); - dst_argb[1] = (uint8)(s); - dst_argb[2] = (uint8)(s); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_y[i] = (uint8)(s); + dst_y[i] = (uint8_t)(s); } } -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int g = clamp255(r + b); - dst_argb[0] = (uint8)(b); - dst_argb[1] = (uint8)(g); - dst_argb[2] = (uint8)(r); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Copy a Y to RGB. 
int x; for (x = 0; x < width; ++x) { - uint8 y = src_y[0]; + uint8_t y = src_y[0]; dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = 255u; dst_argb += 4; @@ -974,75 +1067,69 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // B = (Y - 16) * 1.164 - U * -2.018 // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // U and V contributions to R,G,B. #define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ #define VR -102 /* round(-1.596 * 64) */ // Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) // 64 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) // 32 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, 
BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1062,74 +1149,68 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { // Y contribution to R,G,B. Scale and bias. #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YGB 32 /* 64 / 2 */ // U and V contributions to R,G,B. #define UB -113 /* round(-1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR -90 /* round(-1.40200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ // Bias values to round, and subtract 128 from U and V. 
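These constants follow the file-wide fixed-point scheme: coefficients are scaled by 64, Y is pre-scaled by 0x0101 so one 16-bit multiply covers both bytes, and each channel is recovered with a final >> 6. A self-contained sanity check (illustrative only, constants copied from the full-range JPEG defines above) that mid-gray survives the round trip:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* JPEG constants from above: YG = 16320, YGB = 32, UB = -113. */
  uint32_t y1 = (uint32_t)(128 * 0x0101 * 16320) >> 16; /* == 8191 */
  int bb = -113 * 128 + 32;                             /* BB == -14432 */
  int b = (-(128 * -113) + (int)y1 + bb) >> 6;          /* 8223 >> 6 */
  assert(b == 128); /* Y = U = V = 128 in, B = 128 out */
  return 0;
}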
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { VR, 
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1143,81 +1224,76 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { #undef YG // BT.709 YUV to RGB reference -// * R = Y - V * -1.28033 -// * G = Y - U * 0.21482 - V * 0.38059 -// * B = Y - U * -2.12798 +// R = (Y - 16) * 1.164 - V * -1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 - U * -2.112 +// See also http://www.equasys.de/colorconversion.html // Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -// TODO(fbarchard): Find way to express 2.12 instead of 2.0. +// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.12798 * 64)) */ -#define UG 14 /* round(0.21482 * 64) */ -#define VG 24 /* round(0.38059 * 64) */ -#define VR -82 /* round(-1.28033 * 64) */ +#define UB -128 /* max(-128, round(-2.112 * 64)) */ +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR -115 /* round(-1.793 * 64) */ // Bias values to round, and subtract 128 from U and V. 
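The replacement coefficients check out against the new reference equations: round(0.213 * 64) = round(13.6) = 14, round(0.533 * 64) = round(34.1) = 34, and round(-1.793 * 64) = round(-114.8) = -115. Only the U-to-B term does not fit: round(-2.112 * 64) = -135 is below the int8 minimum and saturates to -128 — the approximation the TODO above refers to, and the same compromise already made for the -2.018 term in the BT.601 limited-range table.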
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { VR, 
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1231,8 +1307,14 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #undef YG // C reference code that mimics the YUV assembly. -static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r, +// Reads 8 bit YUV and leaves result as 16 bit. + +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; @@ -1263,22 +1345,129 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, int yg = yuvconstants->kYToRgb[0]; #endif - uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6); - *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32) (-(v * vr) + y1 + br) >> 6); + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); + *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); +} + +// Reads 8 bit YUV and leaves result as 16 bit. 
+static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); } // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // C reference code that mimics the YUV assembly. 
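All three helpers leave channels on the same 2.6 fixed-point scale (roughly an 8-bit level times 64). YuvPixel clamps to 8 bits immediately; YuvPixel8_16 and YuvPixel16 defer the truncation so callers can keep extra precision. The two truncations used later in this file, sketched with hypothetical names:

#include <stdint.h>

/* Input: a channel with 6 fractional bits (level * 64), as produced
   by YuvPixel16 / YuvPixel8_16 above. */
static uint8_t To8Bit(int v) { /* what YuvPixel10 does */
  v >>= 6; /* drop all 6 fractional bits */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static uint16_t To10Bit(int v) { /* what StoreAR30 does further down */
  v >>= 4; /* keep 2 of the fractional bits: 8.6 -> 10 bits */
  return (uint16_t)(v < 0 ? 0 : (v > 1023 ? 1023 : v));
}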
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32)(y1 + YGB) >> 6); - *g = Clamp((int32)(y1 + YGB) >> 6); - *r = Clamp((int32)(y1 + YGB) >> 6); +static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { + uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32_t)(y1 + YGB) >> 6); + *g = Clamp((int32_t)(y1 + YGB) >> 6); + *r = Clamp((int32_t)(y1 + YGB) >> 6); } #undef YG @@ -1288,16 +1477,16 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. // TODO(fbarchard): Remove subsampling from Neon. -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 u = (src_u[0] + src_u[1] + 1) >> 1; - uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; + uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; @@ -1310,22 +1499,22 @@ void I444ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } #else -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1336,19 +1525,19 @@ void I444ToARGBRow_C(const uint8* src_y, #endif // Also used for 420 -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1356,26 +1545,120 @@ void I422ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void I422AlphaToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* rgb_buf, +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; @@ -1384,47 +1667,47 @@ void I422AlphaToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } -void I422ToRGB24Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } -void I422ToARGB4444Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1435,8 +1718,8 @@ void I422ToARGB4444Row_C(const uint8* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; src_y += 2; src_u += 1; src_v += 1; @@ -1447,23 +1730,22 @@ void I422ToARGB4444Row_C(const uint8* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - 0xf000; + *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; } } -void I422ToARGB1555Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1474,8 +1756,8 @@ void I422ToARGB1555Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; src_y += 2; src_u += 1; src_v += 1; @@ -1486,23 +1768,22 @@ void I422ToARGB1555Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - 0x8000; + *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; } } -void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1513,8 +1794,8 @@ void I422ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_u += 1; src_v += 1; @@ -1525,111 +1806,111 @@ void I422ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 
>> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - YuvPixel(src_y[2], src_u[0], src_v[0], - rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants); - rgb_buf[11] = 255; - YuvPixel(src_y[3], src_u[0], src_v[0], - rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants); - rgb_buf[15] = 255; - src_y += 4; - src_u += 1; - src_v += 1; - rgb_buf += 16; // Advance 4 pixels. - } - if (width & 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void NV21ToARGBRow_C(const uint8* src_y, - const uint8* src_vu, - uint8* rgb_buf, +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_vu += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); @@ -1640,8 +1921,8 @@ void NV12ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -1651,67 +1932,67 @@ void NV12ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void YUY2ToARGBRow_C(const uint8* src_yuy2, - uint8* rgb_buf, +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void UYVYToARGBRow_C(const uint8* src_uyvy, - uint8* rgb_buf, +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void I422ToRGBARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -1719,13 +2000,13 @@ void I422ToRGBARow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } -void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); @@ -1741,7 +2022,7 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { } } -void MirrorRow_C(const uint8* src, uint8* dst, int width) { +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; src += width - 1; for (x = 0; x < width - 1; x += 2) { @@ -1754,7 +2035,10 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -1770,10 +2054,10 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; - const uint32* src32 = (const uint32*)(src); - uint32* dst32 = (uint32*)(dst); + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); src32 += width - 1; for (x = 0; x < width - 1; x += 2) { dst32[x] = src32[0]; @@ -1785,7 +2069,10 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; @@ -1800,7 +2087,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -1816,20 +2105,110 @@ void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, } } -void CopyRow_C(const uint8* src, uint8* dst, int count) { +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + 
uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = clamp255((src_y[x] * scale) >> 16); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } -void CopyRow_16_C(const uint16* src, uint16* dst, int count) { +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { memcpy(dst, src, count * 2); } -void SetRow_C(uint8* dst, uint8 v8, int width) { +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { memset(dst, v8, width); } -void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { - uint32* d = (uint32*)(dst_argb); +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + uint32_t* d = (uint32_t*)(dst_argb); int x; for (x = 0; x < width; ++x) { d[x] = v32; @@ -1837,8 +2216,11 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { } // Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values, filtering 2 rows of YUY2. int x; for (x = 0; x < width; x += 2) { @@ -1851,8 +2233,10 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, } // Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1865,7 +2249,7 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2, } // Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1879,8 +2263,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } // Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1893,8 +2280,10 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, } // Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. 
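// Note on Convert8To16Row_C above: scale *= 0x0101 replicates the byte,
// since v * 0x0101 == (v << 8) | v, so a single multiply plus >> 16
// expands 8-bit samples to the target depth without a divide. Worked
// example for 10 bits (scale = 1024, multiplier 1024 * 257):
//   255 * (1024 * 257) >> 16 == 1023   (8-bit max -> 10-bit max)
// Convert16To8Row_C is the inverse: scale = 4096 makes
// (v * 4096) >> 16 == v >> 4, mapping 12-bit samples back to 8 bits,
// with clamp255() catching out-of-range inputs.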
int x; for (x = 0; x < width; x += 2) { @@ -1907,7 +2296,7 @@ void UYVYToUV422Row_C(const uint8* src_uyvy, } // Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1925,17 +2314,19 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1958,13 +2349,13 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } if (width & 1) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1973,9 +2364,12 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } #undef BLEND -#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst[0] = UBLEND(src0[0], src1[0], alpha[0]); @@ -1995,13 +2389,13 @@ void BlendPlaneRow_C(const uint8* src0, const uint8* src1, // Multiply source RGB by alpha and store to destination. // This code mimics the SSSE3 version for better testability. 
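// A scalar sketch of the premultiply that follows, assuming the
// conventional rounding c' = (c * a + 127) / 255; the ATTENUATE macro
// used below (defined earlier in this file, outside this hunk)
// approximates the same quantity with a multiply and shift.
// Premultiplying is what lets the BLEND macro above treat the foreground
// as already scaled by alpha, one multiply-add per channel:
//   static inline uint8_t AttenuatePixelSketch(uint8_t c, uint8_t a) {
//     return (uint8_t)(((uint32_t)c * a + 127) / 255);
//   }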
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width - 1; i += 2) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - uint32 a = src_argb[3]; + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2019,10 +2413,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } if (width & 1) { - const uint32 b = src_argb[0]; - const uint32 g = src_argb[1]; - const uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2038,49 +2432,56 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Reciprocal method is off by 1 on some values. ie 125 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. #define T(a) 0x01000000 + (0x10000 / a) -const uint32 fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), - T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), - T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), - T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), - T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), - T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), - T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), - T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), - T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), - T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), - T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), - T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), - T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), - T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), - T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), - T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), - T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), - T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), - T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), - T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), - T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), - T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), - T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), - T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), - T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), - T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), - T(0xf0), T(0xf1), 
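// Note on this table: T(a) packs 1.0 in the upper 16 bits and the 8.8
// fixed-point reciprocal 0x10000 / a in the lower 16.
// ARGBUnattenuateRow_C below masks off the low half and computes
// (c * ia) >> 8, i.e. roughly c * 256 / a. Worked example: a = 128 gives
// ia = 0x10000 / 128 = 512 (2.0 in 8.8), so each premultiplied channel
// doubles: (127 * 512) >> 8 == 254, where exact 127 * 255 / 128 rounds
// to 253, the off-by-one noted above.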
T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), - T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T -void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; - const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point b = (b * ia) >> 8; g = (g * ia) >> 8; r = (r * ia) >> 8; @@ -2094,31 +2495,37 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { - int32 row_sum[4] = {0, 0, 0, 0}; +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + 
int width) { + int32_t row_sum[4] = {0, 0, 0, 0}; int x; for (x = 0; x < width; ++x) { row_sum[0] += row[x * 4 + 0]; row_sum[1] += row[x * 4 + 1]; row_sum[2] += row[x * 4 + 2]; row_sum[3] += row[x * 4 + 3]; - cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; - cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; - cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; - cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; } } -void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, - int w, int area, uint8* dst, int count) { +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count) { float ooa = 1.0f / area; int i; for (i = 0; i < count; ++i) { - dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); dst += 4; tl += 4; bl += 4; @@ -2127,8 +2534,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, // Copy pixels from rotated source to destination row with a slope. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { int i; // Render a row of pixels from source into a buffer. float uv[2]; @@ -2137,9 +2547,8 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, for (i = 0; i < width; ++i) { int x = (int)(uv[0]); int y = (int)(uv[1]); - *(uint32*)(dst_argb) = - *(const uint32*)(src_argb + y * src_argb_stride + - x * 4); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); dst_argb += 4; uv[0] += uv_dudv[2]; uv[1] += uv_dudv[3]; @@ -2147,16 +2556,20 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // Blend 2 rows into 1. -static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride, - uint8* dst_uv, int width) { +static void HalfRow_C(const uint8_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, - uint16* dst_uv, int width) { +static void HalfRow_16_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint16_t* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; @@ -2164,12 +2577,14 @@ static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, } // C version 2x2 -> 2x1. 
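// The per-pixel blend in InterpolateRow_C below, as a scalar sketch;
// source_y_fraction runs 0..256 with y0_fraction + y1_fraction == 256,
// and the +128 rounding term is an assumption here, since the loop body
// falls outside this hunk:
//   dst[x] = (src[x] * y0_fraction + src[x + stride] * y1_fraction + 128) >> 8;
// A fraction of 0 short-circuits to memcpy(), and 128 averages the rows.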
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr1 = src_ptr + src_stride; int x; if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); @@ -2194,12 +2609,14 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } } -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint16* src_ptr1 = src_ptr + src_stride; + const uint16_t* src_ptr1 = src_ptr + src_stride; int x; if (source_y_fraction == 0) { memcpy(dst_ptr, src_ptr, width * 2); @@ -2222,8 +2639,10 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; int index2 = shuffler[2]; @@ -2232,10 +2651,10 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, int x; for (x = 0; x < width; ++x) { // To support in-place conversion. - uint8 b = src_argb[index0]; - uint8 g = src_argb[index1]; - uint8 r = src_argb[index2]; - uint8 a = src_argb[index3]; + uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -2245,10 +2664,11 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, } } -void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_y[0]; @@ -2268,10 +2688,11 @@ void I422ToYUY2Row_C(const uint8* src_y, } } -void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_u[0]; @@ -2291,9 +2712,8 @@ void I422ToUYVYRow_C(const uint8* src_y, } } - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { int i; @@ -2323,33 +2743,75 @@ void ARGBPolynomialRow_C(const uint8* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32)(db)); - dst_argb[1] = Clamp((int32)(dg)); - dst_argb[2] = Clamp((int32)(dr)); - dst_argb[3] = Clamp((int32)(da)); + dst_argb[0] = Clamp((int32_t)(db)); + dst_argb[1] = Clamp((int32_t)(dg)); + dst_argb[2] = Clamp((int32_t)(dr)); + dst_argb[3] = Clamp((int32_t)(da)); src_argb += 4; dst_argb += 4; } } -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { - uint32 bc = 
lumacoeff & 0xff; - uint32 gc = (lumacoeff >> 8) & 0xff; - uint32 rc = (lumacoeff >> 16) & 0xff; +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjust the source integer range to the half float range desired. + +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. + +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; + } +} + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; int i; for (i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; - const uint8* luma1; + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + const uint8_t* luma1; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; - luma1 = ((src_argb[4] * bc + src_argb[5] * gc + - src_argb[6] * rc) & 0x7F00u) + luma; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; dst_argb[4] = luma1[src_argb[4]]; dst_argb[5] = luma1[src_argb[5]]; dst_argb[6] = luma1[src_argb[6]]; @@ -2359,8 +2821,9 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, } if (width & 1) { // Luminance in rows, color values in columns. 
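// Note on HalfFloatRow_C above: 1.9259299444e-34f is 2^-112, and
// multiplying by it subtracts 112 from the float32 exponent, exactly the
// difference between the float32 bias (127) and the float16 bias (15);
// the >> 13 then drops the extra 23 - 10 mantissa bits, leaving the
// half-float encoding in the low 16 bits. Worked example, assuming
// scale = 1.0f / 1024 and a 10-bit sample of 1024:
//   value = 1.0f * 2^-112  ->  bits 0x07800000  ->  >> 13  ->  0x3C00,
// which is 1.0 in IEEE half precision.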
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; @@ -2368,7 +2831,7 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, } } -void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[3]; @@ -2381,7 +2844,7 @@ void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { } } -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst_a[0] = src_argb[3]; @@ -2394,7 +2857,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { } } -void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[0]; @@ -2413,13 +2876,13 @@ void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { #if !(defined(_MSC_VER) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. -void I422ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2434,14 +2897,14 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_SSSE3) -void I422ToARGB1555Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2456,14 +2919,14 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_SSSE3) -void I422ToARGB4444Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2478,13 +2941,13 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); @@ -2497,14 +2960,102 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, } #endif -#if defined(HAS_I422TORGB565ROW_AVX2) -void I422ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +#if defined(HAS_NV12TORGB24ROW_SSSE3) +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_SSSE3) +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_AVX2) +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_AVX2) +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2523,14 +3074,14 @@ void I422ToRGB565Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_AVX2) -void I422ToARGB1555Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2549,14 +3100,14 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_AVX2) -void I422ToARGB4444Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2575,19 +3126,22 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TORGB24ROW_AVX2) -void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); - // TODO(fbarchard): ARGBToRGB24Row_AVX2 +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2598,13 +3152,13 @@ void I422ToRGB24Row_AVX2(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. 
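// Note on the wrapper pattern used throughout this block: each of these
// SSSE3/AVX2 rows converts YUV to ARGB into a stack buffer of at most
// MAXTWIDTH pixels, then repacks that ARGB to the narrower destination
// format, walking the row in MAXTWIDTH-wide tiles so the intermediate
// buffer stays small and cache-resident:
//   while (width > 0) { twidth = min(width, MAXTWIDTH); convert; repack; }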
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); @@ -2621,6 +3175,62 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, } #endif +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; +#if defined(__clang__) +#pragma clang loop vectorize_width(4) +#endif + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/libs/libvpx/third_party/libyuv/source/row_gcc.cc b/libs/libvpx/third_party/libyuv/source/row_gcc.cc index 1ac7ef1aa3..8d3cb81cec 100644 --- a/libs/libvpx/third_party/libyuv/source/row_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/row_gcc.cc @@ -1,4 +1,3 @@ -// VERSION 2 /* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * @@ -23,1663 +22,2001 @@ extern "C" { #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -static vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. 
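// Note on the coefficient layout: kARGBToY above is approximately the
// studio-range BT.601 luma weights at 1/128 scale (33/128 ~ 0.258 for R,
// 65/128 ~ 0.508 for G, 13/128 ~ 0.102 for B), with kAddY16 further down
// supplying the +16 offset. The JPEG variant that follows is the
// full-range 0.299 R + 0.587 G + 0.114 B: 38/128 ~ 0.297, 75/128 ~ 0.586,
// 15/128 ~ 0.117, and no +16 offset.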
-static vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -static vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; -static vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; -static vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // Constants for BGRA -static vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR -static vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. 
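// i.e. kAddYJ64 is presumably added before the >> 7 in the JPEG luma
// path so the dot product rounds to nearest instead of truncating,
// y = (dot + 64) >> 7; the consuming code sits outside this hunk.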
-static vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -static uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. -static uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. -static uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. 
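// A YUY2 macropixel stores 4 bytes for 2 pixels, Y0,U,Y1,V, with the U/V
// pair shared by both pixels. kShuffleYUY2Y above replicates each Y byte
// into both halves of a 16-bit lane (indices 0,0,2,2,...), and the UV
// mask next extracts the shared U,V once per pixel (indices 1,3,1,3,...),
// so 16 packed bytes expand into the YY / UV operand layout the SIMD
// converters multiply against. The UYVY tables below are the same idea
// with the byte order swapped (U,Y0,V,Y1).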
-static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" 
- "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "movdqa %3,%%xmm3 \n" - 
"movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n" - "lea " MEMLEA(0x18,0) ",%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + 
"por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa 
%%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) - MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void 
ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +#ifdef HAS_ARGBTORGB24ROW_AVX2 +// vpermd for 12+12 to 24 +static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; + +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" 
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI +// Shuffle table for converting ARGBToRGB24 +static const ulvec8 kPermARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, + 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, + 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; +static const ulvec8 kPermARGBToRGB24_1 = { + 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, + 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, + 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; +static const ulvec8 kPermARGBToRGB24_2 = { + 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, + 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, + 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kPermARGBToRGB24_0), // %3 + "m"(kPermARGBToRGB24_1), // %4 + "m"(kPermARGBToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORAWROW_AVX2 +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub 
$0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - 
"vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" -void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - 
"+r"(width) // %2 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } -void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_RGB24TOARGBROW_SSSE3 +/* + +ARGBToAR30Row: + +Red Blue +With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ + +// Shuffle table for converting RAW to RGB24. Last 8. 
+// Shuffle tables to place the B and R (or R and B) bytes of each pixel into
+// the high byte of a 16 bit lane, ready for the pmulhuw scaling described
+// above.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
+                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u,  128u, 6u,  128u, 4u,
+                                   128u, 10u, 128u, 8u,  128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm2                       \n"  // shuffler for RB
+      "movd      %4,%%xmm3                       \n"  // multiplier for RB
+      "movd      %5,%%xmm4                       \n"  // mask for R10 B10
+      "movd      %6,%%xmm5                       \n"  // mask for AG
+      "movd      %7,%%xmm6                       \n"  // multiplier for AG
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+      "sub       %0,%1                           \n"
+
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"  // fetch 4 ARGB pixels
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pshufb    %%xmm2,%%xmm1                   \n"  // R0B0
+      "pand      %%xmm5,%%xmm0                   \n"  // A0G0
+      "pmulhuw   %%xmm3,%%xmm1                   \n"  // X2 R16 X4  B10
+      "pmulhuw   %%xmm6,%%xmm0                   \n"  // X10 A2 X10 G10
+      "pand      %%xmm4,%%xmm1                   \n"  // X2 R10 X10 B10
+      "pslld     $10,%%xmm0                      \n"  // A2 x10 G10 x10
+      "por       %%xmm1,%%xmm0                   \n"  // A2 R10 G10 B10
+      "movdqu    %%xmm0,(%1,%0)                  \n"  // store 4 AR30 pixels
+      "add       $0x10,%0                        \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm2                       \n"  // shuffler for RB
+      "movd      %4,%%xmm3                       \n"  // multiplier for RB
+      "movd      %5,%%xmm4                       \n"  // mask for R10 B10
+      "movd      %6,%%xmm5                       \n"  // mask for AG
+      "movd      %7,%%xmm6                       \n"  // multiplier for AG
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+      "sub       %0,%1                           \n"
+
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"  // fetch 4 ABGR pixels
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pshufb    %%xmm2,%%xmm1                   \n"  // R0B0
+      "pand      %%xmm5,%%xmm0                   \n"  // A0G0
+      "pmulhuw   %%xmm3,%%xmm1                   \n"  // X2 R16 X4  B10
+      "pmulhuw   %%xmm6,%%xmm0                   \n"  // X10 A2 X10 G10
+      "pand      %%xmm4,%%xmm1                   \n"  // X2 R10 X10 B10
+      "pslld     $10,%%xmm0                      \n"  // A2 x10 G10 x10
+      "por       %%xmm1,%%xmm0                   \n"  // A2 R10 G10 B10
+      "movdqu    %%xmm0,(%1,%0)                  \n"  // store 4 AR30 pixels
+      "add       $0x10,%0                        \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(kShuffleBR30),  // %3 reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
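One idiom in the two functions above (and their AVX2 twins below) deserves a
note: after "sub %0,%1", operand %1 no longer holds dst but dst - src, so the
store address "(%1,%0)" resolves back to the current output position while
only the source pointer has to be advanced each iteration. A rough C
rendering of that loop shape, with illustrative names (the real bookkeeping
lives in the registers bound to %0..%2):

    #include <stdint.h>

    // Sketch of the single-counter loop used by the AR30 row functions.
    static void AR30RowLoopShape(const uint8_t* src, uint8_t* dst, int width) {
      uintptr_t s = (uintptr_t)src;
      uintptr_t d_minus_s = (uintptr_t)dst - s;    // "sub %0,%1"
      while (width > 0) {
        uint8_t* out = (uint8_t*)(d_minus_s + s);  // "(%1,%0)"
        (void)out;    // 4 pixels would be converted and stored at out here
        s += 16;      // "add $0x10,%0": one add moves source and, implicitly, dest
        width -= 4;   // "sub $0x4,%2"
      }
    }

Converters whose output advances at a different rate than their input, such
as RGB565ToARGBRow_SSE2 earlier in this file, subtract the source twice and
store through a scaled index like "(%1,%0,2)" instead.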
B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ABGRTOAR30ROW_AVX2 +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYJROW_SSSE3 #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. 
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + 
"psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_AVX2 // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", 
"xmm5", "xmm6", "xmm7" - ); + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - 
"m"(kAddUVJ128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kAddUVJ128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" 
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6" - ); + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 
0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); } #endif // HAS_ARGBTOUV444ROW_SSSE3 -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps 
$0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 
0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps 
$0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps 
$0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 -#define READYUV444 \ - "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq 
(%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +// TODO(fbarchard): Consider shufb to replace pack/unpack +// TODO(fbarchard): Consider pmulhuw to replace psraw +// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. +#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" - -// Read 2 UV from 411, upsample to 8 UV. -// reading 4 bytes is an msan violation. -// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" -// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) -// pinsrw fails with drmemory -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_TEMP \ - "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ - "movd %[temp],%%xmm0 \n" \ - MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ - "movd %[temp],%%xmm1 \n" \ - "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" // Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. 
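Not part of the upstream patch: READYUV210 above is the one reader that consumes 16-bit samples, so it narrows 10-bit UV into the 8-bit lanes every other reader produces, and widens 10-bit Y so the later pmulhuw sees the same 16-bit scale as 8-bit Y duplicated into both bytes of a word. A minimal scalar sketch of that arithmetic, assuming I210-style samples in the low 10 bits of a uint16_t (helper names are illustrative, not libyuv API):

#include <stdint.h>

/* Sketch of READYUV210's per-sample arithmetic (hypothetical helpers). */
static inline uint8_t Narrow10BitUV(uint16_t uv10) {
  int v = uv10 >> 2;                    /* psraw $0x2: 10-bit -> 8-bit range */
  return (uint8_t)(v > 255 ? 255 : v);  /* packuswb saturates to 0..255 */
}

static inline uint16_t Widen10BitY(uint16_t y10) {
  return (uint16_t)(y10 << 6);  /* psllw $0x6: full scale 1023 -> 65472, the */
}                               /* same scale as an 8-bit Y times 0x0101 */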
-#define READYUY2 \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. -#define READUYVY \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ - "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ - "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ - "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ - "psubw 
%%xmm1,%%xmm0 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ - "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS #endif +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + // Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ - "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ - "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" // Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ - "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ - "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" -void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +// Store 8 AR30 values. 
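Not part of the upstream patch: AR30 is little-endian 2:10:10:10 ARGB, i.e. two alpha bits in bits 30..31, then 10-bit R, G and B fields down to bit 0. The STOREAR30 macro below builds it from the 16-bit fixed-point B/G/R words that YUVTORGB16 leaves in xmm0/xmm1/xmm2. A scalar sketch of one pixel (hypothetical helper, not libyuv API):

#include <stdint.h>

/* Scalar equivalent of STOREAR30 for a single pixel. */
static inline uint32_t PackAR30(int16_t b16, int16_t g16, int16_t r16) {
  int b = b16 >> 4;  /* psraw $0x4: keep 10 significant bits instead of */
  int g = g16 >> 4;  /* the 8 that the ARGB path keeps with psraw $0x6 */
  int r = r16 >> 4;
  b = b < 0 ? 0 : b > 1023 ? 1023 : b;  /* pmaxsw/pminsw: clamp to 0..1023 */
  g = g < 0 ? 0 : g > 1023 ? 1023 : g;
  r = r < 0 ? 0 : r > 1023 ? 1023 : r;
  return 0xC0000000u |                  /* opaque 2-bit alpha */
         ((uint32_t)r << 20) | ((uint32_t)g << 10) | (uint32_t)b;
}

The vector code reaches the same layout with word interleaves: psllw $0x4 pre-positions R so that punpcklwd with B yields B | R<<20 per dword, and the 0x0030 words prepared in xmm5 ride along with G so that pslld $0xa turns them into the 0xC0000000 alpha bits.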
+#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB @@ -1691,15 +2028,15 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1707,8 +2044,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) "punpcklbw %%xmm1,%%xmm0 \n" @@ -1719,16 +2057,16 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" - "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] @@ -1736,23 +2074,24 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } -void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" 
+ "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB @@ -1764,24 +2103,125 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + LABELALIGN - "1: \n" + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + 
YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB @@ -1792,64 +2232,31 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_SSSE3 -#ifdef HAS_I411TOARGBROW_SSSE3 -void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - int temp; + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV411_TEMP - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [temp]"=&r"(temp), // %[temp] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif -void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB @@ -1860,21 +2267,24 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB @@ -1886,20 +2296,23 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. 
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB @@ -1911,20 +2324,23 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB @@ -1936,23 +2352,25 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA @@ -1964,7 +2382,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1972,179 +2390,211 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. 
-#define READYUV422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ - "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 \ - "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" // Read 8 UV from NV12, upsample to 16 UV. 
-#define READNV12_AVX2 \ - "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
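Not part of the upstream patch: READYUY2/READUYVY and their AVX2 variants here load the same packed bytes twice and split them with two pshufb masks: one duplicates each Y into both bytes of a word (the y * 0x0101 form the Y-gain multiply expects), the other emits each 2-pixel group's U,V pair twice so both pixels share their chroma. A sketch of how such masks could be derived for YUY2 ordering (Y0 U0 Y1 V0 ...); the values are assumptions consistent with that layout, and the real tables are the kShuffleYUY2Y/kShuffleYUY2UV constants these functions reference:

#include <stdint.h>

/* Illustrative construction of the two pshufb masks for 16 YUY2 bytes. */
static void BuildYuy2Masks(uint8_t shuffle_y[16], uint8_t shuffle_uv[16]) {
  for (int i = 0; i < 8; ++i) {  /* Y bytes sit at even offsets */
    shuffle_y[2 * i] = shuffle_y[2 * i + 1] = (uint8_t)(2 * i);
  }
  for (int p = 0; p < 4; ++p) {  /* per 2-pixel group, copy U,V twice */
    uint8_t u = (uint8_t)(4 * p + 1);
    uint8_t v = (uint8_t)(4 * p + 3);
    shuffle_uv[4 * p + 0] = u;
    shuffle_uv[4 * p + 1] = v;
    shuffle_uv[4 * p + 2] = u;
    shuffle_uv[4 * p + 3] = v;
  }
}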
-#define READUYVY_AVX2 \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ - "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ - "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ - "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ - "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ - "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ - "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + #else // Convert 16 pixels: 16 UV and 16 Y. 
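Not part of the upstream patch: in scalar terms both YUVTORGB16 flavors evaluate the usual fixed-point YUV-to-RGB matrix; the pmaddubsw coefficient rows sit at offsets 0/32/64 of struct YuvConstants, the per-channel biases at 96/128/160, and the Y gain at 192, as the loads above show. A rough per-pixel sketch with BT.601 studio-swing numbers; the constant names and values are illustrative assumptions, not libyuv identifiers:

#include <stdint.h>

enum { UB = 128, UG = 25, VG = 52, VR = 102, YG = 18997, YGB = 1160 };

static inline int16_t Sat16(int v) {  /* paddsw saturates the final adds */
  return (int16_t)(v < -32768 ? -32768 : v > 32767 ? 32767 : v);
}

/* Sketch of YUVTORGB16: 16-bit results with 6 fraction bits. YUVTORGB
 * then shifts right by 6 and packs to bytes; STOREAR30 shifts by 4 to
 * keep 10-bit channels instead. */
static inline void YuvToRgb16(uint8_t y, uint8_t u, uint8_t v,
                              int16_t* b, int16_t* g, int16_t* r) {
  /* pmulhuw path: y duplicated into both bytes (y * 0x0101), multiply-high
   * by the Y gain, bias folded in: roughly (y - 16) * 1.164 * 64. */
  int y1 = (int)(((uint32_t)(y * 0x0101) * YG) >> 16) - YGB;
  *b = Sat16(y1 + UB * (u - 128));
  *g = Sat16(y1 - UG * (u - 128) - VG * (v - 128));
  *r = Sat16(y1 + VR * (v - 128));
}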
+ #define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ - "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ - "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 #endif +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + // Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ - "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ - "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. 
+#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2157,65 +2607,34 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READYUV411_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I411TOARGBROW_AVX2 - #if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2223,27 +2642,144 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOARGBROW_AVX2 -#if defined(HAS_I422ALPHATOARGBROW_AVX2) +#if defined(HAS_I422TOAR30ROW_AVX2) // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + LABELALIGN - "1: \n" + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2255,33 +2791,35 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_AVX2 #if defined(HAS_I422TORGBAROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
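Not part of the upstream patch: the AR30 converters above synthesize their constants in registers instead of loading them; pcmpeqb sets every bit and a shift pair then carves out the wanted pattern, which keeps the asm blocks free of extra memory operands. Per 16-bit lane the setup computes (illustrative helper):

#include <stdint.h>

/* Lane values built into ymm5/ymm6/ymm7 by the I422/I210 ToAR30 setup. */
static void Ar30SetupLanes(uint16_t* alpha, uint16_t* minv, uint16_t* maxv) {
  uint16_t ones = 0xFFFF;                  /* pcmpeqb x,x: all ones */
  *alpha = (uint16_t)((ones >> 14) << 4);  /* 0x0030: two alpha bits, placed
                                              so pslld $0xa lands them at
                                              bits 30..31 of each dword */
  *minv = 0;                               /* pxor: floor for pmaxsw */
  *maxv = (uint16_t)(ones >> 6);           /* 0x03FF = 1023: pminsw ceiling */
}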
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2292,11 +2830,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2304,7 +2842,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -2313,16 +2851,18 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, #if defined(HAS_NV12TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2334,25 +2874,28 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV12TOARGBROW_AVX2 #if defined(HAS_NV21TOARGBROW_AVX2) // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2365,24 +2908,27 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV21TOARGBROW_AVX2 #if defined(HAS_YUY2TOARGBROW_AVX2) // 16 pixels. 
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2395,24 +2941,27 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_YUY2TOARGBROW_AVX2 #if defined(HAS_UYVYTOARGBROW_AVX2) // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2425,1131 +2974,1603 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_UYVYTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 - "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. 
G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN - "1: \n" - MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
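// The mask gathers the even (U) source bytes, reversed, into the low eight
// result bytes and the odd (V) bytes, reversed, into the high eight, so that
// movlpd/movhpd can store each deinterleaved half. For one 16-byte step, with
// src pointing at a block of eight interleaved UV pairs:
//
//   for (int i = 0; i < 8; ++i) {
//     dst_u[i] = src[14 - 2 * i];
//     dst_v[i] = src[15 - 2 * i];
//   }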
-static uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %4,%%xmm1 \n" - "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
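// Note this table reverses eight 32-bit ARGB pixels rather than bytes: used
// as a vpermd index vector, element i of the result takes source dword 7 - i,
// the scalar equivalent being
//
//   for (int i = 0; i < 8; ++i) dst_pix[i] = src_pix[7 - i];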
-static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" - LABELALIGN - "1: \n" - VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - 
"packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" - "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" - "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 
\n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + // clang-format off asm volatile ( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" "sub $0x20,%2 \n" "jg 1b \n" - "jmp 9f \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. 
LABELALIGN - "2: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" "sub $0x20,%2 \n" - "jg 2b \n" - "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// TODO(fbarchard): reduce to SSE2 +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. 
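// Each mask extracts every third byte (one channel) from one 16-byte chunk of
// packed RGB, with the 0x80 entries producing zero; OR-ing the three shuffled
// chunks yields 16 contiguous bytes of a single plane. The row as a whole
// reduces to the scalar form
//
//   for (int i = 0; i < width; ++i) {
//     dst_r[i] = src_rgb[3 * i + 0];
//     dst_g[i] = src_rgb[3 * i + 1];
//     dst_b[i] = src_rgb[3 * i + 2];
//   }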
+static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, + 2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u}; + +static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, + 3u, 6u, 9u, 12u, 15u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 2u, + 5u, 8u, 11u, 14u}; + +static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 0u, 3u, + 6u, 9u, 12u, 15u}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + "m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. 
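// These masks run in the opposite direction of the Split tables above,
// scattering the bytes of one plane into every third slot of a packed-RGB
// chunk; OR-ing the three shuffles re-interleaves the planes. Scalar
// equivalent:
//
//   for (int i = 0; i < width; ++i) {
//     dst_rgb[3 * i + 0] = src_r[i];
//     dst_rgb[3 * i + 1] = src_g[i];
//     dst_rgb[3 * i + 2] = src_b[i];
//   }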
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, + 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u, 128u, 5u}; +static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, + 128u, 2u, 128u, 128u, 3u, 128u, + 128u, 4u, 128u, 128u}; +static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, + 128u, 128u, 2u, 128u, 128u, 3u, + 128u, 128u, 4u, 128u}; + +static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, + 7u, 128u, 128u, 8u, 128u, 128u, + 9u, 128u, 128u, 10u}; +static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, + 128u, 7u, 128u, 128u, 8u, 128u, + 128u, 9u, 128u, 128u}; +static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, + 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u, 10u, 128u}; + +static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, + 12u, 128u, 128u, 13u, 128u, 128u, + 14u, 128u, 128u, 15u}; +static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, + 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u, 128u}; +static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, + 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u, 128u}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + "m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -void CopyRow_AVX(const uint8* src, 
uint8* dst, int count) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX #ifdef HAS_COPYROW_ERMS // Multiple of 1. -void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep movsb " MEMMOVESTRING(0,1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc" - ); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - 
"vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYALPHAROW_AVX2 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ", %%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" - "lea " MEMLEA(0x20, 0) ", %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8, 1) ", %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
+ "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" - "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + 
"vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -void SetRow_X86(uint8* dst, uint8 v8, int width) { +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); - const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. + asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } -void SetRow_ERMS(uint8* dst, uint8 v8, int width) { +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosb " MEMSTORESTRING(al,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - 
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + 
"movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand 
%%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); -} -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - 
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); 
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" - // 1 pixel loop. 
- "91: \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBBLENDROW_SSSE3 @@ -3559,46 +4580,49 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); } #endif // HAS_BLENDPLANEROW_SSSE3 @@ -3608,312 +4632,308 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" - "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" - "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" - // 32 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 32 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -static uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u -}; -static uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 4 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBATTENUATEROW_AVX2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // Unattenuate 8 pixels at a time. -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - // replace VPGATHER - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "movzb " MEMACCESS2(0x13,0) ",%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x17,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x1b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x1f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER + // 8 pixel loop. + LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBUNATTENUATEROW_AVX2 
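The attenuate/unattenuate pairs above compress a per-channel multiply or divide by alpha into pmulhuw tricks and a reciprocal lookup (fixed_invtbl8). A scalar sketch of the intended math may help when reading them; the helper names below are illustrative only, and the rounding mirrors the SIMD approximation rather than claiming bit-exactness:

#include <stdint.h>

/* Hypothetical scalar model of the attenuate/unattenuate rows above. */
static inline uint8_t AttenuateComponent(uint8_t c, uint8_t a) {
  /* Premultiply: roughly c*a/255.  The SIMD rows widen both bytes to
   * c*257 and a*257 (unpack/shuffle a register with itself) and keep the
   * high product bits, which stays within 1 of an exact /255 divide. */
  return (uint8_t)((257u * c * (257u * a)) >> 24);
}

static inline uint8_t UnattenuateComponent(uint8_t c, uint8_t a) {
  /* Un-premultiply: roughly c*255/a, saturated.  The rows above replace
   * the divide with a fixed-point reciprocal fetched from fixed_invtbl8. */
  uint32_t v;
  if (a == 0) return 0;
  v = ((uint32_t)c * 255u + a / 2u) / a;
  return (uint8_t)(v > 255u ? 255u : v);
}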
#ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -3922,412 +4942,415 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -static vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
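Read back into scalar form, the constants above say (remembering that libyuv stores ARGB as B,G,R,A in memory): each output channel is a small dot product of the input B/G/R, shifted right by 7. A sketch with a hypothetical helper name; the explicit clamp stands in for the saturation that packuswb provides in the SSSE3 row that follows:

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}

/* Hypothetical scalar equivalent of one pixel of the sepia row below. */
static void SepiaPixel_C(uint8_t* p) {
  int b = p[0], g = p[1], r = p[2];                  /* p[3] = alpha, kept */
  p[0] = Clamp255((b * 17 + g * 68 + r * 35) >> 7);  /* kARGBToSepiaB */
  p[1] = Clamp255((b * 22 + g * 88 + r * 45) >> 7);  /* kARGBToSepiaG */
  p[2] = Clamp255((b * 24 + g * 98 + r * 50) >> 7);  /* kARGBToSepiaR */
}

Note that the green and red coefficient rows sum past 128, so their dot products can exceed 255 and the clamp is not optional.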
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { - asm volatile ( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBSEPIAROW_SSSE3 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. 
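The same scalar reading applies to the color-matrix row below, with the coefficients supplied at run time as sixteen signed bytes. A sketch with a hypothetical helper; mapping matrix rows to output channels in B,G,R,A order is an assumption read off the pshufd splats ($0x00/$0x55/$0xaa/$0xff):

#include <stdint.h>

/* Hypothetical scalar equivalent of one pixel of the row below. */
static void ColorMatrixPixel_C(const uint8_t* src, uint8_t* dst,
                               const int8_t* m) { /* 16 signed coefficients */
  int ch;
  for (ch = 0; ch < 4; ++ch) {
    const int8_t* row = m + ch * 4;  /* one splatted register's worth */
    int v = (src[0] * row[0] + src[1] * row[1] + src[2] * row[2] +
             src[3] * row[3]) >> 6;  /* matches psraw $0x6; assumes the
                                        usual arithmetic >> on negatives */
    dst[ch] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}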
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). 
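The quantize row below posterizes B, G and R while the 0xFF000000 mask built in xmm6 carries alpha through untouched. In scalar terms, assuming scale is the 16.16 fixed-point bucket factor the caller derives from interval_size (the vector code additionally saturates on the final pack):

#include <stdint.h>

/* Hypothetical scalar equivalent of one pixel of the quantize row below. */
static void QuantizePixel_C(uint8_t* p, int scale, int interval_size,
                            int interval_offset) {
  int ch;
  for (ch = 0; ch < 3; ++ch) {                  /* b, g, r; p[3] unchanged */
    int bucket = (p[ch] * scale) >> 16;         /* pmulhuw */
    p[ch] = (uint8_t)(bucket * interval_size +  /* pmullw  */
                      interval_offset);         /* paddw   */
  }
}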
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( - // 4 pixel loop. 
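// Editor's sketch (not part of the upstream patch): the multiply rows
// approximate dst = a*b/255 per channel.  One operand is widened to a*257
// by unpacking a register with itself, the other to a plain 16-bit b, and
// pmulhuw keeps (a*257*b) >> 16, which is within 1 of the exact result.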
- LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" #if defined(__AVX2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBSUBTRACTROW_AVX2 @@ -4336,52 +5359,53 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. 
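// Editor's sketch (not part of the upstream patch): both variants of the
// SobelXRow_SSE2 loop below compute, per output pixel,
//   sobel_x = |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|
// saturated to 0..255, i.e. the -1 0 1 / -2 0 2 / -1 0 1 kernel above,
// with pmaxsw against the negated sum supplying the absolute value.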
- LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 - MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELXROW_SSE2 @@ -4390,50 +5414,50 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. 
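// Editor's sketch (not part of the upstream patch): both variants of the
// SobelYRow_SSE2 loop below compute, per output pixel,
//   sobel_y = |(y0[i]-y1[i]) + 2*(y0[i+1]-y1[i+1]) + (y0[i+2]-y1[i+2])|
// saturated to 0..255.  The middle row of the -1 -2 -1 / 0 0 0 / 1 2 1
// kernel above is all zero, which is why only two input rows are needed.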
- LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" - MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" - MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELYROW_SSE2 @@ -4443,79 +5467,79 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. 
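// Editor's sketch (not part of the upstream patch): the loop below forms
// s = min(255, sobelx + sobely) with paddusb, then unpacks each s into a
// grey ARGB pixel (s, s, s, 0xFF); xmm5 holds the 0xFF000000 alpha mask
// that is OR'ed into every output dword.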
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_SOBELTOPLANEROW_SSE2 @@ -4525,1004 +5549,1123 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6," MEMACCESS(2) " \n" - "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_SOBELXYROW_SSE2 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. 
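+// A minimal scalar sketch of that recurrence (an assumption; the name is
+// hypothetical and the four lanes are the interleaved ARGB channels):
+//   static void ComputeCumulativeSumRow_C_sketch(const uint8_t* row,
+//                                                int32_t* cumsum,
+//                                                const int32_t* previous_cumsum,
+//                                                int width) {
+//     int32_t sum[4] = {0, 0, 0, 0};  // running per-channel sums, this row
+//     for (int x = 0; x < width * 4; ++x) {
+//       sum[x & 3] += row[x];                         // extend the row sum
+//       cumsum[x] = previous_cumsum[x] + sum[x & 3];  // add the row above
+//     }
+//   }
+// The SSE2 version below carries the four running sums in xmm0 and handles
+// four pixels (16 bytes) per iteration of its main loop.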
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movd " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + // 1 pixel loop. 
+ LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, int count) { - asm volatile ( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" - // 4 pixel small loop \n" - LABELALIGN - "4: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" + // 4 pixel small loop. 
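+      // Per output dword this evaluates the classic integral-image box sum
+      // (a sketch; %4 is the box width w in dwords, four lanes per pixel):
+      //   int32_t sum = topleft[i] - topleft[i + w]
+      //               - botleft[i] + botleft[i + w];
+      //   dst[i] = (uint8_t)(sum * (1.0f / area));
+      // This "small" path keeps the reciprocal in 16-bit fixed point
+      // (pmulhuw via xmm5); the general path at 40: uses float multiplies.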
+ LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - 
"10: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - "lea " MEMLEA(0x10,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - "lea " MEMLEA(0x10,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* src_dudv, int width) { +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( - "movq " MEMACCESS(3) ",%%xmm2 \n" - "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1," MEMACCESS(2) " \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) 
// movd (%0,%1,1),%%xmm0 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0," MEMACCESS2(0x08,2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x04,2) ",%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" - // General purpose row blend. 
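+      // The blend computed here, as scalar C (a sketch; source_y_fraction
+      // is 0..256 and src1 = src + stride):
+      //   int f = source_y_fraction;
+      //   dst[i] = (src[i] * (256 - f) + src1[i] * f + 128) >> 8;
+      // The 0x80808080 bias re-centers the bytes so pmaddubsw's signed
+      // operand stays in range; it is added back as words before psrlw.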
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - MEMOPMEM(movdqu,xmm2,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { - asm volatile ( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. - LABELALIGN - "50: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. 
- LABELALIGN - "100: \n" - "rep movsb " MEMMOVESTRING(1,0) " \n" - "jmp 999f \n" + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" - ); + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
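+// A scalar sketch of the shuffle (an assumption: the first four shuffler
+// bytes index within each 4-byte pixel, which vpshufb applies to whole
+// 16-byte lanes below):
+//   for (int x = 0; x < width; ++x)
+//     for (int b = 0; b < 4; ++b)
+//       dst_argb[x * 4 + b] = src_argb[x * 4 + (shuffler[b] & 3)];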
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - uintptr_t pixel_temp; - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - "mov " MEMACCESS(4) ",%k2 \n" - "cmp $0x3000102,%k2 \n" - "je 3012f \n" - "cmp $0x10203,%k2 \n" - "je 123f \n" - "cmp $0x30201,%k2 \n" - "je 321f \n" - "cmp $0x2010003,%k2 \n" - "je 2103f \n" - - LABELALIGN - "1: \n" - "movzb " MEMACCESS(4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS(1) " \n" - "movzb " MEMACCESS2(0x1,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x1,1) " \n" - "movzb " MEMACCESS2(0x2,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x2,1) " \n" - "movzb " MEMACCESS2(0x3,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x3,1) " \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "sub $0x1,%3 \n" - "jg 1b \n" - "jmp 99f \n" - - LABELALIGN - "123: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm1,%%xmm1 \n" - "pshuflw $0x1b,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 123b \n" - "jmp 99f \n" - - LABELALIGN - "321: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x39,%%xmm0,%%xmm0 \n" - "pshuflw $0x39,%%xmm0,%%xmm0 \n" - "pshufhw $0x39,%%xmm1,%%xmm1 \n" - "pshuflw $0x39,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 321b \n" - "jmp 99f \n" - - LABELALIGN - "2103: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - 
"punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x93,%%xmm0,%%xmm0 \n" - "pshuflw $0x93,%%xmm0,%%xmm0 \n" - "pshufhw $0x93,%%xmm1,%%xmm1 \n" - "pshuflw $0x93,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 2103b \n" - "jmp 99f \n" - - LABELALIGN - "3012: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0xc6,%%xmm0,%%xmm0 \n" - "pshuflw $0xc6,%%xmm0,%%xmm0 \n" - "pshufhw $0xc6,%%xmm1,%%xmm1 \n" - "pshuflw $0xc6,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 3012b \n" - - "99: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "=&d"(pixel_temp), // %2 - "+r"(width) // %3 - : "r"(shuffler) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); -} -#endif // HAS_ARGBSHUFFLEROW_SSE2 - #ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(3) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 #ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS(3) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + 
const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" - "addps " MEMACCESS(3) ",%%xmm0 \n" - "addps " MEMACCESS(3) ",%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + 
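+      // Byte order being produced (a sketch, per 2-pixel group):
+      //   dst_uyvy[4 * i + 0] = src_u[i];
+      //   dst_uyvy[4 * i + 1] = src_y[2 * i + 0];
+      //   dst_uyvy[4 * i + 2] = src_v[i];
+      //   dst_uyvy[4 * i + 3] = src_y[2 * i + 1];
+      // YUY2 above is the same data with the Y and chroma byte positions
+      // exchanged, hence the mirrored vpunpck* operand order.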
"add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" - "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" - "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" - "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" - // 2 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels - "lea " MEMLEA(0x8,0) ",%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 2 pixel loop. 
+ LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" + + // 16 pixel loop. 
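+      // A scalar sketch of this path: dst[i] = float_to_half((float)src[i] *
+      // scale). vcvtps2ph converts directly, so the kScaleBias trick used by
+      // the SSE2/AVX2 paths above (premultiply by 2^-112, then psrld $0xd to
+      // slice out the half-float bits) is not needed here.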
+ LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "movzb " MEMACCESS2(-0x1,0) ",%1 \n" - MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x1,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. 
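+      // This kernel, like the ARGB variant above, applies a per-channel
+      // 256-entry lookup laid out as table_argb[value * 4 + channel]
+      // (a sketch; the RGB variant leaves alpha untouched):
+      //   dst[0] = table_argb[dst[0] * 4 + 0];  // B
+      //   dst[1] = table_argb[dst[1] * 4 + 1];  // G
+      //   dst[2] = table_argb[dst[2] * 4 + 2];  // R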
- LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_RGBCOLORTABLEROW_X86 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { + const uint8_t* luma, + uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(2) ",%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + // 4 pixel loop. 
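+      // Rough scalar model (an assumption): a weighted luma, computed by
+      // pmaddubsw/phaddw from lumacoeff and masked to a multiple of 256 by
+      // pand with xmm4, selects one 256-byte sub-table of `luma`; B/G/R then
+      // map through it while A is copied unchanged:
+      //   uint32_t o = weighted_luma(src) & 0xff00;  // hypothetical helper
+      //   dst[0] = luma[o + src[0]];
+      //   dst[1] = luma[o + src[1]];
+      //   dst[2] = luma[o + src[2]];
+      //   dst[3] = src[3];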
+ LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb " MEMACCESS(2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS(3) " \n" - "movzb " MEMACCESS2(0x1,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x1,3) " \n" - "movzb " MEMACCESS2(0x2,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x2,3) " \n" - "movzb " MEMACCESS2(0x3,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x3,3) " \n" + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb " MEMACCESS2(0x4,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x4,3) " \n" - "movzb " MEMACCESS2(0x5,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x5,3) " \n" - "movzb " MEMACCESS2(0x6,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x6,3) " \n" - "movzb " MEMACCESS2(0x7,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x7,3) " \n" + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb " MEMACCESS2(0x8,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x8,3) " \n" - "movzb " MEMACCESS2(0x9,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x9,3) " \n" - "movzb " MEMACCESS2(0xa,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xa,3) " \n" - "movzb " MEMACCESS2(0xb,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xb,3) " \n" + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" - "movzb " MEMACCESS2(0xc,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xc,3) " \n" - "movzb " MEMACCESS2(0xd,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xd,3) " \n" - "movzb " MEMACCESS2(0xe,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xe,3) " \n" - "movzb " MEMACCESS2(0xf,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xf,3) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "lea " MEMLEA(0x10,3) ",%3 \n" - "sub $0x4,%4 \n" - "jg 1b 
\n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" - ); + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 diff --git a/libs/libvpx/third_party/libyuv/source/row_mips.cc b/libs/libvpx/third_party/libyuv/source/row_mips.cc deleted file mode 100644 index 285f0b5adc..0000000000 --- a/libs/libvpx/third_party/libyuv/source/row_mips.cc +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -#ifdef HAS_COPYROW_MIPS -void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { - __asm__ __volatile__ ( - ".set noreorder \n" - ".set noat \n" - "slti $at, %[count], 8 \n" - "bne $at ,$zero, $last8 \n" - "xor $t8, %[src], %[dst] \n" - "andi $t8, $t8, 0x3 \n" - - "bne $t8, $zero, unaligned \n" - "negu $a3, %[dst] \n" - // make dst/src aligned - "andi $a3, $a3, 0x3 \n" - "beq $a3, $zero, $chk16w \n" - // word-aligned now count is the remining bytes count - "subu %[count], %[count], $a3 \n" - - "lwr $t8, 0(%[src]) \n" - "addu %[src], %[src], $a3 \n" - "swr $t8, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - - // Now the dst/src are mutually word-aligned with word-aligned addresses - "$chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
- // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, chk8w \n" - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" - // t0 is the "past the end" address - - // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past - // the "t0-32" address - // This means: for x=128 the last "safe" a1 address is "t0-160" - // Alternatively, for x=64 the last "safe" a1 address is "t0-96" - // we will use "pref 30,128(a1)", so "t0-160" is the limit - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line of src - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $loop16w \n" - "nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$loop16w: \n" - "pref 0, 96(%[src]) \n" - "lw $t0, 0(%[src]) \n" - "bgtz $v1, $skip_pref30_96 \n" // skip - "lw $t1, 4(%[src]) \n" - "pref 30, 96(%[dst]) \n" // continue - "$skip_pref30_96: \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lw $t0, 32(%[src]) \n" - "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) - "lw $t1, 36(%[src]) \n" - "pref 30, 128(%[dst]) \n" // set dest, addr 128 - "$skip_pref30_128: \n" - "lw $t2, 40(%[src]) \n" - "lw $t3, 44(%[src]) \n" - "lw $t4, 48(%[src]) \n" - "lw $t5, 52(%[src]) \n" - "lw $t6, 56(%[src]) \n" - "lw $t7, 60(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst], %[dst], 64 \n" // adding 64 to dest - "sgtu $v1, %[dst], $t9 \n" - "bne %[dst], $a3, $loop16w \n" - " addiu %[src], %[src], 64 \n" // adding 64 to src - "move %[count], $t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
- // the t8 is the reminder count past 32-bytes - "beq %[count], $t8, chk1w \n" - // count=t8,no 32-byte chunk - " nop \n" - - "lw $t0, 0(%[src]) \n" - "lw $t1, 4(%[src]) \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, $last8 \n" - " subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - // copying in words (4-byte chunks) - "$wordCopy_loop: \n" - "lw $t3, 0(%[src]) \n" - // the first t3 may be equal t0 ... optimize? - "addiu %[src], %[src],4 \n" - "addiu %[dst], %[dst],4 \n" - "bne %[dst], $a3,$wordCopy_loop \n" - " sw $t3, -4(%[dst]) \n" - - // For the last (<8) bytes - "$last8: \n" - "blez %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 -last dst address - "$last8loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst], $a3, $last8loop \n" - " sb $v1, -1(%[dst]) \n" - - "leave: \n" - " j $ra \n" - " nop \n" - - // - // UNALIGNED case - // - - "unaligned: \n" - // got here with a3="negu a1" - "andi $a3, $a3, 0x3 \n" // a1 is word aligned? - "beqz $a3, $ua_chk16w \n" - " subu %[count], %[count], $a3 \n" - // bytes left after initial a3 bytes - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 - "swr $v1, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - // below the dst will be word aligned (NOTE1) - "$ua_chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
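Note: the lwr/lwl pairs that follow assemble one 32-bit word from an unaligned source address (on a little-endian core, lwr fills the low bytes and lwl the high bytes). The portable C spelling is a 4-byte memcpy, which MIPS32 compilers lower back to an lwl/lwr pair (helper names illustrative):

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t load_u32_any(const uint8_t* p) {
      uint32_t w;
      memcpy(&w, p, 4);  /* lwr/lwl pair when p may be unaligned */
      return w;
    }

    static inline void store_u32_aligned(uint8_t* p, uint32_t w) {
      memcpy(p, &w, 4);  /* dst is word-aligned in this path: a plain sw */
    }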
- // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, ua_chk8w \n" - // if a2==t8, no 64-byte chunks - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" // t0 "past the end" - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line addr 32 - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // safe, as we have at least 64 bytes ahead - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $ua_loop16w \n" - // skip "pref 30,64(a1)" for too short arrays - " nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$ua_loop16w: \n" - "pref 0, 96(%[src]) \n" - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "bgtz $v1, $ua_skip_pref30_96 \n" - " lwl $t1, 7(%[src]) \n" - "pref 30, 96(%[dst]) \n" - // continue setting up the dest, addr 96 - "$ua_skip_pref30_96: \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lwr $t0, 32(%[src]) \n" - "lwl $t0, 35(%[src]) \n" - "lwr $t1, 36(%[src]) \n" - "bgtz $v1, ua_skip_pref30_128 \n" - " lwl $t1, 39(%[src]) \n" - "pref 30, 128(%[dst]) \n" - // continue setting up the dest, addr 128 - "ua_skip_pref30_128: \n" - - "lwr $t2, 40(%[src]) \n" - "lwl $t2, 43(%[src]) \n" - "lwr $t3, 44(%[src]) \n" - "lwl $t3, 47(%[src]) \n" - "lwr $t4, 48(%[src]) \n" - "lwl $t4, 51(%[src]) \n" - "lwr $t5, 52(%[src]) \n" - "lwl $t5, 55(%[src]) \n" - "lwr $t6, 56(%[src]) \n" - "lwl $t6, 59(%[src]) \n" - "lwr $t7, 60(%[src]) \n" - "lwl $t7, 63(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst],%[dst],64 \n" // adding 64 to dest - "sgtu $v1,%[dst],$t9 \n" - "bne %[dst],$a3,$ua_loop16w \n" - " addiu %[src],%[src],64 \n" // adding 64 to src - "move %[count],$t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "ua_chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
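Note: with the destination word-aligned by the head fix-up (NOTE1) and the source left arbitrary, the $ua_loop16w body above reduces to sixteen unaligned loads followed by sixteen aligned stores per trip. A C sketch under those assumptions:

    #include <stdint.h>
    #include <string.h>

    void copy_unaligned_src(const uint8_t* src, uint8_t* dst, size_t count) {
      while (count >= 64) {                   /* $ua_loop16w */
        uint32_t w[16];
        for (int i = 0; i < 16; ++i) memcpy(&w[i], src + 4 * i, 4); /* lwr/lwl */
        for (int i = 0; i < 16; ++i) memcpy(dst + 4 * i, &w[i], 4); /* sw */
        src += 64; dst += 64; count -= 64;
      }
      while (count--) *dst++ = *src++;        /* ua_smallCopy */
    }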
- // the t8 is the reminder count - "beq %[count], $t8, $ua_chk1w \n" - // when count==t8, no 32-byte chunk - - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "lwl $t1, 7(%[src]) \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "$ua_chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, ua_smallCopy \n" - "subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - - // copying in words (4-byte chunks) - "$ua_wordCopy_loop: \n" - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addiu %[src], %[src], 4 \n" - "addiu %[dst], %[dst], 4 \n" - // note: dst=a1 is word aligned here, see NOTE1 - "bne %[dst], $a3, $ua_wordCopy_loop \n" - " sw $v1,-4(%[dst]) \n" - - // Now less than 4 bytes (value in count) left to copy - "ua_smallCopy: \n" - "beqz %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 = last dst address - "$ua_smallCopy_loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst],$a3,$ua_smallCopy_loop \n" - " sb $v1, -1(%[dst]) \n" - - "j $ra \n" - " nop \n" - ".set at \n" - ".set reorder \n" - : [dst] "+r" (dst), [src] "+r" (src) - : [count] "r" (count) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", - "t8", "t9", "a3", "v1", "at" - ); -} -#endif // HAS_COPYROW_MIPS - -// DSPR2 functions -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) - -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - "1: \n" - "addiu $t4, $t4, -1 \n" - "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "addiu %[src_uv], %[src_uv], 32 \n" - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "sw $t9, 0(%[dst_v]) \n" - "sw $t0, 0(%[dst_u]) \n" - "sw $t1, 4(%[dst_v]) \n" - "sw $t2, 4(%[dst_u]) \n" - "sw $t3, 8(%[dst_v]) \n" - "sw $t5, 8(%[dst_u]) \n" - "sw $t6, 12(%[dst_v]) \n" - "sw 
$t7, 12(%[dst_u]) \n" - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6", "t7", "t8", "t9" - ); -} - -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t4, %[width], 4 \n" // multiplies of 16 - "andi $t5, %[width], 0xf \n" - "blez $t4, 2f \n" - " addu %[src], %[src], %[width] \n" // src += width - - "1: \n" - "lw $t0, -16(%[src]) \n" // |3|2|1|0| - "lw $t1, -12(%[src]) \n" // |7|6|5|4| - "lw $t2, -8(%[src]) \n" // |11|10|9|8| - "lw $t3, -4(%[src]) \n" // |15|14|13|12| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t1, $t1 \n" // |6|7|4|5| - "wsbh $t2, $t2 \n" // |10|11|8|9| - "wsbh $t3, $t3 \n" // |14|15|12|13| - "rotr $t0, $t0, 16 \n" // |0|1|2|3| - "rotr $t1, $t1, 16 \n" // |4|5|6|7| - "rotr $t2, $t2, 16 \n" // |8|9|10|11| - "rotr $t3, $t3, 16 \n" // |12|13|14|15| - "addiu %[src], %[src], -16 \n" - "addiu $t4, $t4, -1 \n" - "sw $t3, 0(%[dst]) \n" // |15|14|13|12| - "sw $t2, 4(%[dst]) \n" // |11|10|9|8| - "sw $t1, 8(%[dst]) \n" // |7|6|5|4| - "sw $t0, 12(%[dst]) \n" // |3|2|1|0| - "bgtz $t4, 1b \n" - " addiu %[dst], %[dst], 16 \n" - "beqz $t5, 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, -1(%[src]) \n" - "addiu $t5, $t5, -1 \n" - "addiu %[src], %[src], -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgez $t5, 2b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src] "+r" (src), [dst] "+r" (dst) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", "t5" - ); -} - -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - int x; - int y; - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "addu $t4, %[width], %[width] \n" - "srl %[x], %[width], 4 \n" - "andi %[y], %[width], 0xf \n" - "blez %[x], 2f \n" - " addu %[src_uv], %[src_uv], $t4 \n" - - "1: \n" - "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| - "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| - "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| - "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| - "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| - "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| - "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| - "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| - - "rotr $t0, $t0, 16 \n" // |1|0|3|2| - "rotr $t1, $t1, 16 \n" // |5|4|7|6| - "rotr $t2, $t2, 16 \n" // |9|8|11|10| - "rotr $t3, $t3, 16 \n" // |13|12|15|14| - "rotr $t4, $t4, 16 \n" // |17|16|19|18| - "rotr $t6, $t6, 16 \n" // |21|20|23|22| - "rotr $t7, $t7, 16 \n" // |25|24|27|26| - "rotr $t8, $t8, 16 \n" // |29|28|31|30| - "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| - "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| - "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| - "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| - "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| - "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| - "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| - "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| - "addiu %[src_uv], %[src_uv], -32 \n" - "addiu %[x], %[x], -1 \n" - "swr $t4, 0(%[dst_u]) \n" - 
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| - "swr $t6, 0(%[dst_v]) \n" - "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| - "swr $t3, 4(%[dst_v]) \n" - "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| - "swr $t0, 8(%[dst_u]) \n" - "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| - "swr $t1, 8(%[dst_v]) \n" - "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| - "swr $t9, 12(%[dst_u]) \n" - "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| - "swr $t5, 12(%[dst_v]) \n" - "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz %[x], 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - "beqz %[y], 3f \n" - " nop \n" - "b 2f \n" - " nop \n" - - "2: \n" - "lbu $t0, -2(%[src_uv]) \n" - "lbu $t1, -1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], -2 \n" - "addiu %[y], %[y], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[y], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v), - [x] "=&r" (x), - [y] "=&r" (y) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", - "t5", "t7", "t8", "t9" - ); -} - -// Convert (4 Y and 2 VU) I422 and arrange RGB values into -// t5 = | 0 | B0 | 0 | b0 | -// t4 = | 0 | B1 | 0 | b1 | -// t9 = | 0 | G0 | 0 | g0 | -// t8 = | 0 | G1 | 0 | g1 | -// t2 = | 0 | R0 | 0 | r0 | -// t1 = | 0 | R1 | 0 | r1 | -#define YUVTORGB \ - "lw $t0, 0(%[y_buf]) \n" \ - "lhu $t1, 0(%[u_buf]) \n" \ - "lhu $t2, 0(%[v_buf]) \n" \ - "preceu.ph.qbr $t1, $t1 \n" \ - "preceu.ph.qbr $t2, $t2 \n" \ - "preceu.ph.qbra $t3, $t0 \n" \ - "preceu.ph.qbla $t0, $t0 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t3, $t3, $s4 \n" \ - "subu.ph $t0, $t0, $s4 \n" \ - "mul.ph $t3, $t3, $s0 \n" \ - "mul.ph $t0, $t0, $s0 \n" \ - "shll.ph $t4, $t1, 0x7 \n" \ - "subu.ph $t4, $t4, $t1 \n" \ - "mul.ph $t6, $t1, $s1 \n" \ - "mul.ph $t1, $t2, $s2 \n" \ - "addq_s.ph $t5, $t4, $t3 \n" \ - "addq_s.ph $t4, $t4, $t0 \n" \ - "shra.ph $t5, $t5, 6 \n" \ - "shra.ph $t4, $t4, 6 \n" \ - "addiu %[u_buf], 2 \n" \ - "addiu %[v_buf], 2 \n" \ - "addu.ph $t6, $t6, $t1 \n" \ - "mul.ph $t1, $t2, $s3 \n" \ - "addu.ph $t9, $t6, $t3 \n" \ - "addu.ph $t8, $t6, $t0 \n" \ - "shra.ph $t9, $t9, 6 \n" \ - "shra.ph $t8, $t8, 6 \n" \ - "addu.ph $t2, $t1, $t3 \n" \ - "addu.ph $t1, $t1, $t0 \n" \ - "shra.ph $t2, $t2, 6 \n" \ - "shra.ph $t1, $t1, 6 \n" \ - "subu.ph $t5, $t5, $s5 \n" \ - "subu.ph $t4, $t4, $s5 \n" \ - "subu.ph $t9, $t9, $s5 \n" \ - "subu.ph $t8, $t8, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "shll_s.ph $t5, $t5, 8 \n" \ - "shll_s.ph $t4, $t4, 8 \n" \ - "shll_s.ph $t9, $t9, 8 \n" \ - "shll_s.ph $t8, $t8, 8 \n" \ - "shll_s.ph $t2, $t2, 8 \n" \ - "shll_s.ph $t1, $t1, 8 \n" \ - "shra.ph $t5, $t5, 8 \n" \ - "shra.ph $t4, $t4, 8 \n" \ - "shra.ph $t9, $t9, 8 \n" \ - "shra.ph $t8, $t8, 8 \n" \ - "shra.ph $t2, $t2, 8 \n" \ - "shra.ph $t1, $t1, 8 \n" \ - "addu.ph $t5, $t5, $s5 \n" \ - "addu.ph $t4, $t4, $s5 \n" \ - "addu.ph $t9, $t9, $s5 \n" \ - "addu.ph $t8, $t8, $s5 \n" \ - "addu.ph $t2, $t2, $s5 \n" \ - "addu.ph $t1, $t1, $s5 \n" - -// TODO(fbarchard): accept yuv conversion constants. 
-void I422ToARGBRow_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n" // |0|16|0|16| - "repl.ph $s5, 128 \n" // |128|128| // clipping - "lui $s6, 0xff00 \n" - "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - - "1: \n" - YUVTORGB -// Arranging into argb format - "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| - "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| - "addiu %[width], -4 \n" - "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| - "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| - "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| - - "addiu %[y_buf], 4 \n" - "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| - "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| - "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| - "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| - "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| - "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| - "sll $t9, $t9, 16 \n" - "sll $t8, $t8, 16 \n" - "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| - "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| -// Store results. - "sw $t2, 0(%[rgb_buf]) \n" - "sw $t0, 4(%[rgb_buf]) \n" - "sw $t1, 8(%[rgb_buf]) \n" - "sw $t3, 12(%[rgb_buf]) \n" - "bnez %[width], 1b \n" - " addiu %[rgb_buf], 16 \n" - "2: \n" - ".set pop \n" - :[y_buf] "+r" (y_buf), - [u_buf] "+r" (u_buf), - [v_buf] "+r" (v_buf), - [width] "+r" (width), - [rgb_buf] "+r" (rgb_buf) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - int y0_fraction = 256 - source_y_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "replv.ph $t0, %[y0_fraction] \n" - "replv.ph $t1, %[source_y_fraction] \n" - - "1: \n" - "lw $t2, 0(%[src_ptr]) \n" - "lw $t3, 0(%[src_ptr1]) \n" - "lw $t4, 4(%[src_ptr]) \n" - "lw $t5, 4(%[src_ptr1]) \n" - "muleu_s.ph.qbl $t6, $t2, $t0 \n" - "muleu_s.ph.qbr $t7, $t2, $t0 \n" - "muleu_s.ph.qbl $t8, $t3, $t1 \n" - "muleu_s.ph.qbr $t9, $t3, $t1 \n" - "muleu_s.ph.qbl $t2, $t4, $t0 \n" - "muleu_s.ph.qbr $t3, $t4, $t0 \n" - "muleu_s.ph.qbl $t4, $t5, $t1 \n" - "muleu_s.ph.qbr $t5, $t5, $t1 \n" - "addq.ph $t6, $t6, $t8 \n" - "addq.ph $t7, $t7, $t9 \n" - "addq.ph $t2, $t2, $t4 \n" - "addq.ph $t3, $t3, $t5 \n" - "shra.ph $t6, $t6, 8 \n" - "shra.ph $t7, $t7, 8 \n" - "shra.ph $t2, $t2, 8 \n" - "shra.ph $t3, $t3, 8 \n" - "precr.qb.ph $t6, $t6, $t7 \n" - "precr.qb.ph $t2, $t2, $t3 \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[src_ptr1], %[src_ptr1], 8 \n" - "addiu %[dst_width], %[dst_width], -8 \n" - "sw $t6, 0(%[dst_ptr]) \n" - "sw $t2, 4(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[dst_ptr], %[dst_ptr], 8 \n" - - ".set pop \n" - : [dst_ptr] "+r" (dst_ptr), - [src_ptr1] "+r" (src_ptr1), - [src_ptr] "+r" (src_ptr), - [dst_width] "+r" (dst_width) - : [source_y_fraction] "r" (source_y_fraction), - [y0_fraction] "r" (y0_fraction), - [src_stride] "r" (src_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} -#endif // __mips_dsp_rev >= 
2 - -#endif // defined(__mips__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libs/libvpx/third_party/libyuv/source/row_msa.cc b/libs/libvpx/third_party/libyuv/source/row_msa.cc new file mode 100644 index 0000000000..4fb2631f0b --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/row_msa.cc @@ -0,0 +1,3512 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ + bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ + br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + } + +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m; \ + uint32_t u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ + } + +// Clip input vector elements between 0 to 255 +#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ + { \ + v4i32 max_m = __msa_ldi_w(0xFF); \ + \ + in0 = __msa_maxi_s_w(in0, 0); \ + in1 = __msa_maxi_s_w(in1, 0); \ + in2 = __msa_maxi_s_w(in2, 0); \ + in3 = __msa_maxi_s_w(in3, 0); \ + in4 = __msa_maxi_s_w(in4, 0); \ + in5 = __msa_maxi_s_w(in5, 0); \ + in0 = __msa_min_s_w(max_m, in0); \ + in1 = __msa_min_s_w(max_m, in1); \ + in2 = __msa_min_s_w(max_m, in2); \ + in3 = __msa_min_s_w(max_m, in3); \ + in4 = __msa_min_s_w(max_m, in4); \ + in5 = __msa_min_s_w(max_m, in5); \ + } + +// Convert 8 pixels of YUV 420 to RGB. 
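Note: CLIP_0TO255 above saturates six vectors of 32-bit lanes with one maxi_s/min_s pair each; per lane it is simply:

    #include <stdint.h>

    /* Per-lane behaviour of CLIP_0TO255. */
    static inline int32_t clip_0to255(int32_t lane) {
      if (lane < 0) lane = 0;        /* __msa_maxi_s_w(in, 0)    */
      if (lane > 255) lane = 255;    /* __msa_min_s_w(max_m, in) */
      return lane;
    }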
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m - reg5_m; \ + reg6_m = reg1_m - reg6_m; \ + reg2_m = reg0_m - reg2_m; \ + reg3_m = reg1_m - reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m += bb; \ + reg6_m += bb; \ + reg7_m += bg; \ + reg4_m += bg; \ + reg2_m += br; \ + reg3_m += br; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. 
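Note: ARGBTOY below takes its weights as parameters; the open-coded ARGBToYRow_MSA later in this file uses 25 (B), 129 (G) and 66 (R) with bias 0x1080 and shift 8. Per pixel that is the following scalar sketch (0x1080 = 16.5 << 8, i.e. the +16 luma offset plus rounding):

    #include <stdint.h>

    /* Per-pixel BT.601 studio-swing luma, as computed by ARGBToYRow_MSA. */
    static inline uint8_t argb_to_y(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
    }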
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v16u8 vec8_m, vec9_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ + src3_m = 
(v16u8)__msa_ld_b((v16i8*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ + reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Takes ARGB input and calculates U and V. 
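Note: READ_ARGB above averages two rows and horizontal pixel pairs (the hadd/pckev/pckod ladder followed by >> 2), so ARGBTOUV below operates on 2x2-averaged channels. With the weights ARGBToUVRow_MSA uses later in this file (0x70 = 112, 0x4A = 74, 0x26 = 38, 0x5E = 94, 0x12 = 18, bias 0x8080 = 128.5 << 8), the per-pixel math is:

    #include <stdint.h>

    /* U/V from 2x2-averaged b/g/r, per ARGBToUVRow_MSA's constants. */
    static inline uint8_t argb_to_u(int b, int g, int r) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static inline uint8_t argb_to_v(int b, int g, int r) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }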
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const1); \ + reg1_m = __msa_dotp_u_h(vec1_m, const1); \ + reg2_m = __msa_dotp_u_h(vec4_m, const1); \ + reg3_m = __msa_dotp_u_h(vec5_m, const1); \ + reg0_m += const3; \ + reg1_m += const3; \ + reg2_m += const3; \ + reg3_m += const3; \ + reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ + v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + } + +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void 
I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64_t data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + 
STOREARGB(vec0, vec1, vec2, src3, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + dst_argb += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int64_t data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + dst_argb += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec2, vec1); + vec0 = __msa_srai_h(vec0, 3); + vec1 = __msa_srai_h(vec1, 3); + vec2 = __msa_srai_h(vec2, 2); + vec1 = __msa_slli_h(vec1, 11); + vec2 = __msa_slli_h(vec2, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
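Note: the two TODOs above concern how the upper channel bits are isolated (AND versus shift); the packing itself, as produced by I422ToRGB565Row_MSA above and the ARGB4444 routine that follows, is per pixel:

    #include <stdint.h>

    /* RGB565: R in bits 15..11, G in 10..5, B in 4..0. */
    static inline uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }

    /* ARGB4444 with opaque alpha: A = 0xF in bits 15..12, then R, G, B. */
    static inline uint16_t pack_argb4444(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(0xF000 | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4));
    }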
+void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 4); + reg1 = (v8u16)__msa_srai_h(vec1, 4); + reg2 = (v8u16)__msa_srai_h(vec2, 4); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 3); + reg1 = (v8u16)__msa_srai_h(vec1, 3); + reg2 = (v8u16)__msa_srai_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, 
(v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = 
(v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + vec0 = 
(v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 += __msa_hadd_u_h(vec8, vec8); + reg1 += __msa_hadd_u_h(vec9, vec9); + reg2 += __msa_hadd_u_h(vec4, vec4); + reg3 += __msa_hadd_u_h(vec5, vec5); + reg4 += __msa_hadd_u_h(vec0, vec0); + reg5 += __msa_hadd_u_h(vec1, vec1); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg6 = reg0 * const_0x70; + reg7 = reg1 * const_0x70; + reg8 = reg2 * const_0x4A; + reg9 = reg3 * const_0x4A; + reg6 += const_0x8080; + reg7 += const_0x8080; + reg8 += reg4 * const_0x26; + reg9 += reg5 * const_0x26; + reg0 *= const_0x12; + reg1 *= const_0x12; + reg2 *= const_0x5E; + reg3 *= const_0x5E; + reg4 *= const_0x70; + reg5 *= const_0x70; + reg2 += reg0; + reg3 += reg1; + reg4 += const_0x8080; + reg5 += const_0x8080; + reg6 -= reg8; + reg7 -= reg9; + reg4 -= reg2; + reg5 -= reg3; + reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); + reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb0 += 128; + src_argb0_next += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 
29, 28}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); + vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); + vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec1 = __msa_binsli_b(vec2, vec3, 4); + vec4 = __msa_binsli_b(vec4, vec5, 2); + vec5 = __msa_binsli_b(vec6, vec7, 4); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); + vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); + vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); + vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); + vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); + vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); + vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); + vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec5 = __msa_binsli_b(vec5, vec6, 2); + vec1 = __msa_binsli_b(vec2, vec3, 5); + vec6 = __msa_binsli_b(vec7, vec8, 5); + vec1 = __msa_binsli_b(vec1, vec4, 0); + vec6 = __msa_binsli_b(vec6, vec9, 0); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void 
ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1; + v16u8 vec0, vec1; + v16u8 dst0; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); + vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); + src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); + src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); + vec0 = __msa_binsli_b(vec0, src0, 3); + vec1 = __msa_binsli_b(vec1, src1, 3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; + v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11; + v8u16 const_112 = (v8u16)__msa_ldi_h(112); + v8u16 const_74 = (v8u16)__msa_ldi_h(74); + v8u16 const_38 = (v8u16)__msa_ldi_h(38); + v8u16 const_94 = (v8u16)__msa_ldi_h(94); + v8u16 const_18 = (v8u16)__msa_ldi_h(18); + v8u16 const_32896 = (v8u16)__msa_fill_h(32896); + v16i8 zero = {0}; + + for (x = width; x > 0; x -= 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec10 = vec0 * const_18; + vec11 = vec1 * const_18; + vec8 = vec2 * const_94; + vec9 = vec3 * const_94; + vec6 = vec4 * const_112; + vec7 = vec5 * const_112; + vec0 *= const_112; + vec1 *= const_112; + vec2 *= const_74; + vec3 *= const_74; + vec4 *= const_38; + vec5 *= const_38; + vec8 += vec10; + vec9 += vec11; + vec6 += const_32896; + vec7 += const_32896; + vec0 += const_32896; + vec1 += const_32896; + vec2 += vec4; + vec3 += vec5; + vec0 -= vec2; + vec1 -= vec3; + vec6 -= vec8; + vec7 -= vec9; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); + vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + v8i16 zero = {0}; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + vec0 = 
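+    // Interleaving a byte with itself yields s * 0x0101 per halfword, so the
+    // widened product below, shifted right by 16, approximates (s * t) / 255.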
(v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_subs_u_b(src0, src2); + dst1 = __msa_subs_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 zero = {0}; + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); + vec4 = (v8u16)__msa_fill_h(vec0[3]); + vec5 = (v8u16)__msa_fill_h(vec0[7]); + vec6 = (v8u16)__msa_fill_h(vec1[3]); + vec7 = (v8u16)__msa_fill_h(vec1[7]); + vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec6 = (v8u16)__msa_fill_h(vec2[3]); + vec7 = (v8u16)__msa_fill_h(vec2[7]); + vec8 = (v8u16)__msa_fill_h(vec3[3]); + vec9 = (v8u16)__msa_fill_h(vec3[7]); + vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + reg0 = 
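+    // vec4..vec7 broadcast each pixel's alpha (times 0x0101); the widened
+    // products >> 24 give approximately channel * alpha / 255, and bmnz_v
+    // with 'mask' restores the original alpha bytes at the end.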
(v4u32)__msa_ilvr_h(zero, (v8i16)vec4); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); + reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); + reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); + reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32_t val = LW((int32_t*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + dst0 = 
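+    // The caller's 4-byte shuffle pattern was splatted and offset by
+    // {0,0,0,0,4,4,4,4,...} above, so a single vshf_b reorders four ARGB
+    // pixels at a time.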
(v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = 
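+    // Sepia rows: b' = (17b + 68g + 35r) >> 7, g' = (22b + 88g + 45r) >> 7,
+    // r' = (24b + 98g + 50r) >> 7. Only g'/r' get the 0xFF clamp below; the
+    // b' weights sum to 120 < 128, so that row cannot overflow 255.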
(v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = 
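+    // The 5-bit channels were widened as v8 = (v5 << 3) | (v5 >> 2), and the
+    // 1-bit alpha mapped to 0x00/0xFF by negation (reg3 = -reg3); the
+    // remaining interleaves rebuild the B,G,R,A byte order.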
(v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const 
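+    // RAW differs from RGB24 only in channel order, so 'mask' walks each
+    // 3-byte triplet back to front (2,1,0 vs 0,1,2) while splicing in the
+    // alpha byte from the 'alpha' vector.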
v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 dst0; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); + reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); + reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); + reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); + reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); + reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); + reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v4u32 res0, res1, res2, res3; + v16u8 dst0; + v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); + v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); + v8i16 const_0x1080 = __msa_fill_h(0x1080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 
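+    // With 5/6/5 bits expanded to 8, the luma below is the usual BT.601
+    // studio-swing form y = (25b + 129g + 66r + 0x1080) >> 8; 0x810019 packs
+    // the 129/25 halfword pair and 0x010042 contributes 66r plus the bias.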
= (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); + vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); + vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); + vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); + vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); + vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); + vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); + res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); + res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); + res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); + res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); + res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); + res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); + res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); + res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); + res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); + res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); + res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); + res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); + vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); + vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, 
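+    // RAWToYRow_MSA below runs the same pipeline with the byte-pair
+    // constants swapped (0x8142 and 0x19 instead of 0x8119 and 0x42) to
+    // match RAW's R,G,B memory order.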
(v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + vec2 += src2 & const_0x1F; + vec3 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = 
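+    // 2x2 averages of the 5-bit channels feed the standard fixed-point
+    // chroma: u = (112b - 74g - 38r + 0x8080) >> 8 and
+    // v = (112r - 94g - 18b + 0x8080) >> 8 (0x70/0x4A/0x26/0x5E/0x12).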
(v8u16)__msa_srai_h((v8i16)src3, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); + vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); + reg0 = vec6 * const_0x70; + reg1 = vec0 * const_0x4A; + reg2 = vec2 * const_0x70; + reg3 = vec0 * const_0x5E; + reg0 += const_0x8080; + reg1 += vec2 * const_0x26; + reg2 += const_0x8080; + reg3 += vec6 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x3F; + vec3 = src1 & const_0x3F; + vec2 += src2 & const_0x3F; + vec3 += src3 & const_0x3F; + vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + reg0 = vec3 * const_0x70; + reg1 = vec1 * const_0x4A; + reg2 = vec4 * const_0x70; + reg3 = vec1 * const_0x5E; + reg0 += 
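+    // Same chroma weights as the ARGB1555 path; green carries 6 bits here
+    // (0x3F mask) so its rescale step differs, and const_32896 is simply the
+    // 0x8080 bias written in decimal.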
const_32896; + reg1 += vec4 * const_0x26; + reg2 += const_32896; + reg3 += vec3 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = 
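+    // hadd_u_h above summed the two rows per channel; the pckev_d/pckod_d
+    // additions and srai_h(.., 2) below complete the 2x2 average before the
+    // chroma dot products.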
(v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h((v8i16)reg0, 2); + reg1 = __msa_srai_h((v8i16)reg1, 2); + reg2 = __msa_srai_h((v8i16)reg2, 2); + reg3 = __msa_srai_h((v8i16)reg3, 2); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = (v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = 
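+    // Mirrors RGB24ToUVRow_MSA; only the final pckev_h/pckod_h selections
+    // are swapped so the 112/74/38 weights land on the correct channels for
+    // RAW's R,G,B order.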
(v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h(reg0, 2); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_srai_h(reg2, 2); + reg3 = __msa_srai_h(reg3, 2); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + 
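+    // res0 interleaves B with R and res1 interleaves G with alpha; the
+    // ilvr/ilvl pair below emits libyuv ARGB's B,G,R,A memory order.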
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_uv += 8; + dst_argb += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_vu += 8; + dst_argb += 32; + } +} + +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx 
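+    // vec0 was the saturating |gx| + |gy| sum; the four vshf_b masks
+    // replicated it into B,G,R and pulled alpha (255) from lane 16.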
+= 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); + v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = 
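+    // BGRAToY/ABGRToY/RGBAToY all evaluate (66r + 129g + 25b + 0x1080) >> 8
+    // via ARGBTOY; only the packed coefficient order tracks the byte layout.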
(v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); + v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); + v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((const v16i8*)t, 48); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec0 = __msa_aver_u_b(src4, src6); + vec1 = __msa_aver_u_b(src5, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec2 = __msa_aver_u_b(src4, src6); + vec3 = __msa_aver_u_b(src5, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, 
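+    // Full-range (JPEG) chroma variant: 0x7F appears to be the 127 weight,
+    // with 0x6B14/0x2B54 packing the remaining paired weights; the bias is
+    // the same 0x8080.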
dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); + v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, src0, src1, src2, src3); + ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void I444ToARGBRow_MSA(const uint8_t* src_y, 
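+// 4:4:4 supplies chroma for every pixel, so this variant writes the
+// fixed-point conversion out inline rather than going through the shared
+// YUVTORGB macro used by the subsampled paths.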
+ const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8u16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg4 = reg0 + vec_br; + reg5 = reg1 + vec_br; + reg2 = reg0 + vec_bg; + reg3 = reg1 + vec_bg; + reg0 += vec_bb; + reg1 += vec_bb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg0 -= reg6 * vec_ub; + reg1 -= reg7 * vec_ub; + reg2 -= reg6 * vec_ug; + reg3 -= reg7 * vec_ug; + reg4 -= reg8 * vec_vr; + reg5 -= reg9 * vec_vr; + reg2 -= reg8 * vec_vg; + reg3 -= reg9 * vec_vg; + reg0 = __msa_srai_w(reg0, 6); + reg1 = __msa_srai_w(reg1, 6); + reg2 = __msa_srai_w(reg2, 6); + reg3 = __msa_srai_w(reg3, 6); + reg4 = __msa_srai_w(reg4, 6); + reg5 = __msa_srai_w(reg5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_u += 8; + src_v += 8; + dst_argb += 32; + } +} + +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(0x4A35); + v8i16 vec_ygb = __msa_fill_h(0xFB78); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, 
vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_yuy2 += 16; + dst_argb += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_uyvy += 16; + dst_argb += 32; + } +} + +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - 
y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = (v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + v4i32 dst0 = __builtin_msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_u, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, 
dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + vec0 += vec8; + vec1 += vec9; + vec2 += vec10; + vec3 += vec11; + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = 
(v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* 
src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((v16i8*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = 
__msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, 
(v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); + src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); + src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); + src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= 
mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/row_neon.cc b/libs/libvpx/third_party/libyuv/source/row_neon.cc index 909df060c6..ff87e74c62 100644 --- a/libs/libvpx/third_party/libyuv/source/row_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/row_neon.cc @@ -10,6 +10,8 @@ #include "libyuv/row.h" +#include <stdio.h> + #ifdef __cplusplus namespace libyuv { extern "C" { @@ -20,1446 +22,1311 @@ extern "C" { !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.32 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.32 {d2[1]}, [%2]! \n" - -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.16 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.16 {d2[1]}, [%2]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d2, d3 \n" +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" +#define READYUV400 \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 #define READNV21 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "vld2.8 {d0, d2}, [%0]! 
\n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READUYVY \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" -#define YUVTORGB_SETUP \ - MEMACCESS([kUVToRB]) \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - MEMACCESS([kUVToG]) \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - MEMACCESS([kYToRgb]) \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" +#define YUVTORGB_SETUP \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ - "vmovl.u8 q0, d0 \n" /* Y */\ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */\ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ +#define YUVTORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ + "vmovl.u8 q0, d0 \n" /* Y */ \ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV444 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, 
d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV444 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %5, %5, #8 \n" - MEMACCESS(3) - "vld1.8 {d23}, [%3]! \n" - MEMACCESS(4) - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV411 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB - MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -#define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -#define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ +#define ARGBTOARGB1555 \ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -#define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB4444 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "1: \n" + + READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV400 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV400 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), + [kUVToG] "r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d23, #255 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23" - ); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); } -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV21 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUY2 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READUYVY - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store U - MEMACCESS(2) - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load U - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! 
\n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "d0", "d1", "d2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); } -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0" - ); +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); } -// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0" - ); +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); } -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. 
- "mov r3, #-16 \n" - "add %0, %0, %2 \n" - "sub %0, #16 \n" +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); } -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // dst += 8 - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0" - ); + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); } -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); } -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); } -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); } -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); } -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); } -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ \ +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 
-#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); } -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); } -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); } -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); } -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! 
\n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d3}, [%2]! 
\n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); } -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); } -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); } -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { - asm volatile ( - "vdup.32 d2, %2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" - ARGBTORGB565 - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" - ); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); } -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); } -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); + asm volatile( + "vmov.u8 d4, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); } -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // 8x1 pixels. 
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" - ); -} - -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. - MEMACCESS(0) - "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. - "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. - - "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. - "vpadd.u16 d1, d8, d9 \n" // B - "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. - "vpadd.u16 d3, d10, d11 \n" // G - "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. 
- "vpadd.u16 d5, d12, d13 \n" // R - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %3, %3, #32 \n" // 32 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); } +// clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1468,17 +1335,13 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1490,9 +1353,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1507,8 +1368,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient @@ -1517,17 +1381,13 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1539,9 +1399,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1555,8 +1413,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1565,17 +1426,13 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1587,9 +1444,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1603,8 +1458,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1613,17 +1471,13 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1635,9 +1489,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1651,8 +1503,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1661,17 +1516,13 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. @@ -1683,9 +1534,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1699,8 +1548,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1709,17 +1561,13 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1731,9 +1579,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1747,8 +1593,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1757,17 +1606,13 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1779,9 +1624,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1796,875 +1639,815 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
- MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. 
- "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" - "99: \n" + "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" - ); + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); } // Attenuate 8 pixels at a time. 
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - MEMACCESS(0) - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" - ); + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 
255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - MEMACCESS(1) - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13" - ); + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. 
- "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q10", "q11", "q12", "q13", "q14", "q15" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", + "q14", "q15"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - "1: \n" - MEMACCESS(0) - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. 
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -2672,54 +2455,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2727,115 +2506,186 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! 
\n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top - MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom - MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(3) - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List - ); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left - MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right - MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(2) - "vst1.8 {d0}, [%2]! 
\n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); } -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +// %y passes a float as a scalar vector for vector * scalar multiply. +// the regoster must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. 
#ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/third_party/libyuv/source/row_neon64.cc b/libs/libvpx/third_party/libyuv/source/row_neon64.cc index 6375d4f55f..24b4520bab 100644 --- a/libs/libvpx/third_party/libyuv/source/row_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/row_neon64.cc @@ -19,118 +19,103 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - MEMACCESS(2) \ - "ld1 {v1.s}[1], [%2], #4 \n" - -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.h}[0], [%1], #2 \n" \ - MEMACCESS(2) \ - "ld1 {v2.h}[1], [%2], #2 \n" \ - "zip1 v1.8b, v2.8b, v2.8b \n" +#define READYUV422 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + "ld1 {v1.s}[1], [%2], #4 \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - MEMACCESS(2) \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" +#define READYUV444 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" +#define READYUV400 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV12 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV21 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READUYVY \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" -#define YUVTORGB_SETUP \ - "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" +#define YUVTORGB_SETUP \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + 
"ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v1.8h, v27.8h \n" \ - "mul v5.8h, v1.8h, v29.8h \n" \ - "mul v6.8h, v2.8h, v30.8h \n" \ - "mul v7.8h, v2.8h, v28.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ +#define YUVTORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB \ + ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG \ + ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR \ + ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB \ + ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG \ + ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR \ + ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB \ + ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG \ + ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -140,7 +125,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, READYUV444 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -157,10 +141,10 @@ void I444ToARGBRow_NEON(const uint8* src_y, ); } -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -170,7 +154,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 
{v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -187,11 +170,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, ); } -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -199,10 +182,8 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - MEMACCESS(3) "ld1 {v23.8b}, [%3], #8 \n" "subs %w5, %w5, #8 \n" - MEMACCESS(4) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -220,40 +201,10 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ); } -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV411 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - MEMACCESS(3) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} - -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -263,7 +214,6 @@ void I422ToRGBARow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -280,10 +230,10 @@ void I422ToRGBARow_NEON(const uint8* src_y, ); } -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -292,7 +242,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -309,97 +258,91 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ); } -#define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* 
yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } -#define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ +#define ARGBTOARGB1555 \ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } -#define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ - "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ +#define ARGBTOARGB4444 \ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -411,7 +354,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 - MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -428,9 +370,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -438,7 +378,6 @@ void I400ToARGBRow_NEON(const uint8* src_y, READYUV400 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -453,31 +392,26 @@ void I400ToARGBRow_NEON(const uint8* src_y, ); } -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - "movi v23.8b, #255 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v20.8b}, [%0], #8 \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - MEMACCESS(1) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23" - ); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); } -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -487,7 +421,6 @@ 
void NV12ToARGBRow_NEON(const uint8* src_y, READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -503,9 +436,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ); } -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -515,7 +448,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y, READNV21 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -531,24 +463,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ); } -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP "1: \n" READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 + "+r"(dst_rgb24), // %2 "+r"(width) // %3 : [kUVToRB]"r"(&yuvconstants->kUVToRB), [kUVToG]"r"(&yuvconstants->kUVToG), @@ -559,8 +489,59 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ); } -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB( + v22, v21, + v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + // RGB565. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -570,7 +551,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, READYUY2 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -585,8 +565,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ); } -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -596,7 +576,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, READUYVY YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -612,869 +591,819 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store U - MEMACCESS(2) - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. 
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load U - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); } -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 - "subs %w2, %w2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0" - ); -} - -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0" - ); -} - -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "rev64 v0.16b, v0.16b \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); -} - -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. 
- "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1" - ); + asm volatile( + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); } -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "rev64 v0.4s, v0.4s \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); } -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); } -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); } -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); } -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ - -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
+ "rev64 v0.16b, v0.16b \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); } -#define ARGB1555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ - \ - "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ - "xtn2 v3.16b, v2.8h \n" \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ - "dup v1.2D, v0.D[1] \n" \ - "dup v3.2D, v2.D[1] \n" +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %w3, %w3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 
-#define RGB555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ - "dup v1.2D, v0.D[1] \n" /* G */ \ +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -#define ARGB4444TOARGB \ - "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ - "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ - "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ - "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ - "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ - "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ - "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ - "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ - "dup v0.2D, v2.D[1] \n" \ - "dup v1.2D, v3.D[1] \n" +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- ARGB4444TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); } -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - MEMACCESS(1) - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); } -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
- "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
- "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); } -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "orr v2.8b, v1.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); } -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { - asm volatile ( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" - ARGBTORGB565 - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" - ); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); } -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB1555. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); } -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" - ); + asm volatile( + "movi v4.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB4444. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); } -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #15 \n" // B * 0.11400 coefficient - "movi v5.8b, #75 \n" // G * 0.58700 coefficient - "movi v6.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v24", "v25", "v26", "v27", "v28", "v29" - ); + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); } -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. - "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. - "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. - "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w3, %w3, #32 \n" // 32 processed per loop. - "mul v3.8h, v0.8h, v20.8h \n" // B - "mls v3.8h, v1.8h, v21.8h \n" // G - "mls v3.8h, v2.8h, v22.8h \n" // R - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "mul v4.8h, v2.8h, v20.8h \n" // R - "mls v4.8h, v1.8h, v24.8h \n" // G - "mls v4.8h, v0.8h, v23.8h \n" // B - "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format off +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1486,9 +1415,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1503,9 +1430,12 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. 
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 @@ -1514,12 +1444,10 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1531,9 +1459,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1547,18 +1473,19 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_bgra_1 = src_bgra + src_stride_bgra; +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1570,9 +1497,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_bgra), // %0 @@ -1586,18 +1511,19 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_abgr_1 = src_abgr + src_stride_abgr; +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1609,9 +1535,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_abgr), // %0 @@ -1625,18 +1549,19 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgba_1 = src_rgba + src_stride_rgba; +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1648,9 +1573,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgba), // %0 @@ -1664,18 +1587,19 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1687,9 +1611,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgb24), // %0 @@ -1703,18 +1625,19 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_raw_1 = src_raw + src_stride_raw; +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1726,9 +1649,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1743,699 +1664,656 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile ( - "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 - "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 - "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 - "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 - "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / + // 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v17.D[0] \n" - "ins v18.D[1], v19.D[0] \n" - "ins v20.D[1], v21.D[0] \n" + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v18.8h, #1 \n" - "urshr v6.8h, v20.8h, #1 \n" + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v16.8h, v4.8h, v22.8h \n" // B - "mls v16.8h, v5.8h, v23.8h \n" // G - "mls v16.8h, v6.8h, v24.8h \n" // R - "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned - "mul v17.8h, v6.8h, v22.8h \n" // R - "mls v17.8h, v5.8h, v26.8h \n" // G - "mls v17.8h, v4.8h, v25.8h \n" // B - "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27" - ); + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 
+ RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" - ); + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. 
- "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" - ); + ); } -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", - "v24", "v25", "v26", "v27" - ); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); } -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" - ); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" - // Blend 50 / 50. 
- "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.ge 8b \n" +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + // pixels + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" - "99: \n" + "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18" - ); + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - MEMACCESS(1) - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7" - ); + // 8 pixel loop. + "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. 
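// The gray value replaces all three color channels while alpha passes
// through: a scalar sketch would compute y once per pixel and store
// ARGB(a, y, y, y) (shorthand, not a libyuv helper), which the kernel
// does by copying v0 into v1 and v2 with orr before the st4.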
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movi v24.8b, #15 \n" // B * 0.11400 coefficient - "movi v25.8b, #75 \n" // G * 0.58700 coefficient - "movi v26.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. @@ -2443,194 +2321,180 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. 
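        // The nine movi constants at the top of this kernel form a 3x3
        // sepia matrix in 7-bit fixed point. Per pixel (scalar sketch;
        // uqshrn truncates and saturates, so there is no +64 rounding):
        //   new_b = (17 * b + 68 * g + 35 * r) >> 7;
        //   new_g = (22 * b + 88 * g + 45 * r) >> 7;
        //   new_r = (24 * b + 98 * g + 50 * r) >> 7;
        // Alpha is left untouched in v3.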
- "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - "1: \n" - MEMACCESS(0) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v22", "v23", "v24", "v25" - ); + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); } // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. 
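        // Per channel the loop computes a widening multiply and a rounded
        // narrowing shift (scalar sketch):
        //   dst = (uint8_t)((s0 * s1 + 128) >> 8);
        // approximating s0 * s1 / 255; the product of two bytes plus 128
        // still fits in 16 bits, so plain rshrn needs no saturation here.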
+ "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -2638,54 +2502,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1" - ); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2693,114 +2553,329 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. 
- "1: \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%5 \n" // top - MEMACCESS(0) - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - MEMACCESS(2) - "ld1 {v2.8b}, [%2],%5 \n" // bottom - MEMACCESS(2) - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(3) - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%4 \n" // left - MEMACCESS(1) - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" 
- MEMACCESS(0) - "ld1 {v2.8b}, [%0],%5 \n" // right - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } + +// Caveat - rounds float to half float whereas scaling version truncates. +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" // 
max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fmax; +} + +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fsum; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; +} + +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
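// GaussRow_NEON below is the horizontal half of the separable filter:
// GaussCol_NEON above accumulates 1,4,6,4,1 vertically into 32-bit sums
// (gain 16), and this pass applies the same taps across five adjacent
// sums, with uqrshrn removing the combined gain of 256. A scalar sketch
// of one output:
//   dst[i] = (uint16_t)((s[i] + 4 * s[i + 1] + 6 * s[i + 2] +
//                        4 * s[i + 3] + s[i + 4] + 128) >> 8);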
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/row_win.cc b/libs/libvpx/third_party/libyuv/source/row_win.cc index 2a3da8969f..5500d7f5a6 100644 --- a/libs/libvpx/third_party/libyuv/source/row_win.cc +++ b/libs/libvpx/third_party/libyuv/source/row_win.cc @@ -28,72 +28,71 @@ extern "C" { #if defined(_M_X64) // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. 
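// For orientation: YUVTORGB below is a 6-bit fixed-point conversion. Per
// channel it computes roughly (scalar sketch; ub, vb, yg and bias_b stand
// for values from the YuvConstants tables referenced by the macro, and
// clamp255 saturates to 0..255):
//   b = clamp255((bias_b - (u * ub + v * vb) + ((y * 0x0101 * yg) >> 16)) >> 6);
// READYUV422 supplies y duplicated into both bytes of a lane (the
// y * 0x0101 term) and u, v upsampled to one pair per pixel; g and r
// follow the same pattern with their own coefficients.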
-#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. 
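// STOREARGB is pure interleaving: xmm0/xmm1/xmm2/xmm5 hold planar B, G,
// R and A bytes, and two unpack levels merge them into packed pixels.
// The scalar equivalent per pixel i:
//   dst_argb[4 * i + 0] = b[i];
//   dst_argb[4 * i + 1] = g[i];
//   dst_argb[4 * i + 2] = r[i];
//   dst_argb[4 * i + 3] = a[i];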
-#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ - dst_argb += 32; - +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; #if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUV422 YUVTORGB(yuvconstants) @@ -104,15 +103,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, #endif #if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4, xmm5; - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 YUVTORGB(yuvconstants) @@ -127,175 +126,143 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. -static const vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. -static const vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; -static const vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static const vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -static const vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // vpshufb for vphaddw + vpackuswb packed to shorts. 
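// The kARGBTo* tables above are laid out for pmaddubsw: one B,G,R,A quad
// of coefficients repeated four times per vector. Combined with kAddY16
// and kAddUV128 they implement, roughly (scalar sketch, ignoring
// per-path rounding differences):
//   y = ((13 * b + 65 * g + 33 * r) >> 7) + 16;
//   u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
//   v = ((-18 * b - 94 * g + 112 * r) >> 8) + 128;
// The shuffle below serves the AVX2 variant of that UV computation.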
static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; // Constants for BGRA. -static const vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static const vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static const vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR. -static const vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static const vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static const vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static const vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static const vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static const vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static const uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static const uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. 
-static const uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; // Duplicates gray value 3 times and fills in alpha opaque. 
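// A scalar sketch of what the SSE2/AVX2 versions below expand to
// (reference only):
//   dst_argb32[i] = 0xff000000u | ((uint32_t)src_y[i] * 0x00010101u);
// each punpckl step doubles the gray byte until it fills B, G and R, and
// the 0xff000000 mask is generated with pcmpeqb + pslld instead of being
// loaded from memory.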
-__declspec(naked) -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 convertloop: @@ -318,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { +__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: @@ -348,13 +316,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { } #endif // HAS_J400TOARGBROW_AVX2 -__declspec(naked) -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB @@ -364,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -386,14 +355,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -__declspec(naked) -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int width) { +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB @@ -403,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, 
xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -425,11 +394,12 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, } } -__declspec(naked) -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 mov ecx, [esp + 12] // width movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 @@ -460,9 +430,9 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. -__declspec(naked) -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -470,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green psllw xmm4, 10 psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -516,9 +486,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -526,32 +496,32 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, 
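The multiplier comments above ("v * 256 + v * 8", "<< 5 * (256 + 4)") encode the standard bit-replication trick for widening 5- and 6-bit channels to 8 bits: replicate the top bits into the low bits. A scalar sketch of the same arithmetic, assuming nothing beyond what those comments state:

    /* 5-bit and 6-bit channel expansion as described above.
       Illustrative helpers, not part of the patch. */
    #include <stdint.h>

    static uint8_t expand5(uint8_t v) { /* 0..31 -> 0..255 */
      return (uint8_t)((v << 3) | (v >> 2)); /* == (v * (256 + 8)) >> 5 */
    }

    static uint8_t expand6(uint8_t v) { /* 0..63 -> 0..255 */
      return (uint8_t)((v << 2) | (v >> 4)); /* == (v * (256 + 4)) >> 6 */
    }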
mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green vpsllw ymm4, ymm4, 10 vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -567,9 +537,9 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -577,33 +547,33 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, 
ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -619,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f vmovd xmm4, eax vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles vpsrlw ymm3, ymm2, 4 vpsllw ymm1, ymm0, 4 vpor ymm2, ymm2, ymm3 vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm2, ymm2, 0xd8 vpunpckhbw ymm1, ymm0, ymm2 vpunpcklbw ymm0, ymm0, ymm2 @@ -657,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions -__declspec(naked) -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -667,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw 
xmm1, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB + por xmm1, xmm2 // RB movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) pand xmm2, xmm7 - por xmm0, xmm2 // AG + por xmm0, xmm2 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -710,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, } // 18 instructions. -__declspec(naked) -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f movd xmm4, eax pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles movdqa xmm1, xmm0 movdqa xmm3, xmm2 psllw xmm1, 4 @@ -748,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } } -__declspec(naked) -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -786,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } 
-__declspec(naked) -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -824,33 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -861,41 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* 
dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes + punpcklbw xmm6, xmm6 // make dither 16 bytes movdqa xmm7, xmm6 punpcklwd xmm6, xmm6 punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -907,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes vpermq ymm6, ymm6, 0xd8 vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // 
BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -950,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 // TODO(fbarchard): Improve sign extension/packing. -__declspec(naked) -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 + movdqa xmm5, xmm4 // generate mask 0x000003e0 pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 + movdqa xmm6, xmm4 // generate mask 0x00007c00 pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 @@ -991,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble psrld xmm0, 4 psrld xmm1, 8 por xmm0, xmm1 @@ -1021,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // 
generate mask 0x0000001f + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1058,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 vpslld ymm7, ymm7, 15 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA vpackssdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1098,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) 
{ +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble vpsrld ymm1, ymm1, 8 vpsrld ymm0, ymm0, 4 vpor ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1129,12 +1109,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToY movdqa xmm5, xmmword ptr kAddY16 @@ -1164,12 +1145,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -1200,17 +1182,16 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToY vbroadcastf128 ymm5, xmmword ptr kAddY16 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1244,12 +1225,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
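The Y rows here compute fixed-point BT.601 luma with pmaddubsw against kARGBToY plus the kAddY16 offset (both defined earlier in this file). A scalar sketch, assuming the coefficients of libyuv's C fallback; treat the exact values as an assumption:

    /* Scalar sketch of ARGBToY: studio-swing BT.601 luma
       (assumed coefficients, not quoted from this patch). */
    #include <stdint.h>

    static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
      /* 0x1080 = (16 << 8) + 128: the +16 luma offset plus rounding. */
      return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
    }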
-__declspec(naked) -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ vbroadcastf128 ymm5, xmmword ptr kAddYJ64 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1283,12 +1265,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kBGRAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1316,12 +1299,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kABGRToY movdqa xmm5, xmmword ptr kAddY16 @@ -1349,12 +1333,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kRGBAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1382,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1423,9 +1410,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 
pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1437,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1452,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUVJ128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1493,9 +1482,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1510,9 +1499,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm1, 8 packsswb xmm0, xmm1 - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1524,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - 
subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1558,9 +1549,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1574,9 +1565,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw vpaddb ymm0, ymm0, ymm5 // -> unsigned - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1590,24 +1581,26 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1624,9 +1617,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1641,9 +1634,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpermq ymm0, ymm0, 0xd8 // For vpacksswb vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1656,23 +1649,24 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUV444Row_SSSE3(const uint8* 
src_argb0, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U + /* convert to U and V */ + movdqu xmm0, [eax] // U movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1688,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, paddb xmm0, xmm5 movdqu [edx], xmm0 - movdqu xmm0, [eax] // V + movdqu xmm0, [eax] // V movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1713,24 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1754,9 +1750,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1768,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1783,24 +1779,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 
12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1824,9 +1822,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1838,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1853,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1894,9 +1894,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1908,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1925,109 +1925,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOYROW_SSSE3 // Read 16 UV from 
444 -#define READYUV444_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUVA422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16] \ - } - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 __asm { \ - __asm vmovd xmm0, dword ptr [esi] /* U */ \ - __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV12_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. 
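READYUV422_AVX2 above upsamples chroma by interleaving U with V (vpunpcklbw) and then duplicating each UV pair across two pixels (vpunpcklwd of the result with itself). The scalar equivalent, with hypothetical names:

    /* 4:2:2 -> 4:4:4 chroma upsample, as READYUV422_AVX2 does above.
       upsample_uv422 is an illustrative name. */
    #include <stdint.h>

    static void upsample_uv422(const uint8_t* u, const uint8_t* v,
                               uint8_t* uv, int width) { /* uv: 2*width bytes */
      for (int x = 0; x < width - 1; x += 2) {
        uint8_t cu = u[x / 2], cv = v[x / 2];
        uv[2 * x + 0] = cu; /* pixel x */
        uv[2 * x + 1] = cv;
        uv[2 * x + 2] = cu; /* pixel x + 1 reuses the same chroma pair */
        uv[2 * x + 3] = cv;
      }
    }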
-#define READNV21_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV21_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ +#define READYUY2_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ +#define READUYVY_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) __asm { \ +#define YUVTORGB_AVX2(YuvConstants) \ + __asm { \ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ @@ -2036,68 +2022,67 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ __asm vpsubw ymm1, ymm3, ymm1 \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ - __asm vpsubw ymm0, ymm3, ymm0 \ - /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ - __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ - __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ } // Store 16 ARGB values. 
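YUVTORGB_AVX2 above is a 6-bit fixed-point matrix multiply: the KUVBIAS* constants fold the -128 chroma offset and the luma offset into a single subtract, and the final vpsraw/vpackuswb pair shifts by 6 and saturates to 0..255. A scalar sketch using the common BT.601 integer coefficients; the real values come from the YuvConstants table and may differ in scaling:

    /* Scalar sketch of the YUV -> RGB matrix in YUVTORGB_AVX2
       (assumed BT.601 constants, illustrative names). */
    #include <stdint.h>

    static uint8_t clamp255(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void yuv_to_rgb(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
      int y1 = (y - 16) * 298 + 128;  /* 1.164 in 8.8 fixed point, rounded */
      *b = clamp255((y1 + 516 * (u - 128)) >> 8);
      *g = clamp255((y1 - 100 * (u - 128) - 208 * (v - 128)) >> 8);
      *r = clamp255((y1 + 409 * (v - 128)) >> 8);
    }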
-#define STOREARGB_AVX2 __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ +#define STOREARGB_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} // Store 16 RGBA values. -#define STORERGBA_AVX2 __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ +#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2119,21 +2104,21 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422ALPHATOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) -void I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2162,25 +2147,25 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
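STOREARGB_AVX2 above merges the packed B, G and R registers with the alpha register through two unpack rounds (byte pairs BG and RA, then word pairs), landing in the little-endian BGRA byte order that libyuv names ARGB. A scalar model with illustrative names:

    /* Scalar model of STOREARGB: libyuv "ARGB" is B,G,R,A in memory. */
    #include <stdint.h>

    static void store_argb(const uint8_t* b, const uint8_t* g,
                           const uint8_t* r, uint8_t a,
                           uint8_t* dst, int width) {
      for (int x = 0; x < width; ++x) {
        dst[4 * x + 0] = b[x];
        dst[4 * x + 1] = g[x];
        dst[4 * x + 2] = r[x];
        dst[4 * x + 3] = a; /* 0xff: the vpcmpeqb alpha set up by the callers */
      }
    }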
-__declspec(naked) -void I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV444_AVX2 YUVTORGB_AVX2(ebx) @@ -2198,64 +2183,24 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV411_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I411TOARGBROW_AVX2 - #ifdef HAS_NV12TOARGBROW_AVX2 // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV12_AVX2 @@ -2276,21 +2221,21 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
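NV12 and NV21 differ only in the order of the interleaved chroma plane: NV12 stores U,V pairs while NV21 stores V,U. The NV21 reader below therefore adds just one vpshufb against kShuffleNV21 to swap each pair into UV order before the shared YUVTORGB_AVX2 path. A scalar sketch of that swizzle (Nv21PairsToUv is a hypothetical name, not a libyuv function):

    #include <stdint.h>

    // Swap NV21's V,U byte pairs into the U,V order the conversion expects.
    static void Nv21PairsToUv(const uint8_t* src_vu, uint8_t* dst_uv, int pairs) {
      for (int i = 0; i < pairs; ++i) {
        dst_uv[2 * i + 0] = src_vu[2 * i + 1];  // U
        dst_uv[2 * i + 1] = src_vu[2 * i + 0];  // V
      }
    }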
-__declspec(naked) -void NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV21_AVX2 @@ -2311,18 +2256,18 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_YUY2TOARGBROW_AVX2 // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_AVX2( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUY2_AVX2 @@ -2342,18 +2287,18 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, #ifdef HAS_UYVYTOARGBROW_AVX2 // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READUYVY_AVX2 @@ -2373,25 +2318,25 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy, #ifdef HAS_I422TORGBAROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-__declspec(naked) -void I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2415,100 +2360,83 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, // Allows a conversion with half size scaling. // Read 8 UV from 444. -#define READYUV444 __asm { \ +#define READYUV444 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUV422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUVA422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8] \ - } - -// Read 2 UV from 411, upsample to 8 UV. -// drmemory fails with memory fault if pinsrw used. libyuv bug: 525 -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_EBX __asm { \ - __asm movzx ebx, word ptr [esi] /* U */ \ - __asm movd xmm0, ebx \ - __asm movzx ebx, word ptr [esi + edi] /* V */ \ - __asm movd xmm1, ebx \ - __asm lea esi, [esi + 2] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. 
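The READYUV422 macro above upsamples chroma by plain duplication: punpcklbw interleaves the 4 U and 4 V bytes into UV pairs, and punpcklwd then repeats each 16-bit pair so one chroma sample covers two luma samples. In scalar terms (UpsampleUv422 is an illustrative helper, not part of the library):

    #include <stdint.h>

    // Nearest-neighbor 4:2:2 chroma upsampling: duplicate each UV sample
    // across the two pixels it covers, as punpcklbw + punpcklwd do in-register.
    static void UpsampleUv422(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int uv_samples) {
      for (int i = 0; i < uv_samples; ++i) {
        dst_uv[4 * i + 0] = src_u[i];  // pixel 2i
        dst_uv[4 * i + 1] = src_v[i];
        dst_uv[4 * i + 2] = src_u[i];  // pixel 2i + 1
        dst_uv[4 * i + 3] = src_v[i];
      }
    }

READNV12, next, needs only the punpcklwd step, since its chroma is already interleaved.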
-#define READNV12 __asm { \ +#define READNV12 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. -#define READNV21 __asm { \ +#define READNV21 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) __asm { \ +#define YUVTORGB(YuvConstants) \ + __asm { \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm movdqa xmm3, xmm0 \ @@ -2522,129 +2450,125 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm psubw xmm2, xmm3 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. -#define STOREARGB __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 BGRA values. 
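STOREARGB above fixes the memory byte order: two rounds of unpacking weave the packed B, G, R registers and the all-0xff alpha register so that each pixel lands as B,G,R,A, i.e. a little-endian 0xAARRGGBB word. A scalar statement of the same contract (StoreArgb is an illustrative name):

    #include <stdint.h>

    // ARGB rows store bytes as B,G,R,A per pixel (0xAARRGGBB little-endian).
    static void StoreArgb(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                          uint8_t alpha, uint8_t* dst, int pixels) {
      for (int i = 0; i < pixels; ++i) {
        dst[4 * i + 0] = b[i];
        dst[4 * i + 1] = g[i];
        dst[4 * i + 2] = r[i];
        dst[4 * i + 3] = alpha;  // xmm5 holds 0xff in the opaque paths
      }
    }

The STOREBGRA and STORERGBA variants that follow permute the same four planes into the other byte orders.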
-#define STOREBGRA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ +#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGBA values. -#define STORERGBA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGB24 values. -#define STORERGB24 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24] \ - } + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} // Store 8 RGB565 values. 
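The 565 store below reduces 8:8:8 color to 16 bits using the three masks prepared by the RGB565 row functions (0x0000001f for blue, 0x000007e0 for green, 0xfffff800 for red). Per pixel it is the usual bit packing, sketched here in scalar form (PackRgb565 is an illustrative name):

    #include <stdint.h>

    // 8:8:8 -> 5:6:5: keep the top 5/6/5 bits of B/G/R and pack them.
    static uint16_t PackRgb565(uint8_t b, uint8_t g, uint8_t r) {
      return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }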
-#define STORERGB565 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16] \ - } + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV444 @@ -2663,19 +2587,19 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
-__declspec(naked) -void I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -2701,30 +2625,30 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) -void I422ToRGB565Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 psrld xmm6, 26 pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 convertloop: @@ -2744,25 +2668,25 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV422 @@ -2781,21 +2705,21 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
-__declspec(naked) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2819,63 +2743,23 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels. -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -// Similar to I420 but duplicate UV once more. -__declspec(naked) -void I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov edx, [esp + 16 + 16] // abgr - mov ebp, [esp + 16 + 20] // yuvconstants - mov ecx, [esp + 16 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV411_EBX - YUVTORGB(ebp) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV12 @@ -2893,21 +2777,21 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV21 @@ -2925,18 +2809,18 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
-__declspec(naked) -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUY2 @@ -2953,18 +2837,18 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, // 8 pixels. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_SSSE3( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READUYVY @@ -2979,19 +2863,19 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, } } -__declspec(naked) -void I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -3016,39 +2900,38 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) -void I400ToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) movd xmm3, eax pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y + punpcklbw xmm0, xmm0 // Y.Y pmulhuw xmm0, xmm2 psubusw xmm0, xmm3 psrlw xmm0, 6 - packuswb xmm0, xmm0 // G + packuswb xmm0, xmm0 // G - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 movdqu [edx], xmm0 @@ -3064,41 +2947,40 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) -void I400ToARGBRow_AVX2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) vmovd xmm3, eax vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 vpslld ymm4, ymm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 vmovdqu xmm0, [eax] lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y vpmulhuw ymm0, ymm0, ymm2 vpsubusw ymm0, ymm0, ymm3 vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels vpor ymm0, ymm0, ymm4 vpor ymm1, ymm1, ymm4 vmovdqu [edx], ymm0 @@ -3114,16 +2996,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; // TODO(fbarchard): Replace lea with -16 offset. 
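The I400ToARGBRow kernels above compute G = (y - 16) * 1.164 wholly in fixed point: punpcklbw widens Y to y * 0x0101 (roughly y << 8), pmulhuw by 18997 (0x4a35 = round(1.164 * 64 * 256)) leaves about 1.164 * 64 * y, psubusw removes 1160 (0x0488 = round(1.164 * 64 * 16)), and psrlw by 6 drops the 64x scale. A scalar restatement of the same steps (I400Gray is an illustrative name):

    #include <stdint.h>

    static uint8_t I400Gray(uint8_t y) {
      uint32_t wide = (uint32_t)y * 0x0101;                 // punpcklbw y,y
      uint32_t scaled = (wide * 18997) >> 16;               // pmulhuw, 0x4a35
      uint32_t biased = scaled > 1160 ? scaled - 1160 : 0;  // psubusw, 0x0488
      uint32_t g = biased >> 6;                             // psrlw 6
      return (uint8_t)(g > 255 ? 255 : g);                  // packuswb saturates
    }

As for kShuffleMirror just above: a single pshufb with descending indices is what lets MirrorRow_SSSE3 reverse 16 bytes per instruction.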
-__declspec(naked) -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, xmmword ptr kShuffleMirror @@ -3140,11 +3022,12 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vbroadcastf128 ymm5, xmmword ptr kShuffleMirror @@ -3164,17 +3047,17 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm1, xmmword ptr kShuffleMirrorUV @@ -3198,11 +3081,12 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width lea eax, [eax - 16 + ecx * 4] // last 4 pixels. @@ -3221,15 +3105,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -__declspec(naked) -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 @@ -3246,16 +3129,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3265,10 +3149,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes + pand xmm0, xmm5 // even bytes pand xmm1, xmm5 packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes + psrlw xmm2, 8 // odd bytes psrlw xmm3, 8 packuswb xmm2, xmm3 movdqu [edx], xmm0 @@ -3285,16 +3169,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3302,9 +3187,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm2, ymm0, 8 // odd bytes vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm0, ymm0, ymm5 // even bytes vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm2, ymm2, ymm3 @@ -3324,24 +3209,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // 
dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - movdqu xmm0, [eax] // read 16 U's + movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's lea eax, [eax + 16] movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs movdqu [edi], xmm0 movdqu [edi + 16], xmm2 lea edi, [edi + 32] @@ -3355,24 +3241,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vextractf128 [edi], ymm2, 0 // bytes 0..15 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 @@ -3388,13 +3275,14 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width test eax, 15 jne convertloopu test edx, 15 @@ -3426,13 +3314,14 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { +// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] @@ -3451,14 +3340,15 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1. 
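SplitUVRow and MergeUVRow above are inverse (de)interleaves of an NV12-style chroma plane: the split keeps even bytes with the 0x00ff00ff mask and odd bytes with an 8-bit shift, and the merge rebuilds the pairs with punpcklbw/punpckhbw. The scalar contracts, under hypothetical helper names:

    #include <stdint.h>

    static void SplitUv(const uint8_t* src_uv, uint8_t* dst_u,
                        uint8_t* dst_v, int width) {
      for (int i = 0; i < width; ++i) {
        dst_u[i] = src_uv[2 * i + 0];  // even bytes
        dst_v[i] = src_uv[2 * i + 1];  // odd bytes
      }
    }

    static void MergeUv(const uint8_t* src_u, const uint8_t* src_v,
                        uint8_t* dst_uv, int width) {
      for (int i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = src_u[i];
        dst_uv[2 * i + 1] = src_v[i];
      }
    }

CopyRow_ERMS, next, covers the same row contract for plain copies by deferring to rep movsb.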
-__declspec(naked) -void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, esi mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // width rep movsb mov edi, edx mov esi, eax @@ -3468,15 +3358,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3504,14 +3395,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vmovdqu ymm1, [eax] @@ -3533,11 +3425,12 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a mov ecx, [esp + 12] // width extractloop: @@ -3558,17 +3451,54 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void 
ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3598,14 +3528,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vpmovzxbd ymm1, qword ptr [eax] @@ -3628,17 +3559,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -// Write 'count' bytes using an 8 bit value repeated. -// Count should be multiple of 4. -__declspec(naked) -void SetRow_X86(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { __asm { - movzx eax, byte ptr [esp + 8] // v8 + movzx eax, byte ptr [esp + 8] // v8 mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. + mul edx // overwrites edx with upper part of result. mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // width shr ecx, 2 rep stosd mov edi, edx @@ -3646,28 +3576,28 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { } } -// Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) -void SetRow_ERMS(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // width rep stosb mov edi, edx ret } } -// Write 'count' 32 bit values. -__declspec(naked) -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { +// Write 'width' 32 bit values. 
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // width rep stosd mov edi, edx ret @@ -3676,12 +3606,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 @@ -3689,9 +3620,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm0, ymm0, ymm5 // even bytes are Y vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3702,18 +3633,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { } } -__declspec(naked) -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3723,18 +3656,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. 
vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3746,16 +3679,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3763,18 +3697,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3785,21 +3719,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) -void UYVYToYRow_AVX2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm0, ymm0, 8 // odd bytes are Y vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. 
vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3810,18 +3744,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3831,18 +3767,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3854,16 +3790,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3871,18 +3808,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. 
vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3895,21 +3832,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) -void YUY2ToYRow_SSE2(const uint8* src_yuy2, - uint8* dst_y, int width) { +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y + pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -3920,18 +3857,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3943,13 +3882,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3963,16 +3902,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3980,13 +3920,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3999,19 +3939,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, } } 
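The YUY2/UYVY subsampling rows above all reduce to the same scalar pattern: Y is every even byte of YUY2 (every odd byte of UYVY), and the ToUV variants average chroma from two adjacent rows with pavgb, which rounds up, before separating U and V. For YUY2 (Yuy2ToUvRowC is an illustrative name; UYVY differs only in the byte offsets):

    #include <stdint.h>

    // YUY2 stores Y0,U,Y1,V per 2 pixels; average chroma vertically, then split.
    static void Yuy2ToUvRowC(const uint8_t* row0, const uint8_t* row1,
                             uint8_t* dst_u, uint8_t* dst_v, int width) {
      for (int i = 0; i < width; i += 2) {
        const uint8_t* p0 = row0 + 2 * i;
        const uint8_t* p1 = row1 + 2 * i;
        *dst_u++ = (uint8_t)((p0[1] + p1[1] + 1) >> 1);  // pavgb rounds up
        *dst_v++ = (uint8_t)((p0[3] + p1[3] + 1) >> 1);
      }
    }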
-__declspec(naked) -void UYVYToYRow_SSE2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y + psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4022,18 +3962,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4045,13 +3987,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4065,16 +4007,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4082,13 +4025,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4108,13 +4051,15 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { __asm { push esi push edi - pcmpeqb xmm5, xmm5 // generate mask 
0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. movd xmm6, eax @@ -4123,8 +4068,8 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. movd xmm7, eax pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4132,17 +4077,17 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, sub edx, esi sub edi, esi - // 8 pixel loop. + // 8 pixel loop. convertloop8: - movq xmm0, qword ptr [esi] // alpha + movq xmm0, qword ptr [esi] // alpha punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a + pxor xmm0, xmm5 // a, 255-a movq xmm1, qword ptr [eax + esi] // src0 movq xmm2, qword ptr [edx + esi] // src1 punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 + psubb xmm1, xmm6 // bias src0/1 - 128 pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. + paddw xmm0, xmm7 // unbias result - 32768 and round. psrlw xmm0, 8 packuswb xmm0, xmm0 movq qword ptr [edi + esi], xmm0 @@ -4163,13 +4108,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { __asm { push esi push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 vpsllw ymm5, ymm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. vmovd xmm6, eax @@ -4177,8 +4124,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. vmovd xmm7, eax vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4186,23 +4133,23 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, sub edx, esi sub edi, esi - // 32 pixel loop. + // 32 pixel loop. convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a vmovdqu ymm1, [eax + esi] // src0 vmovdqu ymm2, [edx + esi] // src1 vpunpckhbw ymm4, ymm1, ymm2 vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 vpmaddubsw ymm3, ymm3, ymm4 vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. 
vpsrlw ymm3, ymm3, 8 vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm3 @@ -4221,52 +4168,51 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 + pcmpeqb xmm7, xmm7 // generate constant 0x0001 psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 sub ecx, 4 - jl convertloop4b // less than 4 pixels? + jl convertloop4b // less than 4 pixels? - // 4 pixel loop. + // 4 pixel loop. convertloop4: - movdqu xmm3, [eax] // src argb + movdqu xmm3, [eax] // src argb lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4276,26 +4222,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. 
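    // Note: the 1 pixel tail loop below repeats the 4 pixel loop's math
    // one pixel at a time.  With a = alpha of src_argb0, each channel is
    // dst = src_argb0 + ((src_argb1 * (256 - a)) >> 8), i.e. "source over"
    // with a premultiplied source, and dst alpha forced to 255.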
convertloop1: - movd xmm3, [eax] // src argb + movd xmm3, [eax] // src argb lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4311,41 +4257,42 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, }; static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 movdqa xmm4, xmmword ptr kShuffleAlpha0 movdqa xmm5, xmmword ptr kShuffleAlpha1 convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha lea eax, [eax + 16] pand xmm2, xmm3 psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha + por xmm0, xmm2 // copy original alpha movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4358,22 +4305,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 
int width) { #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; -__declspec(naked) -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpshufb ymm2, ymm0, ymm4 // low 4 alphas @@ -4398,40 +4346,40 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -__declspec(naked) -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width lea ebx, fixed_invtbl8 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 + punpcklbw xmm0, xmm0 // first 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a + pmulhuw xmm0, xmm2 // rgb * a - movdqu xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 + punpckhbw xmm1, xmm1 // next 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a + pmulhuw xmm1, xmm2 // rgb * a lea eax, [eax + 16] packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4450,25 +4398,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 
// USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a @@ -4488,50 +4435,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#else // USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width sub edx, eax lea ebx, fixed_invtbl8 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] // end of VPGATHER - vmovdqu ymm6, 
[eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a @@ -4540,7 +4487,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpackuswb ymm0, ymm0, ymm1 // unmutated. vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] sub ecx, 8 @@ -4558,12 +4505,13 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -4575,20 +4523,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes + packuswb xmm0, xmm0 // 8 G bytes movdqu xmm2, [eax] // A movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -4604,24 +4552,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static const vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static const vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
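A scalar sketch of the sepia mapping these tables encode (illustrative C, not part of the patch): each output channel is a weighted sum of the input B, G, R shifted down by 7, saturated to 8 bits, with alpha passing through.

#include <stdint.h>

static uint8_t clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

static void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    dst_argb[0] = clamp255((b * 17 + g * 68 + r * 35) >> 7);  // new B
    dst_argb[1] = clamp255((b * 22 + g * 88 + r * 45) >> 7);  // new G
    dst_argb[2] = clamp255((b * 24 + g * 98 + r * 50) >> 7);  // new R
    dst_argb += 4;  // alpha byte untouched
  }
}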
-__declspec(naked) -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ movdqa xmm2, xmmword ptr kARGBToSepiaB movdqa xmm3, xmmword ptr kARGBToSepiaG movdqa xmm4, xmmword ptr kARGBToSepiaR @@ -4633,32 +4577,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 movdqu [eax], xmm0 movdqu [eax + 16], xmm1 lea eax, [eax + 32] @@ -4674,19 +4618,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
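The color-matrix kernel below generalizes the sepia case: matrix_argb holds four rows of signed B, G, R, A coefficients, and each output channel is a signed dot product scaled by an arithmetic shift of 6 (psraw 6 in the asm). A hypothetical scalar equivalent, illustrative only:

#include <stdint.h>

static uint8_t clamp_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void ARGBColorMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_argb,
                                 const int8_t* m, int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
    for (int ch = 0; ch < 4; ++ch) {  // one matrix row per output channel
      int v = (b * m[ch * 4 + 0] + g * m[ch * 4 + 1] +
               r * m[ch * 4 + 2] + a * m[ch * 4 + 3]) >> 6;  // like psraw 6
      dst_argb[ch] = clamp_u8(v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}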
-__declspec(naked) -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ movdqu xmm5, [ecx] pshufd xmm2, xmm5, 0x00 pshufd xmm3, xmm5, 0x55 pshufd xmm4, xmm5, 0xaa pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ convertloop: movdqu xmm0, [eax] // B @@ -4697,31 +4642,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm6 lea eax, [eax + 32] @@ -4735,15 +4680,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). 
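A scalar sketch of the quantize step that follows (illustrative, not part of the patch): scale acts as a 16.16 fixed-point reciprocal of interval_size, so (v * scale) >> 16 recovers the bucket index, which is then rescaled and offset; alpha is preserved via the 0xff000000 mask.

#include <stdint.h>

static void ARGBQuantizeRow_C(uint8_t* dst_argb, int scale, int interval_size,
                              int interval_offset, int width) {
  for (int x = 0; x < width; ++x) {
    for (int ch = 0; ch < 3; ++ch) {  // B, G, R; alpha untouched
      int v = dst_argb[ch];
      dst_argb[ch] = (uint8_t)((v * scale >> 16) * interval_size +
                               interval_offset);
    }
    dst_argb += 4;
  }
}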
-__declspec(naked) -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ pshuflw xmm2, xmm2, 040h pshufd xmm2, xmm2, 044h pshuflw xmm3, xmm3, 040h @@ -4756,16 +4703,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 // * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4780,25 +4727,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -__declspec(naked) -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { +__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4814,28 +4762,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
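In the multiply kernel below, punpcklbw xmm0, xmm0 widens each byte a to the 16-bit value a * 257 (that is, a | a << 8), so pmulhuw computes (a * 257 * b) >> 16, which approximates a * b / 255 to within one. A scalar sketch (illustrative):

#include <stdint.h>

static void ARGBMultiplyRow_C(const uint8_t* src_argb0,
                              const uint8_t* src_argb1,
                              uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  // all four channels, alpha included
    uint32_t a = src_argb0[i];
    uint32_t b = src_argb1[i];
    dst_argb[i] = (uint8_t)(((a | (a << 8)) * b) >> 16);
  }
}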
-__declspec(naked) -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4853,13 +4802,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4867,11 +4817,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4882,11 +4832,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb0 lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 + movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4901,22 +4851,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
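ARGBAddRow above and the subtract kernel below are plain per-byte saturating arithmetic (paddusb/psubusb). A scalar sketch (illustrative):

#include <stdint.h>

static void ARGBAddRow_C(const uint8_t* src_argb0, const uint8_t* src_argb1,
                         uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int v = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8_t)(v > 255 ? 255 : v);  // saturate high
  }
}

static void ARGBSubtractRow_C(const uint8_t* src_argb0,
                              const uint8_t* src_argb1,
                              uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int v = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8_t)(v < 0 ? 0 : v);  // saturate low
  }
}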
-__declspec(naked) -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb0 - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4930,28 +4881,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4967,20 +4919,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4996,20 +4949,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5028,14 +4982,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5045,17 +5001,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5063,7 +5019,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, 
xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5084,13 +5040,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5098,17 +5055,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5116,7 +5073,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5137,36 +5094,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA por xmm0, xmm5 movdqu [edx], xmm1 movdqu [edx + 16], xmm2 @@ -5184,22 +5142,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 
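The Sobel kernels in the preceding hunks apply the 3x3 taps listed in their comments, take the absolute value, and saturate to 8 bits; the plane and ARGB writers then combine the two directions with a saturating add. A scalar sketch of the X direction (illustrative; Y is the symmetric case over row offsets instead of column offsets):

#include <stdint.h>
#include <stdlib.h>

static void SobelXRow_C(const uint8_t* src_y0, const uint8_t* src_y1,
                        const uint8_t* src_y2, uint8_t* dst_sobelx,
                        int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];  // top row taps    -1 0 1
    int b = src_y1[i] - src_y1[i + 2];  // middle row taps -2 0 2
    int c = src_y2[i] - src_y2[i + 2];  // bottom row taps -1 0 1
    int sobel = abs(a + b * 2 + c);
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}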
-__declspec(naked) -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely + paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -5217,36 +5176,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 + pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA punpcklbw xmm3, xmm5 punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS + movdqa xmm4, xmm1 // YS punpcklbw xmm4, xmm2 punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 movdqu [edx], xmm6 movdqu [edx + 16], xmm4 movdqu [edx + 32], xmm7 @@ -5275,8 +5235,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // count is number of averaged pixels to produce. // Does 4 pixels at a time. // This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, int count) { __asm { mov eax, topleft // eax topleft @@ -5294,18 +5257,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, cmp area, 128 // 128 pixels will not overflow 15 bits. 
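    // Note: the small-block path at s4 multiplies each box sum by the
    // reciprocal built below, (65536.0 + area - 1) * (1 / area), converted
    // to 0.16 fixed point and packed to 16-bit shorts, so the sums it
    // multiplies must fit in 15 bits: 128 pixels * 255 = 32640 < 2^15.
    // Larger areas branch to l4 below and average in float instead.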
ja l4 - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 psrld xmm6, 16 cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts - // 4 pixel loop small blocks. + // 4 pixel loop small blocks. s4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5345,9 +5308,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, jmp l4b - // 4 pixel loop + // 4 pixel loop l4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5373,7 +5336,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, paddd xmm3, [esi + edx * 4 + 48] lea esi, [esi + 64] - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area cvtdq2ps xmm1, xmm1 mulps xmm0, xmm4 mulps xmm1, xmm4 @@ -5397,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5422,8 +5385,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { __asm { mov eax, row mov edx, cumsum @@ -5437,7 +5402,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] @@ -5483,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. lea eax, [eax + 4] @@ -5505,10 +5470,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. 
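A scalar sketch of the affine copy that follows (illustrative, not part of the patch): uv_dudv packs the starting source coordinate (u, v) and the per-pixel step (du, dv); each destination pixel is fetched from the truncated (x, y), matching cvttps2dq in the asm.

#include <stdint.h>

static void ARGBAffineRow_C(const uint8_t* src_argb, int src_argb_stride,
                            uint8_t* dst_argb, const float* uv_dudv,
                            int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncate toward zero
    int y = (int)v;
    *(uint32_t*)(dst_argb) =
        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += uv_dudv[2];  // du
    v += uv_dudv[3];  // dv
  }
}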
-__declspec(naked) -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { __asm { push esi push edi @@ -5519,46 +5485,46 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride + shl esi, 16 // 4, stride add esi, 4 movd xmm5, esi sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 + movdqa xmm0, xmm2 // x0, y0, x1, y1 addps xmm0, xmm7 movlhps xmm2, xmm0 movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 + addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 movq qword ptr [edx], xmm1 movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 movd xmm6, [eax + esi] // read pixel 2 movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 movq qword ptr 8[edx], xmm6 lea edx, [edx + 16] sub ecx, 4 @@ -5568,12 +5534,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel movd [edx], xmm0 @@ -5590,15 +5556,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, 
[esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5607,7 +5574,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5634,14 +5601,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] @@ -5651,7 +5618,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5666,25 +5633,26 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, // Bilinear filter 16x2 -> 16x1 // TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. movd xmm0, eax // high fraction 0..255 neg eax @@ -5703,7 +5671,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5720,7 +5688,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -5731,7 +5699,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] movdqu [esi + edi], xmm0 @@ -5747,15 +5715,16 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
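The shuffle row is a per-pixel byte permutation: shuffler maps output channel ch to input byte shuffler[ch] within each 4-byte pixel, which is how one routine serves all of the BGRA/ABGR/RGBA/ARGB conversions named above. A scalar sketch (illustrative):

#include <stdint.h>

static void ARGBShuffleRow_C(const uint8_t* src_argb, uint8_t* dst_argb,
                             const uint8_t* shuffler, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}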
-__declspec(naked) -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width + mov ecx, [esp + 16] // width wloop: movdqu xmm0, [eax] @@ -5773,15 +5742,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // width wloop: vmovdqu ymm0, [eax] @@ -5801,152 +5771,36 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - __asm { - push ebx - push esi - mov eax, [esp + 8 + 4] // src_argb - mov edx, [esp + 8 + 8] // dst_argb - mov esi, [esp + 8 + 12] // shuffler - mov ecx, [esp + 8 + 16] // width - pxor xmm5, xmm5 - - mov ebx, [esi] // shuffler - cmp ebx, 0x03000102 - je shuf_3012 - cmp ebx, 0x00010203 - je shuf_0123 - cmp ebx, 0x00030201 - je shuf_0321 - cmp ebx, 0x02010003 - je shuf_2103 - - // TODO(fbarchard): Use one source pointer and 3 offsets. 
- shuf_any1: - movzx ebx, byte ptr [esi] - movzx ebx, byte ptr [eax + ebx] - mov [edx], bl - movzx ebx, byte ptr [esi + 1] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 1], bl - movzx ebx, byte ptr [esi + 2] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 2], bl - movzx ebx, byte ptr [esi + 3] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 3], bl - lea eax, [eax + 4] - lea edx, [edx + 4] - sub ecx, 1 - jg shuf_any1 - jmp shuf99 - - shuf_0123: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB - pshuflw xmm0, xmm0, 01Bh - pshufhw xmm1, xmm1, 01Bh - pshuflw xmm1, xmm1, 01Bh - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0123 - jmp shuf99 - - shuf_0321: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB - pshuflw xmm0, xmm0, 039h - pshufhw xmm1, xmm1, 039h - pshuflw xmm1, xmm1, 039h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0321 - jmp shuf99 - - shuf_2103: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA - pshuflw xmm0, xmm0, 093h - pshufhw xmm1, xmm1, 093h - pshuflw xmm1, xmm1, 093h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_2103 - jmp shuf99 - - shuf_3012: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB - pshuflw xmm0, xmm0, 0C6h - pshufhw xmm1, xmm1, 0C6h - pshuflw xmm1, xmm1, 0C6h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_3012 - - shuf99: - pop esi - pop ebx - ret - } -} - // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
// UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV + punpcklbw xmm0, xmm2 // YUYV punpckhbw xmm1, xmm2 movdqu [edi], xmm0 movdqu [edi + 16], xmm1 @@ -5960,30 +5814,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -5998,22 +5852,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
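+ // (Annotation, not in the upstream patch: each channel value x is zero
+ // extended to float and evaluated as C0 + C1*x + C2*x*x + C3*x*x*x with
+ // the coefficient vectors loaded from poly, then repacked to bytes with
+ // unsigned saturation.)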
convertloop: -// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel -// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6057,25 +5911,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 vbroadcastf128 ymm6, [ecx + 32] // C2 vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ // 2 pixel loop. convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats + vcvtdq2ps ymm0, ymm0 // X 8 floats vmulps ymm2, ymm0, ymm0 // X * X vmulps ymm3, ymm0, ymm7 // C3 * X vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X @@ -6095,16 +5949,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. + convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. 
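+ // (Annotation, not in the upstream patch: kExpBias is 2^-112 as a float.
+ // Multiplying by scale * 2^-112 rebases the float32 exponent (bias 127)
+ // onto the half-float exponent (bias 15; 127 - 15 = 112), so the 13-bit
+ // right shift below truncates the 23-bit mantissa to 10 bits and leaves a
+ // correctly biased half float without FP16 instructions. Inputs are
+ // non-negative 16-bit integers, so the sign bit is always clear.)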
+ vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx - 32], xmm2 + vmovdqu [eax + edx - 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Transform ARGB pixels with color table. -__declspec(naked) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6131,13 +6094,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Transform RGB pixels with color table. -__declspec(naked) -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6162,27 +6126,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Transform RGB pixels with luma table. -__declspec(naked) -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width, - const uint8* luma, uint32 lumacoeff) { +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff pshufd xmm2, xmm2, 0 pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop.
convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr + movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 phaddw xmm0, xmm0 pand xmm0, xmm4 // mask out low bits diff --git a/libs/libvpx/third_party/libyuv/source/scale.cc b/libs/libvpx/third_party/libyuv/source/scale.cc index 36e3fe5281..2cfa1c6cb1 100644 --- a/libs/libvpx/third_party/libyuv/source/scale.cc +++ b/libs/libvpx/third_party/libyuv/source/scale.cc @@ -33,17 +33,25 @@ static __inline int Abs(int v) { // This is an optimized version for scaling down a plane to 1/2 of // its original size. -static void ScalePlaneDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -51,46 +59,63 @@ static void ScalePlaneDown2(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : - ScaleRowDown2Box_Any_NEON); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : - ScaleRowDown2Box_NEON); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); } } #endif #if defined(HAS_SCALEROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 : - ScaleRowDown2Box_Any_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 : - ScaleRowDown2Box_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); } } #endif #if defined(HAS_SCALEROWDOWN2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : - ScaleRowDown2Box_Any_AVX2); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? 
ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : - ScaleRowDown2Box_AVX2); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); } } #endif -#if defined(HAS_SCALEROWDOWN2_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } } #endif @@ -105,18 +130,25 @@ static void ScalePlaneDown2(int src_width, int src_height, } } -static void ScalePlaneDown2_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_16_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : - ScaleRowDown2Box_16_C); + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -124,23 +156,17 @@ static void ScalePlaneDown2_16(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : - ScaleRowDown2_16_NEON; + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : - ScaleRowDown2Box_16_SSE2); - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); } #endif @@ -159,24 +185,30 @@ static void ScalePlaneDown2_16(int src_width, int src_height, // This is an optimized version for scaling down a plane to 1/4 of // its original size. 
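The Box row functions selected by these dispatchers compute a rounded block average per output pixel; the 1/2 path averages 2x2 blocks and the 1/4 path below extends the same idea to 4x4. A scalar sketch of the 2x2 kernel (assuming even dimensions; the _Ref name is illustrative, libyuv's scalar fallback is ScaleRowDown2Box_C):

#include <stddef.h>
#include <stdint.h>

// One output row from two input rows: each destination pixel is the mean of
// a 2x2 source block, with +2 providing round-to-nearest before the shift.
static void ScaleRowDown2Box_Ref(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;               // top source row
  const uint8_t* t = src_ptr + src_stride;  // bottom source row
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}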
-static void ScalePlaneDown4(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } @@ -184,8 +216,8 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } @@ -193,19 +225,20 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; } } #endif -#if defined(HAS_SCALEROWDOWN4_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } } #endif @@ -219,38 +252,36 @@ static void ScalePlaneDown4(int src_width, int src_height, } } -static void ScalePlaneDown4_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. 
src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : - ScaleRowDown4_16_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN4_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : - ScaleRowDown4_16_SSE2; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif @@ -265,18 +296,23 @@ static void ScalePlaneDown4_16(int src_width, int src_height, } // Scale plane down, 3/4 - -static void ScalePlaneDown34(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_C; @@ -305,6 +341,26 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -325,19 +381,6 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -346,8 +389,7 @@ static void ScalePlaneDown34(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -363,17 +405,23 @@ static void ScalePlaneDown34(int src_width, int src_height, } } -static void ScalePlaneDown34_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_C; @@ -404,19 +452,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -425,8 +460,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -442,7 +476,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } - // Scale plane, 3/8 // This is an optimized version for scaling down a plane to 3/8 // of its original size. @@ -458,18 +491,24 @@ static void ScalePlaneDown34_16(int src_width, int src_height, // ggghhhii // Boxes are 3x3, 2x3, 3x2 and 2x2 -static void ScalePlaneDown38(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; @@ -517,16 +556,23 @@ static void ScalePlaneDown38(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } } } #endif @@ -554,17 +600,23 @@ static void ScalePlaneDown38(int src_width, int src_height, } } -static void ScalePlaneDown38_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_C; @@ -595,19 +647,6 @@ static void ScalePlaneDown38_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -634,8 +673,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height, #define MIN1(x) ((x) < 1 ? 
1 : (x)) -static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -644,8 +683,8 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { return sum; } -static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -654,8 +693,12 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { return sum; } -static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -666,13 +709,18 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + *dst_ptr++ = + SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -684,22 +732,32 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, x += dx; boxwidth = MIN1((x >> 16) - ix); *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int scaleval = 65536 / boxheight; int i; + (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { *dst_ptr++ = src_ptr[i] * scaleval >> 16; } } -static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -710,8 +768,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, } } -static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -728,10 +790,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. 
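The 16.16 stepping described above is easy to see in isolation. A standalone example (illustrative only, not part of the patch): scaling 100 columns to 30 gives dx = (100 << 16) / 30, about 3.333 source pixels per output, so box widths alternate between 3 and 4 as the fractional bits accumulate; the real code derives boxwidth the same way and clamps it with MIN1.

#include <stdio.h>

int main(void) {
  int src_width = 100, dst_width = 30;
  int dx = (int)(((long long)src_width << 16) / dst_width);  // 16.16 step
  int x = 0, i;
  for (i = 0; i < dst_width; ++i) {
    int ix = x >> 16;  // first source column of this box
    x += dx;
    printf("dst %2d: src cols [%2d, %2d)\n", i, ix, x >> 16);
  }
  return 0;
}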
-static void ScalePlaneBox(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -739,18 +805,18 @@ static void ScalePlaneBox(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint16. + // Allocate a row buffer of uint16_t. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C: - ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = - ScaleAddRow_C; + const uint16_t* src_ptr, uint8_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; #if defined(HAS_SCALEADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleAddRow = ScaleAddRow_Any_SSE2; @@ -775,11 +841,19 @@ static void ScalePlaneBox(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint8* src = src_ptr + iy * src_stride; + const uint8_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -787,20 +861,24 @@ static void ScalePlaneBox(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row16, 0, src_width * 2); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16 *)(row16), src_width); + ScaleAddRow(src, (uint16_t*)(row16), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row16); } } -static void ScalePlaneBox_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -808,17 +886,17 @@ static void ScalePlaneBox_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint32. + // Allocate a row buffer of uint32_t. 
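+ // (Annotation, not in the upstream patch: the accumulator row is 32-bit
+ // because ScaleAddRow_16 sums uint16_t pixels down the box height, and a
+ // full-range 16-bit sample summed over more than one row no longer fits
+ // in 16 bits.)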
align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = - ScaleAddRow_16_C; + const uint32_t* src_ptr, uint16_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; #if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { @@ -829,7 +907,7 @@ static void ScalePlaneBox_16(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint16* src = src_ptr + iy * src_stride; + const uint16_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -837,10 +915,10 @@ static void ScalePlaneBox_16(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row32, 0, src_width * 4); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32 *)(row32), src_width); + ScaleAddRow(src, (uint32_t*)(row32), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row32); @@ -848,10 +926,14 @@ static void ScalePlaneBox_16(int src_width, int src_height, } // Scale plane down with bilinear interpolation. -void ScalePlaneBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -864,14 +946,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -898,16 +980,15 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif - #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_SSSE3; @@ -920,6 +1001,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } #endif if (y > max_y) { y = max_y; @@ -927,7 +1016,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -944,10 +1033,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, free_aligned_buffer_64(row); } -void ScalePlaneBilinearDown_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -960,14 +1053,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1002,15 +1095,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif - #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1023,13 +1107,13 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; - InterpolateRow((uint16*)row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx); + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); } dst_ptr += dst_stride; y += dy; @@ -1041,10 +1125,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } // Scale plane up with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1053,14 +1141,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = filtering ?
ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -1087,14 +1175,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1111,6 +1191,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; @@ -1126,13 +1214,13 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -1172,10 +1260,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } -void ScalePlaneBilinearUp_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1184,14 +1276,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1226,14 +1318,6 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; @@ -1257,13 +1341,13 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 4); - uint16* rowptr = (uint16*)row; + uint16_t* rowptr = (uint16_t*)row; int rowstride = kRowSize; int lasty = yi; @@ -1308,20 +1392,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScalePlaneSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = ScaleCols_C; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1340,20 +1428,24 @@ static void ScalePlaneSimple(int src_width, int src_height, } } -static void ScalePlaneSimple_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = ScaleCols_16_C; + void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. 
int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1366,8 +1458,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } @@ -1377,14 +1468,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height, // This function dispatches to a specialized scaler based on scale factor. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1403,46 +1498,42 @@ void ScalePlane(const uint8* src, int src_stride, if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } - // 3/8 rounded up for odd sized chroma height. + // 3/8 requires an exact ratio in both dimensions; no longer rounded up + // for odd sized chroma heights.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
         (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src, dst);
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+                  dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1455,19 +1546,23 @@ void ScalePlane(const uint8* src, int src_stride,
                         src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst);
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+                   dst_stride, src, dst);
 }
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                   int src_width, int src_height,
-                   uint16* dst, int dst_stride,
-                   int dst_width, int dst_height,
-                   enum FilterMode filtering) {
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1483,19 +1578,16 @@ void ScalePlane_16(const uint16* src, int src_stride,
     CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
     return;
   }
-  if (dst_width == src_width) {
+  if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical_16(src_height,
-                          dst_width, dst_height,
-                          src_stride, dst_stride, src, dst,
-                          0, 0, dy, 1, filtering);
+    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+                          dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
@@ -1508,15 +1600,14 @@ void ScalePlane_16(const uint16* src, int src_stride,
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
- if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1524,8 +1615,8 @@ void ScalePlane_16(const uint16* src, int src_stride, } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1538,132 +1629,110 @@ void ScalePlane_16(const uint16* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int 
src_height, - uint16* dst_y, int dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } // Deprecated api LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate) { - return I420Scale(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_width, src_height, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - dst_width, dst_height, - interpolate ? kFilterBox : kFilterNone); -} - -// Deprecated api -LIBYUV_API -int ScaleOffset(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_yoffset, - LIBYUV_BOOL interpolate) { - // Chroma requires offset to multiple of 2. 
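// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: I420Scale and I420Scale_16
// derive chroma geometry with SUBSAMPLE(v, 1, 1), a rounded-up halving for
// non-negative sizes, then scale Y, U and V independently. A 17x17 source
// therefore has 9x9 chroma planes, and scaling it to 8x8 uses 4x4 chroma.
// The deprecated ScaleOffset being deleted here located U and V inside a
// packed I420 buffer with the same half-plane products. I420ChromaDims is
// a hypothetical helper mirroring that arithmetic:
static void I420ChromaDims(int width, int height, int* half_w, int* half_h) {
  *half_w = (width + 1) >> 1;   // SUBSAMPLE(width, 1, 1) for width >= 0
  *half_h = (height + 1) >> 1;  // SUBSAMPLE(height, 1, 1) for height >= 0
}
// ---------------------------------------------------------------------------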
- int dst_yoffset_even = dst_yoffset & ~1; - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - int aheight = dst_height - dst_yoffset_even * 2; // actual output height - const uint8* src_y = src; - const uint8* src_u = src + src_width * src_height; - const uint8* src_v = src + src_width * src_height + - src_halfwidth * src_halfheight; - uint8* dst_y = dst + dst_yoffset_even * dst_width; - uint8* dst_u = dst + dst_width * dst_height + - (dst_yoffset_even >> 1) * dst_halfwidth; - uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + - (dst_yoffset_even >> 1) * dst_halfwidth; - if (!src || src_width <= 0 || src_height <= 0 || - !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || - dst_yoffset_even >= dst_height) { - return -1; - } - return I420Scale(src_y, src_width, - src_u, src_halfwidth, - src_v, src_halfwidth, - src_width, src_height, - dst_y, dst_width, - dst_u, dst_halfwidth, - dst_v, dst_halfwidth, - dst_width, aheight, - interpolate ? kFilterBox : kFilterNone); + return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_width, src_height, dst_y, dst_stride_y, + dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, + dst_height, interpolate ? kFilterBox : kFilterNone); } #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/scale_any.cc b/libs/libvpx/third_party/libyuv/source/scale_any.cc index ed76a9e4c0..53ad136404 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_any.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_any.cc @@ -20,184 +20,429 @@ extern "C" { // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols #define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - int dst_width, int x, int dx) { \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, \ - dst_width & MASK, x + n * dx, dx); \ - } + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif #ifdef HAS_SCALEARGBCOLS_NEON CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, 4, 3) +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) #endif #undef CANY // Fixed scale down. 
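// ---------------------------------------------------------------------------
// Illustrative expansion, not part of the patch: every SDANY instance
// defined below compiles to this shape -- run the SIMD kernel on the
// largest prefix it can handle, then finish the remainder with the C
// kernel. Expanded by hand for FACTOR = 2, BPP = 1, MASK = 15;
// ScaleRowDown2_SIMD is a hypothetical stand-in for the
// SSSE3/AVX2/NEON/MSA kernel.
#include <stddef.h>
#include <stdint.h>
extern void ScaleRowDown2_SIMD(const uint8_t*, ptrdiff_t, uint8_t*, int);
extern void ScaleRowDown2_C(const uint8_t*, ptrdiff_t, uint8_t*, int);
static void ScaleRowDown2_Any_Sketch(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride, uint8_t* dst_ptr,
                                     int dst_width) {
  int r = (int)((unsigned int)dst_width % 16);  // pixels past the last vector
  int n = dst_width - r;                        // multiple-of-16 prefix
  if (n > 0) {
    ScaleRowDown2_SIMD(src_ptr, src_stride, dst_ptr, n);
  }
  ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);  // remainder
}
// ---------------------------------------------------------------------------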
+// Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } #ifdef HAS_SCALEROWDOWN2_SSSE3 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C, - 2, 1, 15) -SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, 2, 1, 31) -SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, - 2, 1, 31) -SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C, - 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_C, 2, 1, 15) 
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) #endif #ifdef HAS_SCALEROWDOWN4_AVX2 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, - 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN4_NEON SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN34_SSSE3 -SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) #endif #ifdef HAS_SCALEROWDOWN34_NEON -SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) #endif #ifdef HAS_SCALEROWDOWN38_SSSE3 
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) #endif #ifdef HAS_SCALEROWDOWN38_NEON -SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) #endif #ifdef HAS_SCALEARGBROWDOWN2_SSE2 -SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, - ScaleARGBRowDown2_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, - ScaleARGBRowDown2Linear_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, - ScaleARGBRowDown2Box_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWN2_NEON -SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, - ScaleARGBRowDown2_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, - ScaleARGBRowDown2Linear_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, - ScaleARGBRowDown2Box_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #undef SDANY // Scale down by even scale factor. 
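// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: SDANY divides with % because
// several of its remainder masks (5, 11, 23, 47) are not of the form
// 2^k - 1, as the "use MOD" comment above says. The rewritten SDAANY below
// keeps MASK = 3 throughout, so plain bit masking works:
//   dst_width & ~MASK  -> largest multiple of (MASK + 1), for the SIMD kernel
//   dst_width & MASK   -> remainder, finished by the C kernel
// Worked example with dst_width = 21 ARGB pixels and MASK = 3: n = 20
// pixels go through SIMD, r = 1 through C.
// ---------------------------------------------------------------------------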
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ - src_stepx, dst_ptr + n * BPP, r); \ - } +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 -SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON -SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif // Add rows box filter scale down. 
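// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: ScaleAddRow_C, which the
// SAANY wrapper below dispatches to for leftover pixels, widens 8-bit
// source pixels into a 16-bit accumulator row; the box filter sums several
// source rows this way before averaging. Equivalent loop, without the
// unrolling of the real kernel:
#include <stdint.h>
static void ScaleAddRowSketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                              int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];  // widen and accumulate
  }
}
// ---------------------------------------------------------------------------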
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } #ifdef HAS_SCALEADDROW_SSE2 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) @@ -208,14 +453,12 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) #ifdef HAS_SCALEADDROW_NEON SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #endif +#ifdef HAS_SCALEADDROW_MSA +SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) +#endif #undef SAANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif - - - - - diff --git a/libs/libvpx/third_party/libyuv/source/scale_argb.cc b/libs/libvpx/third_party/libyuv/source/scale_argb.cc index 17f51ae9bf..53a22e8b41 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_argb.cc @@ -30,20 +30,31 @@ static __inline int Abs(int v) { // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. -static void ScaleARGBDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = - filtering == kFilterNone ? ScaleARGBRowDown2_C : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : - ScaleARGBRowDown2Box_C); - assert(dx == 65536 * 2); // Test scale factor of 2. + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + filtering == kFilterNone + ? ScaleARGBRowDown2_C + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C + : ScaleARGBRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { @@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height, #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : - ScaleARGBRowDown2Box_Any_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 + : ScaleARGBRowDown2Box_Any_SSE2); if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : - (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_SSE2 : - ScaleARGBRowDown2Box_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 + : ScaleARGBRowDown2Box_SSE2); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : - ScaleARGBRowDown2Box_Any_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON + : ScaleARGBRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : - ScaleARGBRowDown2Box_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON + : ScaleARGBRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); } } #endif @@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height, // ScaleARGB ARGB, 1/4 // This is an optimized version for scaling down a ARGB to 1/4 of // its original size. -static void ScaleARGBDown4Box(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + ScaleARGBRowDown2Box_C; // Advance to odd row, even column. src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - assert(dx == 65536 * 4); // Test scale factor of 4. + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
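// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: ScaleARGBDown4Box builds its
// 4x4 box filter from two 2x2 box passes. The loop below reduces two pairs
// of source rows into the two temporary rows allocated above (each still
// dst_width * 2 wide), then reduces those two rows once more into the
// destination. The asserts hold because ScaleARGB only dispatches here
// when dx == 4 << 16 and dy is a whole multiple of 4 << 16.
// ---------------------------------------------------------------------------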
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, - row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; @@ -137,38 +183,57 @@ static void ScaleARGBDown4Box(int src_width, int src_height, // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. -static void ScaleARGBDownEven(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int col_step = dx >> 16; int row_stride = (dy >> 16) * src_stride; - void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, - int src_step, uint8* dst_argb, int dst_width) = + void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, + int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + (void)src_width; + (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : - ScaleARGBRowDownEven_Any_SSE2; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 + : ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : - ScaleARGBRowDownEven_SSE2; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON : - ScaleARGBRowDownEven_Any_NEON; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON + : ScaleARGBRowDownEven_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : - ScaleARGBRowDownEven_NEON; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; } } #endif @@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height, } // Scale ARGB down with bilinear interpolation. 
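// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: the downscaler below computes
// its clipped source window in 64-bit arithmetic because
// x + (dst_width - 1) * dx can overflow 32 bits for large frames. The left
// edge xl is rounded down to a 4-pixel boundary ((xl >> 16) & ~3) and the
// right edge xr is rounded up past the second bilinear tap, so vectorized
// row reads stay inside [xl, xr) even at the borders.
// ---------------------------------------------------------------------------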
-static void ScaleARGBBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; - int64 xlast = x + (int64)(dst_width - 1) * dx; - int64 xl = (dx >= 0) ? x : xlast; - int64 xr = (dx >= 0) ? xlast : x; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. if (xr > src_width) { xr = src_width; @@ -234,12 +306,11 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(clip_src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif @@ -255,6 +326,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } #endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. @@ -267,7 +346,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -286,18 +365,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } // Scale ARGB up with bilinear interpolation. 
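// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: the upscaler below keeps a
// two-row ring buffer (rowptr with stride rowstride) of horizontally
// scaled rows and tracks lasty so each source row is column-filtered at
// most once; InterpolateRow then blends the two cached rows using the
// vertical fraction held in the low bits of y (an 8-bit weight taken from
// bits 8..15, assuming the usual libyuv convention).
// ---------------------------------------------------------------------------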
-static void ScaleARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -324,15 +410,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -347,6 +435,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -359,6 +455,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -375,13 +479,13 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -423,24 +527,27 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, #ifdef YUVSCALEUP // Scale YUV to ARGB up with bilinear interpolation. 
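// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: this YUVSCALEUP variant, which
// is compiled only when that macro is defined, converts each needed source
// row to ARGB with I422ToARGBRow into the scratch argb_row buffer and then
// reuses the same column-scale plus InterpolateRow machinery as the plain
// ARGB upscaler above.
// ---------------------------------------------------------------------------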
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, +static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, int src_stride_y, int src_stride_u, int src_stride_v, int dst_stride_argb, - const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int x, int dx, int y, int dy, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = + I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; @@ -465,19 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -502,19 +608,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -529,6 +637,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -541,6 +657,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -558,9 +682,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8* src_row_y = src_y + yi * src_stride_y; - const uint8* src_row_u = src_u + uv_yi * src_stride_u; - const uint8* src_row_v = src_v + uv_yi * src_stride_v; + const uint8_t* src_row_y = src_y + yi * src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; @@ -569,7 +693,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -635,15 +759,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScaleARGBSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; - void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; + (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBCols = ScaleARGBCols_SSE2; @@ -656,6 +788,14 @@ static void ScaleARGBSimple(int src_width, int src_height, ScaleARGBCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_MSA; + } + } #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; @@ -667,8 +807,8 @@ static void ScaleARGBSimple(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, + dx); dst_argb += dst_stride; y += dy; } @@ -677,11 +817,18 @@ static void ScaleARGBSimple(int src_width, int src_height, // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. -static void ScaleARGB(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +static void ScaleARGB(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -690,8 +837,7 @@ static void ScaleARGB(const uint8* src, int src_stride, int dy = 0; // ARGB does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. @@ -700,17 +846,17 @@ static void ScaleARGB(const uint8* src, int src_stride, src = src + (src_height - 1) * src_stride; src_stride = -src_stride; } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (clip_x) { - int64 clipf = (int64)(clip_x) * dx; + int64_t clipf = (int64_t)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 4; dst += clip_x * 4; } if (clip_y) { - int64 clipf = (int64)(clip_y) * dy; + int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * src_stride; dst += clip_y * dst_stride; @@ -725,24 +871,20 @@ static void ScaleARGB(const uint8* src, int src_stride, if (!(dx & 0x10000) && !(dy & 0x10000)) { if (dx == 0x20000) { // Optimized 1/2 downsample. - ScaleARGBDown2(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. 
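// ---------------------------------------------------------------------------
// Illustrative note, not part of the patch: ScaleARGB reads integer
// downscale factors straight from the 16.16 steps. For a factor k,
// dx == k << 16, so bit 16 (dx & 0x10000) is the low bit of k; when that
// bit is clear in both dx and dy the factor is even, and one of the calls
// below (1/2, 1/4 box, or the generic even stepper) handles it. Odd
// factors such as 3x, 5x, 7x are dispatched by the code that follows.
// ---------------------------------------------------------------------------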
-      ScaleARGBDown4Box(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy);
+      ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy);
       return;
     }
-    ScaleARGBDownEven(src_width, src_height,
-                      clip_width, clip_height,
-                      src_stride, dst_stride, src, dst,
-                      x, dx, y, dy, filtering);
+    ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+                      src_stride, dst_stride, src, dst, x, dx, y, dy,
+                      filtering);
     return;
   }
   // Optimized odd scale down. i.e. 3, 5, 7, 9x.
@@ -759,96 +901,105 @@ static void ScaleARGB(const uint8* src, int src_stride,
   }
   if (dx == 0x10000 && (x & 0xffff) == 0) {
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       clip_width, clip_height,
-                       src_stride, dst_stride, src, dst,
-                       x, y, dy, 4, filtering);
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, 4, filtering);
     return;
   }
   if (filtering && dy < 65536) {
-    ScaleARGBBilinearUp(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy, filtering);
+    ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
     return;
   }
   if (filtering) {
-    ScaleARGBBilinearDown(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+    ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
     return;
   }
-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                  src_stride, dst_stride, src, dst,
-                  x, dx, y, dy);
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                  dst_stride, src, dst, x, dx, y, dy);
 }
 
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
-      clip_x < 0 || clip_y < 0 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
       clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            clip_x, clip_y, clip_width, clip_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+            clip_height, filtering);
   return 0;
 }
 
 // Scale an ARGB image.
LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_argb || dst_width <= 0 || dst_height <= 0) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - 0, 0, dst_width, dst_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); return 0; } // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { - uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4); + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); int r; - I420ToARGB(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - argb_buffer, src_width * 4, - src_width, src_height); + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. 
+ (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); - r = ARGBScaleClip(argb_buffer, src_width * 4, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, - filtering); + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); free(argb_buffer); return r; } diff --git a/libs/libvpx/third_party/libyuv/source/scale_common.cc b/libs/libvpx/third_party/libyuv/source/scale_common.cc index 3507aa4d9f..b28d7da41f 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_common.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_common.cc @@ -28,9 +28,12 @@ static __inline int Abs(int v) { } // CPU agnostic row functions -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { - const uint16* s = src_ptr; +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -86,10 +98,12 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -103,10 +117,12 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int 
dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; dst_width -= 1; for (x = 0; x < dst_width - 1; x += 2) { @@ -125,10 +141,12 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, dst[0] = (s[0] + t[0] + 1) >> 1; } -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - 
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void 
ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -269,19 +303,21 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -291,19 +327,21 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -314,19 +352,21 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = 
(s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -336,19 +376,21 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Scales a single row of pixels up by 2x using point sampling. 
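// Editorial sketch (illustrative, not part of the upstream patch): the
// ScaleCols_C / ScaleCols_16_C loops above walk the source row with a 16.16
// fixed-point position; x >> 16 selects the source sample and x advances by
// dx for every output pixel. A minimal scalar model, names hypothetical
// (requires <stdint.h>):
static void PointSampleRow(const uint8_t* src, uint8_t* dst, int dst_width,
                           int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part picks the source pixel.
    x += dx;                // dx is the 16.16 step, e.g. FixedDiv(sw, dw).
  }
}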
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -418,16 +476,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, // (1-f)a + fb can be replaced with a + f(b-a) #if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) #else -// inteluses 7 bit math with rounding. -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +// Intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) #endif -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -450,12 +511,15 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -468,7 +532,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -476,12 +540,15 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, } #undef BLENDER -// Same as 8 bit arm blender but return is cast to uint16 -#define BLENDER(a, b, f) (uint16)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -504,12 +571,15 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } } -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = 
(int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -522,7 +592,7 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -530,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, } #undef BLENDER -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -543,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -557,100 +633,118 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { + uint16_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) 
>> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } // 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +void ScaleAddRow_C(const 
uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -664,7 +758,9 @@ void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } } -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -678,13 +774,14 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { } } -void ScaleARGBRowDown2_C(const uint8* src_argb, +void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); - + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[1]; dst[1] = src[3]; @@ -696,10 +793,12 @@ void ScaleARGBRowDown2_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Linear_C(const uint8* src_argb, +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8_t* dst_argb, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width; ++x) { dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; @@ -710,29 +809,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += 8; dst_argb += 4; } } -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); - + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + (void)src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[0]; @@ -745,30 +852,38 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, } } -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) 
{ + uint8_t* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += src_stepx * 4; dst_argb += 4; } } // Scales a single row of pixels using point sampling. -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -782,11 +897,14 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, } } -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -801,11 +919,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst[1] = dst[0] = src[0]; src += 1; @@ -818,23 +941,26 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. 
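// Editorial sketch (illustrative, not part of the upstream patch): the
// filtering column scalers blend two neighbouring pixels with a 7-bit
// fraction f = (x >> 9) & 0x7f, computing a + f * (b - a) / 128 with
// rounding, which is what the BLENDER macros below encode per channel.
// A scalar model, name hypothetical:
static uint8_t Blend7(uint8_t a, uint8_t b, int f) {
  // f is 0..127; f == 64 lands halfway between a and b.
  return (uint8_t)(a + ((f * (b - a) + 0x40) >> 7));
}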
// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) (uint32)( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -848,23 +974,26 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, if (dst_width & 1) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -876,10 +1005,10 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, dst += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } @@ -889,16 +1018,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, // Scale plane vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; int j; assert(bpp >= 1 && bpp <= 4); @@ -930,13 +1065,11 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif @@ -948,23 +1081,29 @@ void ScalePlaneVertical(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_bytes, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); dst_argb += dst_stride; y += dy; } } void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; + void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(wpp >= 1 && wpp <= 2); @@ -1003,16 +1142,6 @@ void ScalePlaneVertical_16(int src_height, InterpolateRow = InterpolateRow_16_NEON; } } -#endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } #endif for (j = 0; j < dst_height; ++j) { int yi; @@ -1022,16 +1151,18 @@ void ScalePlaneVertical_16(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_words, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_words, yf); dst_argb += dst_stride; y += dy; } } // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering) { if (src_width < 0) { src_width = -src_width; @@ -1073,22 +1204,26 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div) { - return (int)(((int64)(num) << 16) / div); + return (int)(((int64_t)(num) << 16) / div); } // Divide num by div and return as 16.16 fixed point result. 
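// Editorial note (worked example, not part of the upstream patch): FixedDiv_C
// above builds the 16.16 step the scalers iterate with. For a 640 -> 360
// scale: dx = (640 << 16) / 360 = 116508, roughly 1.7778 * 65536, so every
// output pixel advances the source position by about 1.7778 source pixels;
// x >> 16 is the integer sample and x & 0xffff the blend fraction.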
int FixedDiv1_C(int num, int div) { - return (int)((((int64)(num) << 16) - 0x00010001) / - (div - 1)); + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); } #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy) { + int* x, + int* y, + int* dx, + int* dy) { assert(x != NULL); assert(y != NULL); assert(dx != NULL); @@ -1120,7 +1255,7 @@ void ScaleSlope(int src_width, int src_height, *x = 0; } if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); + *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_height > 1) { *dy = FixedDiv1(src_height, dst_height); @@ -1153,6 +1288,35 @@ void ScaleSlope(int src_width, int src_height, } #undef CENTERSTART +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/libs/libvpx/third_party/libyuv/source/scale_gcc.cc b/libs/libvpx/third_party/libyuv/source/scale_gcc.cc index e2f88544b7..312236d2df 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_gcc.cc @@ -21,1296 +21,1348 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
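// Editorial note (worked example, not part of the upstream patch): the new
// ScaleRowUp2_16_C above is a 2x bilinear upsample reading two source rows.
// Outputs sit at quarter-pixel offsets, so the 2-D weights are
// (3/4)(3/4) = 9/16, (3/4)(1/4) = 3/16, (1/4)(3/4) = 3/16 and
// (1/4)(1/4) = 1/16, hence the 9,3,3,1 taps with +8 rounding before >> 4.
// For p0=100, p1=200, p2=100, p3=200:
//   (100*9 + 200*3 + 100*3 + 200 + 8) >> 4 = 2008 >> 4 = 125,
// the exact bilinear mix of those four samples.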
-static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // GCC versions of row functions are verbatim conversions from Visual C. 
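// Editorial sketch (illustrative, not part of the upstream patch): the
// kShuf* / kMadd* constants above feed pshufb / pmaddubsw in the SSSE3 3/4
// downscalers; the shuffles gather adjacent source byte pairs and the madd
// weights apply the 3,1 / 2,2 / 1,3 taps, with kRound34 supplying the +2
// bias before the >> 2 (note (2a + 2b + 2) >> 2 equals the (a + b + 1) >> 1
// midpoint used by the C code). One weighted pair in scalar form, name
// hypothetical:
static uint16_t WeightedPair34(uint8_t a, uint8_t b, int wa, int wb) {
  // pmaddubsw multiplies unsigned bytes by signed weights and sums each
  // adjacent pair into a signed 16-bit lane.
  return (uint16_t)(a * wa + b * wb);
}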
// Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - 
"paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), 
// %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld 
$0x10,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { intptr_t stridex3; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw 
%%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } - #ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu 
(%0,%4,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" 
+ "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq 
%%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + 
"packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x8,1) " \n" - "lea " MEMLEA(0xc,1) ",%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + 
"paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + 
"+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } - #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n" - "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 // Constant for making pixels signed to avoid pmaddubsw // saturation. 
-static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2," MEMACCESS(0) " \n" - "lea " MEMLEA(0x2,0) ",%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. 
- "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2," MEMACCESS(0) " \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 #if defined(__x86_64__) - "+rm"(dst_width) // %5 + "+rm"(dst_width) // %5 #else - "+m"(dst_width) // %5 + "+m"(dst_width) // %5 #endif - : "rm"(x), // %6 - "rm"(dx), // %7 + : "rm"(x), // %6 + "rm"(dx), // %7 #if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 #else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 #endif - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + 
"movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. 
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - LABELALIGN - "1: \n" - "movd " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 - MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + (void)src_stride; + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. 
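In the box variant each output pixel averages a 2x2 block: pavgb first blends the two rows, then, after shufps splits even and odd pixels apart, blends horizontally. A scalar model of one output pixel that matches this rounding (sketch, helper name hypothetical; note the cascaded pavgb can differ from the exact (a+b+c+d+2)/4 by one):

#include <stdint.h>

static void BoxAvgARGBModel(const uint8_t* row0, const uint8_t* row1,
                            uint8_t* dst) {
  int i;
  for (i = 0; i < 4; ++i) {  /* B, G, R, A channels */
    int v0 = (row0[i] + row1[i] + 1) >> 1;      /* vertical pavgb */
    int v1 = (row0[i + 4] + row1[i + 4] + 1) >> 1;
    dst[i] = (uint8_t)((v0 + v1 + 1) >> 1);     /* horizontal pavgb */
  }
}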
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 - MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 - MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "movq " MEMACCESS(5) ",%%xmm2 \n" - MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 - MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 - MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" - LABELALIGN - "40: \n" - 
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 - MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x8,2) ",%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
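ColsUp2 ignores x and dx (hence the (void) casts added below) and simply writes each source pixel twice; punpckldq/punpckhdq interleave every 32-bit pixel with itself. A scalar model (sketch, helper name hypothetical):

#include <stdint.h>
#include <string.h>

void ScaleARGBColsUp2Model(uint8_t* dst_argb, const uint8_t* src_argb,
                           int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 2) {
    /* duplicate one ARGB pixel into two adjacent outputs */
    memcpy(dst_argb + 4 * x, src_argb + 4 * (x / 2), 4);
    memcpy(dst_argb + 4 * (x + 1), src_argb + 4 * (x / 2), 4);
  }
}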
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; // Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "psrlw $0x9,%%xmm1 \n" - MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(0) " \n" + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" - LABELALIGN - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } // Divide num by div and return as 16.16 fixed point result. 
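FixedDiv_X86 forms the 64-bit dividend num << 16 in edx:eax (cdq sign-extends, then shld/shl shift the pair left by 16) so a single 32-bit idiv yields the 16.16 fixed-point quotient. A portable model with a worked value (sketch only, helper name hypothetical):

#include <stdint.h>

static int FixedDivModel(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
/* Example: FixedDivModel(640, 1280) == 0x8000, i.e. 0.5 in 16.16 fixed
 * point -- the x step used to scale a 1280-wide row down to 640. */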
int FixedDiv_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } diff --git a/libs/libvpx/third_party/libyuv/source/scale_mips.cc b/libs/libvpx/third_party/libyuv/source/scale_mips.cc deleted file mode 100644 index ae953073fa..0000000000 --- a/libs/libvpx/third_party/libyuv/source/scale_mips.cc +++ /dev/null @@ -1,644 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC MIPS DSPR2 -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - // TODO(fbarchard): Use odd pixels instead of even. 
- "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| - "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| - "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| - "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t8, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t1, 8(%[dst]) \n" - "sw $t2, 12(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 16 \n" - - "2: \n" - "andi $t9, %[dst_width], 0xf \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t0, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 2 \n" - "addiu $t9, $t9, -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* t = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 - "bltz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 0(%[t]) \n" // |19|18|17|16| - "lw $t5, 4(%[t]) \n" // |23|22|21|20| - "lw $t6, 8(%[t]) \n" // |27|26|25|24| - "lw $t7, 12(%[t]) \n" // |31|30|29|28| - "addiu $t9, $t9, -1 \n" - "srl $t8, $t0, 16 \n" // |X|X|3|2| - "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| - "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| - "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| - "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| - "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 - "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 - "srl $t8, $t1, 16 \n" // |X|X|7|6| - "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| - "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| - "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| - "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| - "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 - "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 - "srl $t8, $t2, 16 \n" // |X|X|11|10| - "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| - "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| - "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| - "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| - "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 - "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 - "srl $t8, $t3, 16 \n" // |X|X|15|14| - "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| - "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| - "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| - "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| - "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 - "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 - "addiu %[src_ptr], %[src_ptr], 16 \n" - "addiu %[t], %[t], 16 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "sb $t1, 2(%[dst]) \n" - "sb $t5, 3(%[dst]) \n" - "sb $t2, 4(%[dst]) \n" - "sb $t6, 5(%[dst]) \n" - "sb $t3, 6(%[dst]) \n" - "sb $t7, 7(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 0x7 \n" // x = residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lwr $t1, 0(%[src_ptr]) \n" - "lwl $t1, 3(%[src_ptr]) \n" - "lwr $t2, 0(%[t]) \n" - "lwl $t2, 3(%[t]) \n" - "srl $t8, $t1, 16 \n" - "ins $t1, $t2, 16, 16 \n" - "ins $t2, $t8, 0, 16 \n" - "raddu.w.qb $t1, $t1 \n" - "raddu.w.qb $t2, $t2 \n" - "shra_r.w $t1, $t1, 2 \n" - "shra_r.w $t2, $t2, 2 \n" - "sb $t1, 0(%[dst]) \n" - "sb $t2, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -2 
\n" - "addiu %[t], %[t], 4 \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 2 \n" - - "3: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), [t] "+r" (t) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| - "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| - "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| - "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| - "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| - "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t5, 4(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 7 \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t1, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -1 \n" - "sb $t1, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - const uint8* s2 = s1 + stride; - const uint8* s3 = s2 + stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 1 \n" - "andi $t8, %[dst_width], 1 \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // |15|14|13|12| - "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 4(%[s1]) \n" // |23|22|21|20| - "lw $t6, 4(%[s2]) \n" // |27|26|25|24| - "lw $t7, 4(%[s3]) \n" // |31|30|29|28| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| - "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| - "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| - "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "add $t4, $t4, $t5 \n" - "add $t6, $t6, $t7 \n" - "add $t4, $t4, $t6 \n" - "shra_r.w $t0, $t0, 4 \n" - "shra_r.w $t4, $t4, 4 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[s3], %[s3], 8 \n" - "addiu $t9, $t9, -1 \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 2 \n" - "beqz $t8, 2f \n" - " nop \n" - - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // 
|15|14|13|12| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "shra_r.w $t0, $t0, 4 \n" - "sb $t0, 0(%[dst]) \n" - - "2: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [s1] "+r" (s1), - [s2] "+r" (s2), - [s3] "+r" (s3) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); -} - -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| - "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| - "addiu %[dst_width], %[dst_width], -24 \n" - "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| - "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| - "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| - "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| - "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| - "prepend $t1, $t2, 8 \n" // |4|3|1|0| - "prepend $t3, $t4, 24 \n" // |15|13|12|11| - "prepend $t5, $t6, 8 \n" // |20|19|17|16| - "prepend $t7, $t8, 24 \n" // |31|29|28|27| - "sw $t1, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t3, 8(%[dst]) \n" - "sw $t5, 12(%[dst]) \n" - "sw $t9, 16(%[dst]) \n" - "sw $t7, 20(%[dst]) \n" - "bnez %[dst_width], 1b \n" - " addiu %[dst], %[dst], 24 \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); -} - -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "repl.ph $t3, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| - "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t2, $t2, $t4 \n" - "addu.ph $t6, $t6, $t5 \n" - "sll $t5, $t0, 1 \n" - "add $t0, $t5, $t0 \n" - "shra_r.ph $t2, $t2, 2 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shll.ph $t4, $t2, 1 \n" - "addq.ph $t4, $t4, $t2 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.w $t0, $t0, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "shra_r.ph $t6, $t6, 2 \n" - "srl $t1, $t6, 16 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 
3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); -} - -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "repl.ph $t2, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| - "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t4, $t4, $t3 \n" - "addu.ph $t6, $t6, $t5 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shra_r.ph $t4, $t4, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.ph $t6, $t6, 1 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "shra_r.w $t0, $t0, 1 \n" - "srl $t1, $t6, 16 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); -} - -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t6, $t6 \n" // |26|27|24|25| - "srl $t0, $t0, 8 \n" // |X|2|3|0| - "srl $t3, $t3, 16 \n" // |X|X|15|14| - "srl $t5, $t5, 16 \n" // |X|X|23|22| - "srl $t7, $t7, 16 \n" // |X|X|31|30| - "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| - "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| - "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| - "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| - "prepend $t2, $t3, 24 \n" // |X|15|14|11| - "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| - "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu %[dst_width], %[dst_width], -12 \n" - "addiu $t8,%[dst_width], -12 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t4, 4(%[dst]) \n" - "sw $t6, 8(%[dst]) \n" - "bgez $t8, 1b \n" - " addiu %[dst], %[dst], 12 \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); -} - -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stride = src_stride; - const uint8* t = src_ptr + stride; - const int c = 0x2AAA; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw 
$t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| - "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 - "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 - "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| - "srl $t4, $t4, 2 \n" // t4 / 4 - "srl $t6, $t6, 16 \n" // |0|0|S3|T3| - "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 - "addu $t6, $t5, $t6 \n" - "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 - "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 - "addu $t0, $t0, $t2 \n" - "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[t], %[t], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t4, -1(%[dst_ptr]) \n" - "sb $t6, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [t] "+r" (t), - [dst_width] "+r" (dst_width) - : [c] "r" (c) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); -} - -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - stride += stride; - const uint8* s2 = src_ptr + stride; - const int c1 = 0x1C71; - const int c2 = 0x2AAA; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| - "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| - "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| - "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 - "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 - "sll $t8, $t5, 16 \n" // |R5|R4|0|0| - "raddu.w.qb $t8, $t8 \n" // R5+R4 - "addu $t7, $t7, $t8 \n" - "srl $t8, $t5, 16 \n" // |0|0|R7|R6| - "raddu.w.qb $t8, $t8 \n" // R7 + R6 - "addu $t6, $t6, $t8 \n" - "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA - "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| - "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| - "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 - "addu $t7, $t7, $t8 \n" - "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t2, $t2 \n" - "raddu.w.qb $t4, $t4 \n" - "addu $t0, $t0, $t2 \n" - "addu $t0, $t0, $t4 \n" - "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t7, $t7, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t6, -1(%[dst_ptr]) \n" - "sb $t7, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [s1] "+r" (s1), - [s2] "+r" (s2), - [dst_width] "+r" (dst_width) - : [c1] "r" (c1), [c2] "r" (c2) - : "t0", 
"t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - diff --git a/libs/libvpx/third_party/libyuv/source/scale_msa.cc b/libs/libvpx/third_party/libyuv/source/scale_msa.cc new file mode 100644 index 0000000000..482a521f0d --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/scale_msa.cc @@ -0,0 +1,949 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void 
ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = 
(v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += 
__msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + 
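/* Editorial note: this 3/8 box scaler averages a 3x2 block of source pixels
 * (six samples) for two of every three outputs and a 2x2 block (four samples)
 * for the third, then divides by the sample count with fixed-point
 * reciprocals: const_0x2AAA ~= 65536 / 6 and const_0x4000 == 65536 / 4, each
 * followed by a 16-bit right shift. In scalar terms (illustrative only):
 *
 *   out = (uint8_t)(((uint32_t)sum6 * 0x2AAAu) >> 16);  // ~= sum6 / 6
 *
 * which avoids a true division since 6 is not a power of two. */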
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = 
__msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + 
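/* Editorial note: ScaleFilterCols_MSA above and ScaleARGBCols_MSA here both
 * step through the source row with a 16.16 fixed-point position x and step
 * dx: the high 16 bits select the source pixel and, in the filtering variant,
 * bits 15..9 form a 7-bit blend fraction (the "vec6 >>= 9" above). A minimal
 * scalar sketch of the same arithmetic for 8-bit samples; the function name
 * is illustrative, not a libyuv symbol:
 *
 *   static void ScaleFilterColsScalar(uint8_t* dst, const uint8_t* src,
 *                                     int dst_width, int x, int dx) {
 *     int j;
 *     for (j = 0; j < dst_width; ++j) {
 *       int xi = x >> 16;            // integer source index
 *       int f = (x & 0xffff) >> 9;   // 7-bit fraction
 *       int a = src[xi];
 *       int b = src[xi + 1];
 *       dst[j] = (uint8_t)(a + (((b - a) * f + 0x40) >> 7));  // rounded lerp
 *       x += dx;
 *     }
 *   }
 *
 * ScaleARGBCols_MSA skips the blend entirely and just gathers whole 32-bit
 * pixels at src[x >> 16], which is what LOAD_INDEXED_DATA does per lane. */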
v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; + v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 
src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 * 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + 
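/* Editorial note: the const0/const1/const2 weights and shft0..shft2 shifts
 * above implement the horizontal 4->3 taps (3,1), (1,1) and (1,3) with
 * rounding, and the "reg * 3 + reg" plus srari(..., 2) step mixes the two
 * rows 3:1, which is what distinguishes this "_0_" variant. A scalar sketch
 * of one 4-pixel group, assuming s points into row 0 and t into row 1
 * (illustrative only, not libyuv's code):
 *
 *   uint8_t a0 = (uint8_t)((s[0] * 3 + s[1] + 2) >> 2);  // (3,1) tap
 *   uint8_t a1 = (uint8_t)((s[1] + s[2] + 1) >> 1);      // (1,1) tap
 *   uint8_t a2 = (uint8_t)((s[2] + s[3] * 3 + 2) >> 2);  // (1,3) tap
 *   uint8_t b0 = (uint8_t)((t[0] * 3 + t[1] + 2) >> 2);
 *   uint8_t b1 = (uint8_t)((t[1] + t[2] + 1) >> 1);
 *   uint8_t b2 = (uint8_t)((t[2] + t[3] * 3 + 2) >> 2);
 *   d[0] = (uint8_t)((a0 * 3 + b0 + 2) >> 2);            // rows mixed 3:1
 *   d[1] = (uint8_t)((a1 * 3 + b1 + 2) >> 2);
 *   d[2] = (uint8_t)((a2 * 3 + b2 + 2) >> 2);
 */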
__msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 
1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/scale_neon.cc b/libs/libvpx/third_party/libyuv/source/scale_neon.cc index 44b0c8080d..459a2995df 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_neon.cc @@ -23,564 +23,541 @@ extern "C" { // Provided by Fritz Koenig // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc - "subs %2, %2, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // add adjacent - "vpaddl.u8 q1, q1 \n" - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - MEMACCESS(1) - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - MEMACCESS(2) - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc" - ); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - MEMACCESS(3) - "vld1.8 {q1}, [%3]! \n" - MEMACCESS(4) - "vld1.8 {q2}, [%4]! \n" - MEMACCESS(5) - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - MEMACCESS(1) - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc" - ); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! 
\n" + "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" - ); + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" - ); + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); } #define HAS_SCALEROWDOWN38_NEON -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q3}, [%3] \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! 
\n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - asm volatile ( - MEMACCESS(5) - "vld1.16 {q13}, [%5] \n" - MEMACCESS(6) - "vld1.8 {q14}, [%6] \n" - MEMACCESS(7) - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - MEMACCESS(4) - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. 
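+ // Worked example: vqrdmulh is a doubling multiply (high half of
+ // 2*a*b, rounded), so dividing by 9 uses q15 = kMult38_Div9 =
+ // 65536 / 18 = 3640: (2 * sum * 3640 + 0x8000) >> 16 ~= sum / 9
+ // for the nine pixels (3 rows x 3 columns) summed above.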
+ "vqrdmulh.s16 q0, q0, q15 \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" - ); + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", + "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(4) - "vld1.16 {q13}, [%4] \n" - MEMACCESS(5) - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" - // combine source lines - "vadd.u16 q1, q3 \n" + // combine source lines + "vadd.u16 q1, q3 \n" - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" - ); + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov r12, %5 \n" - "veor q2, q2, q2 \n" - "veor q3, q3, q3 \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "vld1.8 {q0}, [%0], %3 \n" - "vaddw.u8 q3, q3, d1 \n" - "vaddw.u8 q2, q2, d0 \n" - "subs r12, r12, #1 \n" - "bgt 2b \n" - MEMACCESS(2) - "vst1.16 {q2, q3}, [%2]! \n" // store pixels - "add %1, %1, #16 \n" - "subs %4, %4, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + "vst1.16 {q2, q3}, [%2]! \n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" -// The NEON version mimics this formula: -// #define BLENDER(a, b, f) (uint8)((int)(a) + -// ((int)(f) * ((int)(b) - (int)(a)) >> 16)) +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; + const uint8_t* src_tmp = src_ptr; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -617,7 +594,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "vadd.s16 q8, q8, q9 \n" "vmovn.s16 d6, q8 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0]! 
\n" // store pixels "vadd.s32 q1, q1, q0 \n" "vadd.s32 q2, q2, q0 \n" @@ -639,325 +615,299 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! 
\n" + "bgt 100b \n" - "99: \n" - MEMACCESS(0) - "vst1.8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" - ); + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.32 {q0, q1}, [%0]! \n" - MEMACCESS(0) - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - "vrshrn.u16 d2, q2, #1 \n" - "vrshrn.u16 d3, q3, #1 \n" - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - MEMACCESS(1) - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %3, lsl #2 \n" - "1: \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "vld1.8 {d1}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d3}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d4}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d5}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d6}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld1.32 {"#dn"["#n"]}, [%6] \n" +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int tmp; - const uint8* src_tmp = src_argb; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - - MEMACCESS(0) - "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1" - ); + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; + const uint8_t* src_tmp = src_argb; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -993,7 +943,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, "vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d1, q12, #7 \n" - MEMACCESS(0) "vst1.32 {d0, d1}, [%0]! 
\n" // store pixels "vadd.s32 q8, q8, q9 \n" "subs %2, %2, #4 \n" // 4 processed per loop diff --git a/libs/libvpx/third_party/libyuv/source/scale_neon64.cc b/libs/libvpx/third_party/libyuv/source/scale_neon64.cc index ff277f26ff..494a9cfbfb 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/scale.h" #include "libyuv/row.h" +#include "libyuv/scale.h" #include "libyuv/scale_row.h" #ifdef __cplusplus @@ -21,580 +21,556 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc - "subs %w2, %w2, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // add adjacent - "uaddlp v1.8h, v1.16b \n" - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #1 \n" - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x2 average down and write 16x1. 
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc - MEMACCESS(1) - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - MEMACCESS(3) - "ld1 {v1.16b}, [%2], #16 \n" - MEMACCESS(4) - "ld1 {v2.16b}, [%3], #16 \n" - MEMACCESS(5) - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - MEMACCESS(1) - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); +void 
ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" - // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, 
v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", - "v20", "memory", "cc" - ); + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" - ); + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + 
: "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); } -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v3.16b}, [%3] \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(5) - "ld1 {v29.8h}, [%5] \n" - MEMACCESS(6) - "ld1 {v30.16b}, [%6] \n" - MEMACCESS(7) - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - MEMACCESS(4) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
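+ // (In the lane diagrams below, "xx" marks the unused high byte of
+ // each widened 16-bit lane.)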
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", - "v30", "v31", "memory", "cc" - ); + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8_t* dst_ptr, + int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(4) - "ld1 {v30.8h}, [%4] \n" - MEMACCESS(5) - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. 
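+ // Worked example: sqrdmulh is a doubling multiply (high half of
+ // 2*a*b, rounded), so dividing by 6 uses v30 = kMult38_Div6 =
+ // 65536 / 12 = 5461: (2 * sum * 5461 + 0x8000) >> 16 ~= sum / 6
+ // for the six pixels (2 rows x 3 columns) summed above.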
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" - // Align for table lookup, vtbl requires registers to - // be adjacent + // Align for table lookup, vtbl requires registers to + // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v30", "v31", "memory", "cc" - ); + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - MEMACCESS(2) - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {v4.b, v5.b}["#n"], [%6] \n" +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - 
const uint8* src_tmp = src_ptr; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -626,12 +602,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "ushll2 v6.4s, v6.8h, #0 \n" "mul v16.4s, v16.4s, v7.4s \n" "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" - MEMACCESS(0) "st1 {v4.8b}, [%0], #8 \n" // store pixels "add v1.4s, v1.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n" @@ -639,7 +614,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -653,331 +628,300 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" - // Blend 50 / 50. 
- "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" - "99: \n" - MEMACCESS(0) - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction),// %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" - ); + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS (0) - "ld2 {v0.4s, v1.4s}, [%0], #32 \n" - MEMACCESS (0) - "ld2 {v2.4s, v3.4s}, [%0], #32 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS (1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - MEMACCESS (1) - "st1 {v3.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (dst), // %1 - "+r" (dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS (0) - // load 8 ARGB pixels. - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #1 \n" - "rshrn v2.8b, v2.8h, #1 \n" - "rshrn v3.8b, v3.8h, #1 \n" - MEMACCESS (1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS (0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - MEMACCESS (1) - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - MEMACCESS (2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (src_stride), // %1 - "+r" (dst), // %2 - "+r" (dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" - ); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64)(src_stepx * 4)) // %3 - : "memory", "cc", "v0" - ); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. // TODO(Yang Zhang): Might be worth another optimization pass in future. // It could be upgraded to 8 pixels at a time to start with. -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "ld1 {v1.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v3.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v5.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
- MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld1 {"#vn".s}["#n"], [%6] \n" +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
- int64 x64 = (int64) x; - int64 dx64 = (int64) dx; - int64 tmp64; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - - MEMACCESS(0) - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1" - ); +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -1014,14 +958,13 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" - MEMACCESS(0) "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -1034,6 +977,85 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, #undef LOAD2_DATA32_LANE +// Read 16x2 average down and write 8x1. 
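The ScaleRowDown2Box_16_NEON kernel that follows averages 2x2 blocks of 16-bit pixels with rounding. A scalar sketch under the same conventions (src_stride counted in uint16_t elements; the name echoes libyuv's C fallbacks but is shown here only for illustration):

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown2Box_16_C(const uint16_t* s, ptrdiff_t src_stride,
                                  uint16_t* dst, int dst_width) {
  const uint16_t* t = s + src_stride;  // second source row
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = (uint16_t)((s[2 * i] + s[2 * i + 1] +
                         t[2 * i] + t[2 * i + 1] + 2) >> 2);  // rounded /4
  }
}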
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/scale_win.cc b/libs/libvpx/third_party/libyuv/source/scale_win.cc index f17097365c..c5fc86f3e9 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_win.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_win.cc @@ -17,97 +17,93 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling 
values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -120,27 +116,28 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -153,20 +150,21 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. 
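In the down2 kernels here (Linear above, Box below), pmaddubsw against the 0x0101 constant sums each horizontal byte pair into a 16-bit word, and pavgw against zero halves that sum with rounding. The scalar equivalent of one output pixel (pair_avg is a hypothetical name):

#include <stdint.h>

static uint8_t pair_avg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // pavgw(a + b, 0)
}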
-__declspec(naked) -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] @@ -174,15 +172,15 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add + paddw xmm0, xmm2 // vertical add paddw xmm1, xmm3 psrlw xmm0, 1 psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -197,23 +195,24 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -225,30 +224,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x1 rectangle to 32x1. 
-__declspec(naked) -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -262,20 +262,21 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // For rounding, average = (sum + 2) / 4 // becomes average((sum >> 1), 0) // Blends 64x2 rectangle to 32x1. -__declspec(naked) -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] @@ -283,18 +284,18 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm0, ymm0, ymm2 // vertical add vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -308,15 +309,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. 
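The AVX2 box filter above relies on the identity noted in its comment: (sum + 2) / 4 == avg(sum >> 1, 0), where avg(x, y) = (x + y + 1) >> 1 is the pavgw semantics. A quick scalar check over every possible 2x2 byte sum (a sketch, not library code):

#include <assert.h>
#include <stdint.h>

static uint16_t avg_round(uint16_t x, uint16_t y) {
  return (uint16_t)((x + y + 1) >> 1);  // pavgw semantics
}

int main(void) {
  uint32_t sum;
  for (sum = 0; sum <= 4 * 255; ++sum) {  // all possible 2x2 sums
    assert(((sum + 2) >> 2) == avg_round((uint16_t)(sum >> 1), 0));
  }
  return 0;
}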
-__declspec(naked) -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 pslld xmm5, 16 @@ -339,50 +341,51 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -__declspec(naked) -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 movdqa xmm5, xmm4 packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 + psllw xmm5, 3 // constant 0x0008 wloop: - movdqu xmm0, [eax] // average rows + movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm0, xmm2 // vertical add rows 0, 1 paddw xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 + paddw xmm0, xmm2 // add row 2 paddw xmm1, xmm3 movdqu xmm2, [eax + edi] movdqu xmm3, [eax + edi + 16] lea eax, [eax + 32] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 + paddw xmm0, xmm2 // add row 3 paddw xmm1, xmm3 phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] @@ -397,15 +400,16 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. 
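ScaleRowDown4Box_SSSE3 above reduces each 4x4 block to one pixel: the 16 taps are summed via pmaddubsw/paddw/phaddw, then biased by 8 and shifted right by 4. One output pixel as a scalar sketch (box4x4_avg is a hypothetical helper):

#include <stddef.h>
#include <stdint.h>

static uint8_t box4x4_avg(const uint8_t* p, ptrdiff_t stride) {
  unsigned sum = 0;
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += p[r * stride + c];
  return (uint8_t)((sum + 8) >> 4);  // + 8 for round, /16 for 16 taps
}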
-__declspec(naked) -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 vpsrld ymm5, ymm5, 24 vpslld ymm5, ymm5, 16 @@ -416,10 +420,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vpand ymm0, ymm0, ymm5 vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -431,52 +435,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x4 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpsllw ymm5, ymm4, 3 // constant 0x0008 vpackuswb ymm4, ymm4, ymm4 wloop: - vmovdqu ymm0, [eax] // average rows + vmovdqu ymm0, [eax] // average rows vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm3, [eax + esi * 2 + 32] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm0, ymm0, ymm2 // add row 2 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + edi] vmovdqu ymm3, [eax + edi + 32] lea eax, [eax + 64] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm0, ymm0, ymm2 // add row 3 vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] 
sub ecx, 16 @@ -494,14 +499,15 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm3, xmmword ptr kShuf0 movdqa xmm4, xmmword ptr kShuf1 movdqa xmm5, xmmword ptr kShuf2 @@ -541,16 +547,16 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -559,7 +565,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -568,7 +574,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm0, xmm1 pshufb xmm0, xmm3 @@ -577,7 +583,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 @@ -598,16 +604,16 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. 
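The kMadd/kRound34 constants used by the 34_x_Box kernels implement the 3/4 horizontal filter: every 4 source pixels produce 3 outputs with weights (3,1), (2,2) and (1,3), each rounded by kRound34 and shifted right by 2 (the Box variants first blend two source rows). One group as a scalar sketch (hypothetical helper name):

#include <stdint.h>

static void ScaleRowDown34_Group_C(const uint8_t* s, uint8_t* d) {
  // 4 source pixels -> 3 destination pixels
  d[0] = (uint8_t)((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = (uint8_t)((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = (uint8_t)((s[2] * 1 + s[3] * 3 + 2) >> 2);
}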
-__declspec(naked) -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -616,7 +622,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -626,7 +632,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -636,7 +642,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 @@ -660,26 +666,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm4, xmmword ptr kShuf38a movdqa xmm5, xmmword ptr kShuf38b xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 paddusb xmm0, xmm1 - movq qword ptr [edx], xmm0 // write 12 pixels + movq qword ptr [edx], xmm0 // write 12 pixels movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] @@ -691,23 +698,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAc movdqa xmm3, xmmword ptr kShufAc3 movdqa xmm4, xmmword ptr kScaleAc33 pxor xmm5, xmm5 xloop: - movdqu xmm0, [eax] // sum up 3 rows 
into xmm0/1 + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 movdqu xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 @@ -725,14 +732,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, paddusw xmm0, xmm6 paddusw xmm1, xmm7 - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 psrldq xmm0, 2 paddusw xmm6, xmm0 psrldq xmm0, 2 paddusw xmm6, xmm0 pshufb xmm6, xmm2 - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 psrldq xmm1, 2 paddusw xmm7, xmm1 psrldq xmm1, 2 @@ -740,10 +747,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, pshufb xmm7, xmm3 paddusw xmm6, xmm7 - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 packuswb xmm6, xmm6 - movd [edx], xmm6 // write 6 pixels + movd [edx], xmm6 // write 6 pixels psrlq xmm6, 16 movd [edx + 2], xmm6 lea edx, [edx + 6] @@ -756,28 +763,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAb0 movdqa xmm3, xmmword ptr kShufAb1 movdqa xmm4, xmmword ptr kShufAb2 movdqa xmm5, xmmword ptr kScaleAb2 xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm1, [eax + esi] lea eax, [eax + 16] pavgb xmm0, xmm1 - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 movdqa xmm6, xmm0 pshufb xmm6, xmm3 @@ -785,10 +792,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, pshufb xmm0, xmm4 paddusw xmm1, xmm0 - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 packuswb xmm1, xmm1 - movd [edx], xmm1 // write 6 pixels + movd [edx], xmm1 // write 6 pixels psrlq xmm1, 16 movd [edx + 2], xmm1 lea edx, [edx + 6] @@ -801,26 +808,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } // Reads 16 bytes and accumulates to 16 shorts at a time. 
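ScaleAddRow_SSE2, introduced by the comment above, widens 16 source bytes to 16-bit words and accumulates them into the destination row (with saturating adds in the SIMD path). Scalar sketch (hypothetical name):

#include <stdint.h>

static void ScaleAddRowC(const uint8_t* src, uint16_t* dst, int src_width) {
  int i;
  for (i = 0; i < src_width; ++i) {
    dst[i] = (uint16_t)(dst[i] + src[i]);  // SIMD path saturates instead
  }
}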
-__declspec(naked) -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width pxor xmm5, xmm5 - // sum rows + // sum rows xloop: - movdqu xmm3, [eax] // read 16 bytes + movdqu xmm3, [eax] // read 16 bytes lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm0, [edx] // read 16 words from destination movdqu xmm1, [edx + 16] movdqa xmm2, xmm3 punpcklbw xmm2, xmm5 punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words + paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx], xmm0 // write 16 words to destination movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 16 @@ -831,24 +839,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -__declspec(naked) -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width vpxor ymm5, ymm5, ymm5 - // sum rows + // sum rows xloop: - vmovdqu ymm3, [eax] // read 32 bytes + vmovdqu ymm3, [eax] // read 32 bytes lea eax, [eax + 32] vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck vpunpcklbw ymm2, ymm3, ymm5 vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm0, ymm2, [edx] // sum 16 words vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx], ymm0 // write 32 words to destination vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 32 @@ -862,86 +871,87 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { __asm { push ebx push esi push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width movd xmm2, [esp + 12 + 16] // x movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. + mov eax, 0x04040000 // shuffle to line up fractions with pixel. 
movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 + pcmpeqb xmm7, xmm7 // generate 0x0001 psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movzx ebx, word ptr [esi + edx] // 2 source x1 pixels movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 + pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. movd ebx, xmm1 mov [edi], bx lea edi, [edi + 2] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits movd ebx, xmm2 mov [edi], bl @@ -955,13 +965,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. 
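ScaleColsUp2_SSE2, which follows, is a plain 2x horizontal upsample by pixel duplication. Scalar sketch (hypothetical name):

#include <stdint.h>

static void ScaleColsUp2C(uint8_t* dst, const uint8_t* src, int dst_width) {
  int i;
  for (i = 0; i + 1 < dst_width; i += 2) {
    dst[i] = dst[i + 1] = src[i / 2];  // each source pixel written twice
  }
}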
-__declspec(naked) -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -980,15 +992,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1005,23 +1017,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1033,16 +1045,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1050,11 +1062,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1067,18 +1079,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. 
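The ARGB down2 kernels above treat each pixel as one 32-bit unit; the point-sampling variant simply keeps one pixel of each horizontal pair (the shufps selection keeps the odd one). Scalar sketch (hypothetical name):

#include <stdint.h>

static void ScaleARGBRowDown2C(const uint32_t* src, uint32_t* dst,
                               int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[2 * i + 1];  // keep the odd pixel of each pair
  }
}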
-__declspec(naked) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { __asm { push ebx push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1103,21 +1116,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. -__declspec(naked) -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1132,11 +1145,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, movq xmm3, qword ptr [esi + ebx * 2] movhps xmm3, qword ptr [esi + edi] lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1151,64 +1164,66 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, } // Column scaling unfiltered. SSE2 version. 
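Editor's note: the "unfiltered" column scaler that follows is much simpler than the filtering variants; the scalar sketch below (illustrative only, hypothetical name) is the whole algorithm. dst[j] = src[x >> 16] is a nearest-neighbor gather, and the SSE2 code merely performs four such gathers per loop iteration, using pextrw to pull the integer indices out of the vector of x values.

#include <stdint.h>

// Nearest-neighbor ARGB column scale: one 32-bit pixel per gather.
static void ScaleARGBCols_Sketch(uint32_t* dst_argb, const uint32_t* src_argb,
                                 int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst_argb[j] = src_argb[x >> 16];  // high word of the 16.16 x is the index
    x += dx;
  }
}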
-__declspec(naked) -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { push edi push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. cmp ecx, 0 jle xloop99 sub ecx, 4 jl xloop49 - // 4 Pixel loop. + // 4 Pixel loop. xloop4: movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 movd xmm1, [esi + eax * 4] // 1 source x2 pixels movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 movdqu [edi], xmm0 lea edi, [edi + 16] - sub ecx, 4 // 4 pixels + sub ecx, 4 // 4 pixels jge xloop4 xloop49: test ecx, 2 je xloop29 - // 2 Pixels. + // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 movq qword ptr [edi], xmm0 lea edi, [edi + 8] @@ -1217,7 +1232,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, test ecx, 1 je xloop99 - // 1 Pixels. + // 1 Pixels. 
movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd dword ptr [edi], xmm0 xloop99: @@ -1232,60 +1247,62 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Port to Neon // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx movdqa xmm4, xmmword ptr kShuffleColARGB movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. 
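+    // editor's note: xmm0 now holds two blended ARGB pixels; each channel
+    // was computed as (a * (127 - f) + b * f) >> 7 by the pshufb/pmaddubsw
+    // sequence above. Unlike the luma filter path, no +1 is added to the
+    // inverted fraction here, so the two weights sum to 127 rather than 128.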
movq qword ptr [edi], xmm0 lea edi, [edi + 8] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: @@ -1293,15 +1310,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. movd [edi], xmm0 xloop99: @@ -1313,13 +1330,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -1338,12 +1357,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv_X86(int num, int div) { +__declspec(naked) int FixedDiv_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 idiv dword ptr [esp + 8] ret @@ -1351,13 +1369,12 @@ int FixedDiv_X86(int num, int div) { } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv1_X86(int num, int div) { +__declspec(naked) int FixedDiv1_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 sub eax, 0x00010001 sbb edx, 0 diff --git a/libs/libvpx/third_party/libyuv/source/video_common.cc b/libs/libvpx/third_party/libyuv/source/video_common.cc index 00fb71e18b..92384c050c 100644 --- a/libs/libvpx/third_party/libyuv/source/video_common.cc +++ b/libs/libvpx/third_party/libyuv/source/video_common.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include "libyuv/video_common.h" #ifdef __cplusplus @@ -16,40 +15,39 @@ namespace libyuv { extern "C" { #endif -#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) - struct FourCCAliasEntry { - uint32 alias; - uint32 canonical; + uint32_t alias; + uint32_t canonical; }; -static const struct FourCCAliasEntry kFourCCAliases[] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW }, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 }; // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. 
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA LIBYUV_API -uint32 CanonicalFourCC(uint32 fourcc) { +uint32_t CanonicalFourCC(uint32_t fourcc) { int i; - for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + for (i = 0; i < NUM_ALIASES; ++i) { if (kFourCCAliases[i].alias == fourcc) { return kFourCCAliases[i].canonical; } @@ -62,4 +60,3 @@ uint32 CanonicalFourCC(uint32 fourcc) { } // extern "C" } // namespace libyuv #endif - diff --git a/libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py b/libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py new file mode 100644 index 0000000000..4b640e3e48 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py @@ -0,0 +1,76 @@ +import argparse +from os import listdir, path +from PIL import Image +import sys + +parser = argparse.ArgumentParser() +parser.add_argument("--frame_path", default="../data/frame/", type=str) +parser.add_argument("--frame_rate", default="25:1", type=str) +parser.add_argument("--interlacing", default="Ip", type=str) +parser.add_argument("--pix_ratio", default="0:0", type=str) +parser.add_argument("--color_space", default="4:2:0", type=str) +parser.add_argument("--output", default="output.y4m", type=str) + + +def generate(args, frames): + if len(frames) == 0: + return + #sort the frames based on the frame index + frames = sorted(frames, key=lambda x: x[0]) + #convert the frames to YUV form + frames = [f.convert("YCbCr") for _, f in frames] + #write the header + header = "YUV4MPEG2 W%d H%d F%s %s A%s" % (frames[0].width, frames[0].height, + args.frame_rate, args.interlacing, + args.pix_ratio) + cs = args.color_space.split(":") + header += " C%s%s%s\n" % (cs[0], cs[1], cs[2]) + #estimate the sample step based on subsample value + subsamples = [int(c) for c in cs] + r_step = [1, int(subsamples[2] == 0) + 1, int(subsamples[2] == 0) + 1] + c_step = [1, 4 // subsamples[1], 4 // subsamples[1]] + #write in frames + with open(args.output, "wb") as y4m: + y4m.write(header) + for f in frames: + y4m.write("FRAME\n") + px = f.load() + for k in xrange(3): + for i in xrange(0, f.height, r_step[k]): + for j in xrange(0, f.width, c_step[k]): + yuv = px[j, i] + y4m.write(chr(yuv[k])) + + +if __name__ == "__main__": + args = parser.parse_args() + frames = [] + frames_mv = [] + for filename in listdir(args.frame_path): + name, ext = filename.split(".") + if ext == "png": + name_parse = name.split("_") + idx = int(name_parse[-1]) + img = Image.open(path.join(args.frame_path, filename)) + if name_parse[-2] == "mv": + frames_mv.append((idx, img)) + else: + frames.append((idx, img)) + if len(frames) == 0: + print "No frames in directory: " + args.frame_path + sys.exit() + print("----------------------Y4M Info----------------------") + print("width: %d" % frames[0][1].width) + print("height: %d" % frames[0][1].height) + print("#frame: %d" % len(frames)) + print("frame rate: %s" % args.frame_rate) + print("interlacing: %s" % args.interlacing) + print("pixel ratio: %s" % args.pix_ratio) + print("color space: %s" % args.color_space) + print("----------------------------------------------------") + + print("Generating ...") + generate(args, frames) + if len(frames_mv) != 0: + args.output = args.output.replace(".y4m", "_mv.y4m") + generate(args, frames_mv) diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde new file mode 100644 index 0000000000..7249ee972e --- /dev/null +++ 
b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde
@@ -0,0 +1,163 @@
+/*
+ *AABB bounding box
+ *Bounding Volume Hierarchy
+ */
+class BoundingBox {
+  float min_x, min_y, min_z, max_x, max_y, max_z;
+  PVector center;
+  BoundingBox() {
+    min_x = Float.POSITIVE_INFINITY;
+    min_y = Float.POSITIVE_INFINITY;
+    min_z = Float.POSITIVE_INFINITY;
+    max_x = Float.NEGATIVE_INFINITY;
+    max_y = Float.NEGATIVE_INFINITY;
+    max_z = Float.NEGATIVE_INFINITY;
+    center = new PVector();
+  }
+  // build a bounding box for a triangle
+  void create(Triangle t) {
+    min_x = min(t.p1.x, min(t.p2.x, t.p3.x));
+    max_x = max(t.p1.x, max(t.p2.x, t.p3.x));
+
+    min_y = min(t.p1.y, min(t.p2.y, t.p3.y));
+    max_y = max(t.p1.y, max(t.p2.y, t.p3.y));
+
+    min_z = min(t.p1.z, min(t.p2.z, t.p3.z));
+    max_z = max(t.p1.z, max(t.p2.z, t.p3.z));
+    center.x = (max_x + min_x) / 2;
+    center.y = (max_y + min_y) / 2;
+    center.z = (max_z + min_z) / 2;
+  }
+  // merge two bounding boxes
+  void add(BoundingBox bbx) {
+    min_x = min(min_x, bbx.min_x);
+    min_y = min(min_y, bbx.min_y);
+    min_z = min(min_z, bbx.min_z);
+
+    max_x = max(max_x, bbx.max_x);
+    max_y = max(max_y, bbx.max_y);
+    max_z = max(max_z, bbx.max_z);
+    center.x = (max_x + min_x) / 2;
+    center.y = (max_y + min_y) / 2;
+    center.z = (max_z + min_z) / 2;
+  }
+  // get bounding box center axis value
+  float getCenterAxisValue(int axis) {
+    if (axis == 1) {
+      return center.x;
+    } else if (axis == 2) {
+      return center.y;
+    }
+    // when axis == 3
+    return center.z;
+  }
+  // check if a ray intersects the bounding box
+  boolean intersect(Ray r) {
+    float tmin, tmax;
+    if (r.dir.x >= 0) {
+      tmin = (min_x - r.ori.x) * (1.0f / r.dir.x);
+      tmax = (max_x - r.ori.x) * (1.0f / r.dir.x);
+    } else {
+      tmin = (max_x - r.ori.x) * (1.0f / r.dir.x);
+      tmax = (min_x - r.ori.x) * (1.0f / r.dir.x);
+    }
+
+    float tymin, tymax;
+    if (r.dir.y >= 0) {
+      tymin = (min_y - r.ori.y) * (1.0f / r.dir.y);
+      tymax = (max_y - r.ori.y) * (1.0f / r.dir.y);
+    } else {
+      tymin = (max_y - r.ori.y) * (1.0f / r.dir.y);
+      tymax = (min_y - r.ori.y) * (1.0f / r.dir.y);
+    }
+
+    if (tmax < tymin || tymax < tmin) {
+      return false;
+    }
+
+    tmin = tmin < tymin ? tymin : tmin;
+    tmax = tmax > tymax ?
tymax : tmax; + + float tzmin, tzmax; + if (r.dir.z >= 0) { + tzmin = (min_z - r.ori.z) * (1.0f / r.dir.z); + tzmax = (max_z - r.ori.z) * (1.0f / r.dir.z); + } else { + tzmin = (max_z - r.ori.z) * (1.0f / r.dir.z); + tzmax = (min_z - r.ori.z) * (1.0f / r.dir.z); + } + if (tmax < tzmin || tmin > tzmax) { + return false; + } + return true; + } +} +// Bounding Volume Hierarchy +class BVH { + // Binary Tree + BVH left, right; + BoundingBox overall_bbx; + ArrayList mesh; + BVH(ArrayList mesh) { + this.mesh = mesh; + overall_bbx = new BoundingBox(); + left = null; + right = null; + int mesh_size = this.mesh.size(); + if (mesh_size <= 1) { + return; + } + // random select an axis + int axis = int(random(100)) % 3 + 1; + // build bounding box and save the selected center component + float[] axis_values = new float[mesh_size]; + for (int i = 0; i < mesh_size; i++) { + Triangle t = this.mesh.get(i); + overall_bbx.add(t.bbx); + axis_values[i] = t.bbx.getCenterAxisValue(axis); + } + // find the median value of selected center component as pivot + axis_values = sort(axis_values); + float pivot; + if (mesh_size % 2 == 1) { + pivot = axis_values[mesh_size / 2]; + } else { + pivot = + 0.5f * (axis_values[mesh_size / 2 - 1] + axis_values[mesh_size / 2]); + } + // Build left node and right node by partitioning the mesh based on triangle + // bounding box center component value + ArrayList left_mesh = new ArrayList(); + ArrayList right_mesh = new ArrayList(); + for (int i = 0; i < mesh_size; i++) { + Triangle t = this.mesh.get(i); + if (t.bbx.getCenterAxisValue(axis) < pivot) { + left_mesh.add(t); + } else if (t.bbx.getCenterAxisValue(axis) > pivot) { + right_mesh.add(t); + } else if (left_mesh.size() < right_mesh.size()) { + left_mesh.add(t); + } else { + right_mesh.add(t); + } + } + left = new BVH(left_mesh); + right = new BVH(right_mesh); + } + // check if a ray intersect with current volume + boolean intersect(Ray r, float[] param) { + if (mesh.size() == 0) { + return false; + } + if (mesh.size() == 1) { + Triangle t = mesh.get(0); + return t.intersect(r, param); + } + if (!overall_bbx.intersect(r)) { + return false; + } + boolean left_res = left.intersect(r, param); + boolean right_res = right.intersect(r, param); + return left_res || right_res; + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde new file mode 100644 index 0000000000..b39dae3a19 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde @@ -0,0 +1,138 @@ +class Camera { + // camera's field of view + float fov; + // camera's position, look at point and axis + PVector pos, center, axis; + PVector init_pos, init_center, init_axis; + float move_speed; + float rot_speed; + Camera(float fov, PVector pos, PVector center, PVector axis) { + this.fov = fov; + this.pos = pos; + this.center = center; + this.axis = axis; + this.axis.normalize(); + move_speed = 0.001; + rot_speed = 0.01 * PI; + init_pos = pos.copy(); + init_center = center.copy(); + init_axis = axis.copy(); + } + + Camera copy() { + Camera cam = new Camera(fov, pos.copy(), center.copy(), axis.copy()); + return cam; + } + + PVector project(PVector pos) { + PVector proj = MatxVec3(getCameraMat(), PVector.sub(pos, this.pos)); + proj.x = (float)height / 2.0 * proj.x / proj.z / tan(fov / 2.0f); + proj.y = (float)height / 2.0 * proj.y / proj.z / tan(fov / 2.0f); + proj.z = proj.z; + return proj; + } + + float[] getCameraMat() { + float[] mat = 
new float[9]; + PVector dir = PVector.sub(center, pos); + dir.normalize(); + PVector left = dir.cross(axis); + left.normalize(); + // processing camera system does not follow right hand rule + mat[0] = -left.x; + mat[1] = -left.y; + mat[2] = -left.z; + mat[3] = axis.x; + mat[4] = axis.y; + mat[5] = axis.z; + mat[6] = dir.x; + mat[7] = dir.y; + mat[8] = dir.z; + + return mat; + } + + void run() { + PVector dir, left; + if (mousePressed) { + float angleX = (float)mouseX / width * PI - PI / 2; + float angleY = (float)mouseY / height * PI - PI; + PVector diff = PVector.sub(center, pos); + float radius = diff.mag(); + pos.x = radius * sin(angleY) * sin(angleX) + center.x; + pos.y = radius * cos(angleY) + center.y; + pos.z = radius * sin(angleY) * cos(angleX) + center.z; + dir = PVector.sub(center, pos); + dir.normalize(); + PVector up = new PVector(0, 1, 0); + left = up.cross(dir); + left.normalize(); + axis = dir.cross(left); + axis.normalize(); + } + + if (keyPressed) { + switch (key) { + case 'w': + dir = PVector.sub(center, pos); + dir.normalize(); + pos = PVector.add(pos, PVector.mult(dir, move_speed)); + center = PVector.add(center, PVector.mult(dir, move_speed)); + break; + case 's': + dir = PVector.sub(center, pos); + dir.normalize(); + pos = PVector.sub(pos, PVector.mult(dir, move_speed)); + center = PVector.sub(center, PVector.mult(dir, move_speed)); + break; + case 'a': + dir = PVector.sub(center, pos); + dir.normalize(); + left = axis.cross(dir); + left.normalize(); + pos = PVector.add(pos, PVector.mult(left, move_speed)); + center = PVector.add(center, PVector.mult(left, move_speed)); + break; + case 'd': + dir = PVector.sub(center, pos); + dir.normalize(); + left = axis.cross(dir); + left.normalize(); + pos = PVector.sub(pos, PVector.mult(left, move_speed)); + center = PVector.sub(center, PVector.mult(left, move_speed)); + break; + case 'r': + dir = PVector.sub(center, pos); + dir.normalize(); + float[] mat = getRotationMat3x3(rot_speed, dir.x, dir.y, dir.z); + axis = MatxVec3(mat, axis); + axis.normalize(); + break; + case 'b': + pos = init_pos.copy(); + center = init_center.copy(); + axis = init_axis.copy(); + break; + case '+': move_speed *= 2.0f; break; + case '-': move_speed /= 2.0; break; + case CODED: + if (keyCode == UP) { + pos = PVector.add(pos, PVector.mult(axis, move_speed)); + center = PVector.add(center, PVector.mult(axis, move_speed)); + } else if (keyCode == DOWN) { + pos = PVector.sub(pos, PVector.mult(axis, move_speed)); + center = PVector.sub(center, PVector.mult(axis, move_speed)); + } + } + } + } + void open() { + perspective(fov, float(width) / height, 1e-6, 1e5); + camera(pos.x, pos.y, pos.z, center.x, center.y, center.z, axis.x, axis.y, + axis.z); + } + void close() { + ortho(-width, 0, -height, 0); + camera(0, 0, 0, 0, 0, 1, 0, 1, 0); + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde new file mode 100644 index 0000000000..883a8f8310 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde @@ -0,0 +1,94 @@ +class MotionField { + int block_size; + ArrayList motion_field; + MotionField(int block_size) { + this.block_size = block_size; + motion_field = new ArrayList(); + } + + void update(Camera last_cam, Camera current_cam, PointCloud point_cloud, + BVH bvh) { + // clear motion field + motion_field = new ArrayList(); + int r_num = height / block_size, c_num = width / block_size; + for (int i = 0; 
i < r_num * c_num; i++) + motion_field.add(new PVector(0, 0, 0)); + // estimate motion vector of each point in point cloud + for (int i = 0; i < point_cloud.size(); i++) { + PVector p = point_cloud.getPosition(i); + PVector p0 = current_cam.project(p); + PVector p1 = last_cam.project(p); + int row = int((p0.y + height / 2.0f) / block_size); + int col = int((p0.x + width / 2.0f) / block_size); + if (row >= 0 && row < r_num && col >= 0 && col < c_num) { + PVector accu = motion_field.get(row * c_num + col); + accu.x += p1.x - p0.x; + accu.y += p1.y - p0.y; + accu.z += 1; + } + } + // if some blocks do not have point, then use ray tracing to see if they are + // in triangles + for (int i = 0; i < r_num; i++) + for (int j = 0; j < c_num; j++) { + PVector accu = motion_field.get(i * c_num + j); + if (accu.z > 0) { + continue; + } + // use the center of the block to generate view ray + float cx = j * block_size + block_size / 2.0f - width / 2.0f; + float cy = i * block_size + block_size / 2.0f - height / 2.0f; + float cz = 0.5f * height / tan(current_cam.fov / 2.0f); + PVector dir = new PVector(cx, cy, cz); + float[] camMat = current_cam.getCameraMat(); + dir = MatxVec3(transpose3x3(camMat), dir); + dir.normalize(); + Ray r = new Ray(current_cam.pos, dir); + // ray tracing + float[] param = new float[4]; + param[0] = Float.POSITIVE_INFINITY; + if (bvh.intersect(r, param)) { + PVector p = new PVector(param[1], param[2], param[3]); + PVector p0 = current_cam.project(p); + PVector p1 = last_cam.project(p); + accu.x += p1.x - p0.x; + accu.y += p1.y - p0.y; + accu.z += 1; + } + } + // estimate the motion vector of each block + for (int i = 0; i < r_num * c_num; i++) { + PVector mv = motion_field.get(i); + if (mv.z > 0) { + motion_field.set(i, new PVector(mv.x / mv.z, mv.y / mv.z, 0)); + } + } + } + + void render() { + int r_num = height / block_size, c_num = width / block_size; + for (int i = 0; i < r_num; i++) + for (int j = 0; j < c_num; j++) { + PVector mv = motion_field.get(i * c_num + j); + float ox = j * block_size + 0.5f * block_size; + float oy = i * block_size + 0.5f * block_size; + stroke(255, 0, 0); + line(ox, oy, ox + mv.x, oy + mv.y); + } + } + + void save(String path) { + int r_num = height / block_size; + int c_num = width / block_size; + String[] mvs = new String[r_num]; + for (int i = 0; i < r_num; i++) { + mvs[i] = ""; + for (int j = 0; j < c_num; j++) { + PVector mv = motion_field.get(i * c_num + j); + mvs[i] += str(mv.x) + "," + str(mv.y); + if (j != c_num - 1) mvs[i] += ";"; + } + } + saveStrings(path, mvs); + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde new file mode 100644 index 0000000000..714a6f3a0b --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde @@ -0,0 +1,138 @@ +class PointCloud { + ArrayList points; // array to save points + IntList point_colors; // array to save points color + PVector cloud_mass; + float[] depth; + boolean[] real; + PointCloud() { + // initialize + points = new ArrayList(); + point_colors = new IntList(); + cloud_mass = new PVector(0, 0, 0); + depth = new float[width * height]; + real = new boolean[width * height]; + } + + void generate(PImage rgb_img, PImage depth_img, Transform trans) { + if (depth_img.width != width || depth_img.height != height || + rgb_img.width != width || rgb_img.height != height) { + println("rgb and depth file dimension should be same with window size"); + 
exit(); + } + // clear depth and real + for (int i = 0; i < width * height; i++) { + depth[i] = 0; + real[i] = false; + } + for (int v = 0; v < height; v++) + for (int u = 0; u < width; u++) { + // get depth value (red channel) + color depth_px = depth_img.get(u, v); + depth[v * width + u] = depth_px & 0x0000FFFF; + if (int(depth[v * width + u]) != 0) { + real[v * width + u] = true; + } + point_colors.append(rgb_img.get(u, v)); + } + for (int v = 0; v < height; v++) + for (int u = 0; u < width; u++) { + if (int(depth[v * width + u]) == 0) { + interpolateDepth(v, u); + } + // add transformed pixel as well as pixel color to the list + PVector pos = trans.transform(u, v, int(depth[v * width + u])); + points.add(pos); + // accumulate z value + cloud_mass = PVector.add(cloud_mass, pos); + } + } + void fillInDepthAlongPath(float d, Node node) { + node = node.parent; + while (node != null) { + int i = node.row; + int j = node.col; + if (depth[i * width + j] == 0) { + depth[i * width + j] = d; + } + node = node.parent; + } + } + // interpolate + void interpolateDepth(int row, int col) { + if (row < 0 || row >= height || col < 0 || col >= width || + int(depth[row * width + col]) != 0) { + return; + } + ArrayList queue = new ArrayList(); + queue.add(new Node(row, col, null)); + boolean[] visited = new boolean[width * height]; + for (int i = 0; i < width * height; i++) visited[i] = false; + visited[row * width + col] = true; + // Using BFS to Find the Nearest Neighbor + while (queue.size() > 0) { + // pop + Node node = queue.get(0); + queue.remove(0); + int i = node.row; + int j = node.col; + // if current position have a real depth + if (depth[i * width + j] != 0 && real[i * width + j]) { + fillInDepthAlongPath(depth[i * width + j], node); + break; + } else { + // search unvisited 8 neighbors + for (int r = max(0, i - 1); r < min(height, i + 2); r++) { + for (int c = max(0, j - 1); c < min(width, j + 2); c++) { + if (!visited[r * width + c]) { + visited[r * width + c] = true; + queue.add(new Node(r, c, node)); + } + } + } + } + } + } + // get point cloud size + int size() { return points.size(); } + // get ith position + PVector getPosition(int i) { + if (i >= points.size()) { + println("point position: index " + str(i) + " exceeds"); + exit(); + } + return points.get(i); + } + // get ith color + color getColor(int i) { + if (i >= point_colors.size()) { + println("point color: index " + str(i) + " exceeds"); + exit(); + } + return point_colors.get(i); + } + // get cloud center + PVector getCloudCenter() { + if (points.size() > 0) { + return PVector.div(cloud_mass, points.size()); + } + return new PVector(0, 0, 0); + } + // merge two clouds + void merge(PointCloud point_cloud) { + for (int i = 0; i < point_cloud.size(); i++) { + points.add(point_cloud.getPosition(i)); + point_colors.append(point_cloud.getColor(i)); + } + cloud_mass = PVector.add(cloud_mass, point_cloud.cloud_mass); + } +} + +class Node { + int row, col; + Node parent; + Node(int row, int col, Node parent) { + this.row = row; + this.col = col; + this.parent = parent; + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde new file mode 100644 index 0000000000..ef4be691c2 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde @@ -0,0 +1,61 @@ +// Triangle +class Triangle { + // position + PVector p1, p2, p3; + // color + color c1, c2, c3; + BoundingBox bbx; + Triangle(PVector 
p1, PVector p2, PVector p3, color c1, color c2, color c3) { + this.p1 = p1; + this.p2 = p2; + this.p3 = p3; + this.c1 = c1; + this.c2 = c2; + this.c3 = c3; + bbx = new BoundingBox(); + bbx.create(this); + } + // check to see if a ray intersects with the triangle + boolean intersect(Ray r, float[] param) { + PVector p21 = PVector.sub(p2, p1); + PVector p31 = PVector.sub(p3, p1); + PVector po1 = PVector.sub(r.ori, p1); + + PVector dxp31 = r.dir.cross(p31); + PVector po1xp21 = po1.cross(p21); + float denom = p21.dot(dxp31); + float t = p31.dot(po1xp21) / denom; + float alpha = po1.dot(dxp31) / denom; + float beta = r.dir.dot(po1xp21) / denom; + + boolean res = t > 0 && alpha > 0 && alpha < 1 && beta > 0 && beta < 1 && + alpha + beta < 1; + // depth test + if (res && t < param[0]) { + param[0] = t; + param[1] = alpha * p1.x + beta * p2.x + (1 - alpha - beta) * p3.x; + param[2] = alpha * p1.y + beta * p2.y + (1 - alpha - beta) * p3.y; + param[3] = alpha * p1.z + beta * p2.z + (1 - alpha - beta) * p3.z; + } + return res; + } + void render() { + beginShape(TRIANGLES); + fill(c1); + vertex(p1.x, p1.y, p1.z); + fill(c2); + vertex(p2.x, p2.y, p2.z); + fill(c3); + vertex(p3.x, p3.y, p3.z); + endShape(); + } +} +// Ray +class Ray { + // origin and direction + PVector ori, dir; + Ray(PVector ori, PVector dir) { + this.ori = ori; + this.dir = dir; + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde new file mode 100644 index 0000000000..cf79ab7141 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde @@ -0,0 +1,59 @@ +class Scene { + PointCloud point_cloud; + ArrayList mesh; + BVH bvh; + MotionField motion_field; + Camera last_cam; + Camera current_cam; + int frame_count; + + Scene(Camera camera, PointCloud point_cloud, MotionField motion_field) { + this.point_cloud = point_cloud; + this.motion_field = motion_field; + mesh = new ArrayList(); + for (int v = 0; v < height - 1; v++) + for (int u = 0; u < width - 1; u++) { + PVector p1 = point_cloud.getPosition(v * width + u); + PVector p2 = point_cloud.getPosition(v * width + u + 1); + PVector p3 = point_cloud.getPosition((v + 1) * width + u + 1); + PVector p4 = point_cloud.getPosition((v + 1) * width + u); + color c1 = point_cloud.getColor(v * width + u); + color c2 = point_cloud.getColor(v * width + u + 1); + color c3 = point_cloud.getColor((v + 1) * width + u + 1); + color c4 = point_cloud.getColor((v + 1) * width + u); + mesh.add(new Triangle(p1, p2, p3, c1, c2, c3)); + mesh.add(new Triangle(p3, p4, p1, c3, c4, c1)); + } + bvh = new BVH(mesh); + last_cam = camera.copy(); + current_cam = camera; + frame_count = 0; + } + + void run() { + last_cam = current_cam.copy(); + current_cam.run(); + motion_field.update(last_cam, current_cam, point_cloud, bvh); + frame_count += 1; + } + + void render(boolean show_motion_field) { + // build mesh + current_cam.open(); + noStroke(); + for (int i = 0; i < mesh.size(); i++) { + Triangle t = mesh.get(i); + t.render(); + } + if (show_motion_field) { + current_cam.close(); + motion_field.render(); + } + } + + void save(String path) { saveFrame(path + "_" + str(frame_count) + ".png"); } + + void saveMotionField(String path) { + motion_field.save(path + "_" + str(frame_count) + ".txt"); + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde new file mode 100644 
index 0000000000..af2204e8cf
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde
@@ -0,0 +1,82 @@
+class Transform {
+  float[] inv_rot;  // inverse of rotation matrix
+  PVector inv_mov;  // inverse of movement vector
+  float focal;      // the focal distance of the real camera
+  int w, h;         // the width and height of the frame
+  float normalier;  // normalization factor of depth
+  Transform(float tx, float ty, float tz, float qx, float qy, float qz,
+            float qw, float fov, int w, int h, float normalier) {
+    // currently we do not use the real camera's position and quaternion;
+    // we may use them in the future when combining all frames
+    float[] rot = quaternion2Mat3x3(qx, qy, qz, qw);
+    inv_rot = transpose3x3(rot);
+    inv_mov = new PVector(-tx, -ty, -tz);
+    this.focal = 0.5f * h / tan(fov / 2.0);
+    this.w = w;
+    this.h = h;
+    this.normalier = normalier;
+  }
+
+  PVector transform(int i, int j, float d) {
+    // transform from camera view to world view
+    float z = d / normalier;
+    float x = (i - w / 2.0f) * z / focal;
+    float y = (j - h / 2.0f) * z / focal;
+    return new PVector(x, y, z);
+  }
+}
+
+// get rotation matrix by using rotation axis and angle
+float[] getRotationMat3x3(float angle, float ax, float ay, float az) {
+  float[] mat = new float[9];
+  float c = cos(angle);
+  float s = sin(angle);
+  mat[0] = c + ax * ax * (1 - c);
+  mat[1] = ax * ay * (1 - c) - az * s;
+  mat[2] = ax * az * (1 - c) + ay * s;
+  mat[3] = ay * ax * (1 - c) + az * s;
+  mat[4] = c + ay * ay * (1 - c);
+  mat[5] = ay * az * (1 - c) - ax * s;
+  mat[6] = az * ax * (1 - c) - ay * s;
+  mat[7] = az * ay * (1 - c) + ax * s;
+  mat[8] = c + az * az * (1 - c);
+  return mat;
+}
+
+// get rotation matrix by using quaternion
+float[] quaternion2Mat3x3(float qx, float qy, float qz, float qw) {
+  float[] mat = new float[9];
+  mat[0] = 1 - 2 * qy * qy - 2 * qz * qz;
+  mat[1] = 2 * qx * qy - 2 * qz * qw;
+  mat[2] = 2 * qx * qz + 2 * qy * qw;
+  mat[3] = 2 * qx * qy + 2 * qz * qw;
+  mat[4] = 1 - 2 * qx * qx - 2 * qz * qz;
+  mat[5] = 2 * qy * qz - 2 * qx * qw;
+  mat[6] = 2 * qx * qz - 2 * qy * qw;
+  mat[7] = 2 * qy * qz + 2 * qx * qw;
+  mat[8] = 1 - 2 * qx * qx - 2 * qy * qy;
+  return mat;
+}
+
+// transpose a 3x3 matrix
+float[] transpose3x3(float[] mat) {
+  float[] Tmat = new float[9];
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++) {
+      Tmat[i * 3 + j] = mat[j * 3 + i];
+    }
+  return Tmat;
+}
+
+// multiply a matrix with a vector
+PVector MatxVec3(float[] mat, PVector v) {
+  float[] vec = v.array();
+  float[] res = new float[3];
+  for (int i = 0; i < 3; i++) {
+    res[i] = 0.0f;
+    for (int j = 0; j < 3; j++) {
+      res[i] += mat[i * 3 + j] * vec[j];
+    }
+  }
+  return new PVector(res[0], res[1], res[2]);
+}
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
new file mode 100644
index 0000000000..19d124a0b3
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
@@ -0,0 +1,28 @@
+// show grids
+void showGrids(int block_size) {
+  ortho(-width, 0, -height, 0);
+  camera(0, 0, 0, 0, 0, 1, 0, 1, 0);
+  stroke(0, 0, 255);
+  for (int i = 0; i < height; i += block_size) {
+    line(0, i, width, i);
+  }
+  for (int i = 0; i < width; i += block_size) {
+    line(i, 0, i, height);
+  }
+}
+
+// save the point cloud information
+void savePointCloud(PointCloud point_cloud, String file_name) {
+  String[] positions = new String[point_cloud.points.size()];
+  String[] colors = new
String[point_cloud.points.size()];
+  for (int i = 0; i < point_cloud.points.size(); i++) {
+    PVector point = point_cloud.getPosition(i);
+    color point_color = point_cloud.getColor(i);
+    positions[i] = str(point.x) + ' ' + str(point.y) + ' ' + str(point.z);
+    colors[i] = str(((point_color >> 16) & 0xFF) / 255.0) + ' ' +
+                str(((point_color >> 8) & 0xFF) / 255.0) + ' ' +
+                str((point_color & 0xFF) / 255.0);
+  }
+  saveStrings(file_name + "_pos.txt", positions);
+  saveStrings(file_name + "_color.txt", colors);
+}
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
new file mode 100644
index 0000000000..22a495432d
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
@@ -0,0 +1,74 @@
+/*The dataset is from
+ *Computer Vision Group
+ *TUM Department of Informatics Technical
+ *University of Munich
+ *https://vision.in.tum.de/data/datasets/rgbd-dataset/download#freiburg1_xyz
+ */
+Scene scene;
+void setup() {
+  size(640, 480, P3D);
+  // default settings
+  int frame_no = 0;            // frame number
+  float fov = PI / 3;          // field of view
+  int block_size = 8;          // block size
+  float normalizer = 5000.0f;  // normalizer
+  // initialize
+  PointCloud point_cloud = new PointCloud();
+  // synchronized rgb, depth and ground truth
+  String head = "../data/";
+  String[] rgb_depth_gt = loadStrings(head + "rgb_depth_groundtruth.txt");
+  // read in rgb and depth image file paths as well as the corresponding
+  // camera position and quaternion
+  String[] info = split(rgb_depth_gt[frame_no], ' ');
+  String rgb_path = head + info[1];
+  String depth_path = head + info[3];
+  float tx = float(info[7]), ty = float(info[8]),
+        tz = float(info[9]);  // real camera position
+  float qx = float(info[10]), qy = float(info[11]), qz = float(info[12]),
+        qw = float(info[13]);  // quaternion
+
+  // build transformer
+  Transform trans =
+      new Transform(tx, ty, tz, qx, qy, qz, qw, fov, width, height, normalizer);
+  PImage rgb = loadImage(rgb_path);
+  PImage depth = loadImage(depth_path);
+  // generate point cloud
+  point_cloud.generate(rgb, depth, trans);
+  // initialize camera
+  Camera camera = new Camera(fov, new PVector(0, 0, 0), new PVector(0, 0, 1),
+                             new PVector(0, 1, 0));
+  // initialize motion field
+  MotionField motion_field = new MotionField(block_size);
+  // initialize scene
+  scene = new Scene(camera, point_cloud, motion_field);
+}
+boolean inter = false;
+void draw() {
+  background(0);
+  // run camera: drag the mouse to rotate the camera
+  // w: go forward
+  // s: go backward
+  // a: go left
+  // d: go right
+  // up arrow: go up
+  // down arrow: go down
+  // +: increase move speed
+  // -: decrease move speed
+  // r: rotate the camera
+  // b: reset to initial position
+  scene.run();  // true: make interpolation; false: do not make
+                // interpolation
+  if (keyPressed && key == 'o') {
+    inter = true;
+  }
+  scene.render(
+      false);  // true: turn on motion field; false: turn off motion field
+  // save frame with no motion field
+  scene.save("../data/frame/raw");
+  background(0);
+  scene.render(true);
+  showGrids(scene.motion_field.block_size);
+  // save frame with motion field
+  scene.save("../data/frame/raw_mv");
+  scene.saveMotionField("../data/frame/mv");
+}
diff --git a/libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py b/libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py
new file mode 100644
index 0000000000..513faa435f
--- /dev/null
+++
b/libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py @@ -0,0 +1,186 @@ +import sys +import matplotlib.pyplot as plt +from matplotlib.collections import LineCollection +from matplotlib import colors as mcolors +import numpy as np +import math + + +def draw_mv_ls(axis, mv_ls, mode=0): + colors = np.array([(1., 0., 0., 1.)]) + segs = np.array([ + np.array([[ptr[0], ptr[1]], [ptr[0] + ptr[2], ptr[1] + ptr[3]]]) + for ptr in mv_ls + ]) + line_segments = LineCollection( + segs, linewidths=(1.,), colors=colors, linestyle='solid') + axis.add_collection(line_segments) + if mode == 0: + axis.scatter(mv_ls[:, 0], mv_ls[:, 1], s=2, c='b') + else: + axis.scatter( + mv_ls[:, 0] + mv_ls[:, 2], mv_ls[:, 1] + mv_ls[:, 3], s=2, c='b') + + +def draw_pred_block_ls(axis, mv_ls, bs, mode=0): + colors = np.array([(0., 0., 0., 1.)]) + segs = [] + for ptr in mv_ls: + if mode == 0: + x = ptr[0] + y = ptr[1] + else: + x = ptr[0] + ptr[2] + y = ptr[1] + ptr[3] + x_ls = [x, x + bs, x + bs, x, x] + y_ls = [y, y, y + bs, y + bs, y] + + segs.append(np.column_stack([x_ls, y_ls])) + line_segments = LineCollection( + segs, linewidths=(.5,), colors=colors, linestyle='solid') + axis.add_collection(line_segments) + + +def read_frame(fp, no_swap=0): + plane = [None, None, None] + for i in range(3): + line = fp.readline() + word_ls = line.split() + word_ls = [int(item) for item in word_ls] + rows = word_ls[0] + cols = word_ls[1] + + line = fp.readline() + word_ls = line.split() + word_ls = [int(item) for item in word_ls] + + plane[i] = np.array(word_ls).reshape(rows, cols) + if i > 0: + plane[i] = plane[i].repeat(2, axis=0).repeat(2, axis=1) + plane = np.array(plane) + if no_swap == 0: + plane = np.swapaxes(np.swapaxes(plane, 0, 1), 1, 2) + return plane + + +def yuv_to_rgb(yuv): + #mat = np.array([ + # [1.164, 0 , 1.596 ], + # [1.164, -0.391, -0.813], + # [1.164, 2.018 , 0 ] ] + # ) + #c = np.array([[ -16 , -16 , -16 ], + # [ 0 , -128, -128 ], + # [ -128, -128, 0 ]]) + + mat = np.array([[1, 0, 1.4075], [1, -0.3445, -0.7169], [1, 1.7790, 0]]) + c = np.array([[0, 0, 0], [0, -128, -128], [-128, -128, 0]]) + mat_c = np.dot(mat, c) + v = np.array([mat_c[0, 0], mat_c[1, 1], mat_c[2, 2]]) + mat = mat.transpose() + rgb = np.dot(yuv, mat) + v + rgb = rgb.astype(int) + rgb = rgb.clip(0, 255) + return rgb / 255. + + +def read_feature_score(fp, mv_rows, mv_cols): + line = fp.readline() + word_ls = line.split() + feature_score = np.array([math.log(float(v) + 1, 2) for v in word_ls]) + feature_score = feature_score.reshape(mv_rows, mv_cols) + return feature_score + +def read_mv_mode_arr(fp, mv_rows, mv_cols): + line = fp.readline() + word_ls = line.split() + mv_mode_arr = np.array([int(v) for v in word_ls]) + mv_mode_arr = mv_mode_arr.reshape(mv_rows, mv_cols) + return mv_mode_arr + + +def read_frame_dpl_stats(fp): + line = fp.readline() + word_ls = line.split() + frame_idx = int(word_ls[1]) + mi_rows = int(word_ls[3]) + mi_cols = int(word_ls[5]) + bs = int(word_ls[7]) + ref_frame_idx = int(word_ls[9]) + rf_idx = int(word_ls[11]) + gf_frame_offset = int(word_ls[13]) + ref_gf_frame_offset = int(word_ls[15]) + mi_size = bs / 8 + mv_ls = [] + mv_rows = int((math.ceil(mi_rows * 1. / mi_size))) + mv_cols = int((math.ceil(mi_cols * 1. / mi_size))) + for i in range(mv_rows * mv_cols): + line = fp.readline() + word_ls = line.split() + row = int(word_ls[0]) * 8. + col = int(word_ls[1]) * 8. + mv_row = int(word_ls[2]) / 8. + mv_col = int(word_ls[3]) / 8. 
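+    # editor's note: block positions are stored in 8-pixel mi units (hence
+    # the * 8.), while the motion vectors are in eighth-pel units (hence
+    # the / 8.), so both end up in pixel coordinates here.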
+ mv_ls.append([col, row, mv_col, mv_row]) + mv_ls = np.array(mv_ls) + feature_score = read_feature_score(fp, mv_rows, mv_cols) + mv_mode_arr = read_mv_mode_arr(fp, mv_rows, mv_cols) + img = yuv_to_rgb(read_frame(fp)) + ref = yuv_to_rgb(read_frame(fp)) + return rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score, mv_mode_arr + + +def read_dpl_stats_file(filename, frame_num=0): + fp = open(filename) + line = fp.readline() + width = 0 + height = 0 + data_ls = [] + while (line): + if line[0] == '=': + data_ls.append(read_frame_dpl_stats(fp)) + line = fp.readline() + if frame_num > 0 and len(data_ls) == frame_num: + break + return data_ls + + +if __name__ == '__main__': + filename = sys.argv[1] + data_ls = read_dpl_stats_file(filename, frame_num=5) + for rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score, mv_mode_arr in data_ls: + fig, axes = plt.subplots(2, 2) + + axes[0][0].imshow(img) + draw_mv_ls(axes[0][0], mv_ls) + draw_pred_block_ls(axes[0][0], mv_ls, bs, mode=0) + #axes[0].grid(color='k', linestyle='-') + axes[0][0].set_ylim(img.shape[0], 0) + axes[0][0].set_xlim(0, img.shape[1]) + + if ref is not None: + axes[0][1].imshow(ref) + draw_mv_ls(axes[0][1], mv_ls, mode=1) + draw_pred_block_ls(axes[0][1], mv_ls, bs, mode=1) + #axes[1].grid(color='k', linestyle='-') + axes[0][1].set_ylim(ref.shape[0], 0) + axes[0][1].set_xlim(0, ref.shape[1]) + + axes[1][0].imshow(feature_score) + #feature_score_arr = feature_score.flatten() + #feature_score_max = feature_score_arr.max() + #feature_score_min = feature_score_arr.min() + #step = (feature_score_max - feature_score_min) / 20. + #feature_score_bins = np.arange(feature_score_min, feature_score_max, step) + #axes[1][1].hist(feature_score_arr, bins=feature_score_bins) + im = axes[1][1].imshow(mv_mode_arr) + #axes[1][1].figure.colorbar(im, ax=axes[1][1]) + + print rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, len(mv_ls) + + flatten_mv_mode = mv_mode_arr.flatten() + zero_mv_count = sum(flatten_mv_mode == 0); + new_mv_count = sum(flatten_mv_mode == 1); + ref_mv_count = sum(flatten_mv_mode == 2) + sum(flatten_mv_mode == 3); + print zero_mv_count, new_mv_count, ref_mv_count + plt.show() diff --git a/libs/libvpx/tools/set_analyzer_env.sh b/libs/libvpx/tools/set_analyzer_env.sh new file mode 100644 index 0000000000..4bdbba6523 --- /dev/null +++ b/libs/libvpx/tools/set_analyzer_env.sh @@ -0,0 +1,142 @@ +## Copyright (c) 2018 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## Sourcing this file sets environment variables to simplify setting up +## sanitizer builds and testing. + +sanitizer="${1}" + +case "${sanitizer}" in + address) ;; + cfi) ;; + integer) ;; + memory) ;; + thread) ;; + undefined) ;; + clear) + echo "Clearing environment:" + set -x + unset CC CXX LD AR + unset CFLAGS CXXFLAGS LDFLAGS + unset ASAN_OPTIONS MSAN_OPTIONS TSAN_OPTIONS UBSAN_OPTIONS + set +x + return + ;; + *) + echo "Usage: source set_analyzer_env.sh [|clear]" + echo " Supported sanitizers:" + echo " address cfi integer memory thread undefined" + return 1 + ;; +esac + +if [ ! 
$(which clang) ]; then
+  # TODO(johannkoenig): Support gcc analyzers.
+  echo "ERROR: 'clang' must be in your PATH"
+  return 1
+fi
+
+# Warnings.
+if [ "${sanitizer}" = "undefined" -o "${sanitizer}" = "integer" ]; then
+  echo "WARNING: When building the ${sanitizer} sanitizer for 32 bit targets"
+  echo "you must run:"
+  echo "export LDFLAGS=\"\${LDFLAGS} --rtlib=compiler-rt -lgcc_s\""
+  echo "See http://llvm.org/bugs/show_bug.cgi?id=17693 for details."
+fi
+
+if [ "${sanitizer}" = "undefined" ]; then
+  major_version=$(clang --version | head -n 1 \
+    | grep -o -E "[[:digit:]]\.[[:digit:]]\.[[:digit:]]" | cut -f1 -d.)
+  if [ ${major_version} -eq 5 ]; then
+    echo "WARNING: clang v5 has a problem with vp9 x86_64 high bit depth"
+    echo "configurations. It can take ~40 minutes to compile"
+    echo "vpx_dsp/x86/fwd_txfm_sse2.c"
+    echo "clang v4 did not have this issue."
+  fi
+fi
+
+echo "It is recommended to configure with '--enable-debug' to improve stack"
+echo "traces. On mac builds, run 'dsymutil' on the output binaries (vpxenc,"
+echo "test_libvpx, etc) to link the stack traces to source code lines."
+
+# Build configuration.
+cflags="-fsanitize=${sanitizer}"
+ldflags="-fsanitize=${sanitizer}"
+
+# http://code.google.com/p/webm/issues/detail?id=570
+cflags="${cflags} -fno-strict-aliasing"
+# Useful backtraces.
+cflags="${cflags} -fno-omit-frame-pointer"
+# Exact backtraces.
+cflags="${cflags} -fno-optimize-sibling-calls"
+
+if [ "${sanitizer}" = "cfi" ]; then
+  # https://clang.llvm.org/docs/ControlFlowIntegrity.html
+  cflags="${cflags} -fno-sanitize-trap=cfi -flto -fvisibility=hidden"
+  ldflags="${ldflags} -fno-sanitize-trap=cfi -flto -fuse-ld=gold"
+  export AR="llvm-ar"
+fi
+
+# TODO(http://crbug.com/webm/1615): -fsanitize=implicit-integer-truncation
+# causes conversion warnings in many of the x86 intrinsics and elsewhere.
+if [ "${sanitizer}" = "integer" ]; then
+  major_version=$(clang --version | head -n 1 \
+    | grep -o -E "[[:digit:]]\.[[:digit:]]\.[[:digit:]]" | cut -f1 -d.)
+  if [ ${major_version} -ge 7 ]; then
+    cflags="${cflags} -fno-sanitize=implicit-integer-truncation"
+    ldflags="${ldflags} -fno-sanitize=implicit-integer-truncation"
+  fi
+fi
+
+set -x
+export CC="clang"
+export CXX="clang++"
+export LD="clang++"
+
+export CFLAGS="${cflags}"
+export CXXFLAGS="${cflags}"
+export LDFLAGS="${ldflags}"
+set +x
+
+# Execution configuration.
+sanitizer_options=""
+sanitizer_options="${sanitizer_options}:handle_segv=1"
+sanitizer_options="${sanitizer_options}:handle_abort=1"
+sanitizer_options="${sanitizer_options}:handle_sigfpe=1"
+sanitizer_options="${sanitizer_options}:fast_unwind_on_fatal=1"
+sanitizer_options="${sanitizer_options}:allocator_may_return_null=1"
+
+case "${sanitizer}" in
+  address)
+    sanitizer_options="${sanitizer_options}:detect_stack_use_after_return=1"
+    sanitizer_options="${sanitizer_options}:max_uar_stack_size_log=17"
+    set -x
+    export ASAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+  cfi)
+    # No environment settings
+    ;;
+  memory)
+    set -x
+    export MSAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+  thread)
+    # The thread sanitizer uses an entirely independent set of options.
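+    # editor's note: halt_on_error=1 makes TSan abort on the first report,
+    # so a failing test exits non-zero instead of just logging the race.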
+ set -x + export TSAN_OPTIONS="halt_on_error=1" + set +x + ;; + undefined|integer) + sanitizer_options="${sanitizer_options}:print_stacktrace=1" + set -x + export UBSAN_OPTIONS="${sanitizer_options}" + set +x + ;; +esac diff --git a/libs/libvpx/tools/tiny_ssim.c b/libs/libvpx/tools/tiny_ssim.c index 5e8ca02b49..ff4634ade4 100644 --- a/libs/libvpx/tools/tiny_ssim.c +++ b/libs/libvpx/tools/tiny_ssim.c @@ -34,6 +34,10 @@ static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, unsigned int row, col; uint64_t total_sse = 0; int diff; + if (orig == NULL || recon == NULL) { + assert(0); + return 0; + } for (row = 0; row < rows; row++) { for (col = 0; col < cols; col++) { @@ -46,13 +50,18 @@ static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, } return total_sse; } -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH + static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, int recon_stride, unsigned int cols, unsigned int rows) { unsigned int row, col; uint64_t total_sse = 0; int diff; + if (orig == NULL || recon == NULL) { + assert(0); + return 0; + } for (row = 0; row < rows; row++) { for (col = 0; col < cols; col++) { @@ -91,40 +100,43 @@ typedef struct input_file { int w; int h; int bit_depth; + int frame_size; } input_file_t; // Open a file and determine if its y4m or raw. If y4m get the header. static int open_input_file(const char *file_name, input_file_t *input, int w, int h, int bit_depth) { char y4m_buf[4]; - size_t r1; + input->w = w; + input->h = h; + input->bit_depth = bit_depth; input->type = RAW_YUV; input->buf = NULL; input->file = strcmp(file_name, "-") ? fopen(file_name, "rb") : stdin; if (input->file == NULL) return -1; - r1 = fread(y4m_buf, 1, 4, input->file); - if (r1 == 4) { - if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; - switch (input->type) { - case Y4M: - y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); - input->w = input->y4m.pic_w; - input->h = input->y4m.pic_h; - input->bit_depth = input->y4m.bit_depth; - // Y4M alloc's its own buf. Init this to avoid problems if we never - // read frames. - memset(&input->img, 0, sizeof(input->img)); - break; - case RAW_YUV: - fseek(input->file, 0, SEEK_SET); - input->w = w; - input->h = h; - if (bit_depth < 9) - input->buf = malloc(w * h * 3 / 2); - else - input->buf = malloc(w * h * 3); - break; - } + if (fread(y4m_buf, 1, 4, input->file) != 4) return -1; + if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; + switch (input->type) { + case Y4M: + y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); + input->w = input->y4m.pic_w; + input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; + // Y4M alloc's its own buf. Init this to avoid problems if we never + // read frames. 
+ memset(&input->img, 0, sizeof(input->img)); + break; + case RAW_YUV: + fseek(input->file, 0, SEEK_SET); + input->w = w; + input->h = h; + // handle odd frame sizes + input->frame_size = w * h + ((w + 1) / 2) * ((h + 1) / 2) * 2; + if (bit_depth > 8) { + input->frame_size *= 2; + } + input->buf = malloc(input->frame_size); + break; } return 0; } @@ -150,15 +162,15 @@ static size_t read_input_file(input_file_t *in, unsigned char **y, break; case RAW_YUV: if (bd < 9) { - r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; *u = in->buf + in->w * in->h; - *v = in->buf + 5 * in->w * in->h / 4; + *v = *u + ((1 + in->w) / 2) * ((1 + in->h) / 2); } else { - r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; - *u = in->buf + in->w * in->h / 2; - *v = *u + in->w * in->h / 2; + *u = in->buf + (in->w * in->h) * 2; + *v = *u + 2 * ((1 + in->w) / 2) * ((1 + in->h) / 2); } break; } @@ -166,24 +178,15 @@ static size_t read_input_file(input_file_t *in, unsigned char **y, return r1; } -void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { +static void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; - for (i = 0; i < 16; i++, s += sp, r += rp) { - for (j = 0; j < 16; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } + if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL || + sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) { + assert(0); + return; } -} -void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { - int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; @@ -195,10 +198,17 @@ void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, } } -void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { int i, j; + if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL || + sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) { + assert(0); + return; + } for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; @@ -209,11 +219,12 @@ void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, } } } +#endif // CONFIG_VP9_HIGHBITDEPTH static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { - int64_t ssim_n, ssim_d; + double ssim_n, ssim_d; int64_t c1 = 0, c2 = 0; if (bd == 8) { // scale the constants by number of pixels @@ -229,14 +240,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr 
- 2.0 * sum_s * sum_r + c2); - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); - return ssim_n * 1.0 / ssim_d; + return ssim_n / ssim_d; } static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { @@ -245,14 +256,15 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } +#if CONFIG_VP9_HIGHBITDEPTH static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t bd, uint32_t shift) { + int rp, uint32_t bd) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, bd); } +#endif // CONFIG_VP9_HIGHBITDEPTH // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap @@ -276,9 +288,10 @@ static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, return ssim_total; } +#if CONFIG_VP9_HIGHBITDEPTH static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, - int height, uint32_t bd, uint32_t shift) { + int height, uint32_t bd) { int i, j; int samples = 0; double ssim_total = 0; @@ -287,9 +300,9 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, for (i = 0; i <= height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j <= width - 8; j += 4) { - double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, - shift); + double v = + highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd); ssim_total += v; samples++; } @@ -297,277 +310,7 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, ssim_total /= samples; return ssim_total; } - -// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity -// -// Re working out the math -> -// -// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / -// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) -// -// mean(x) = sum(x) / n -// -// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) -// -// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) -// -// ssim(x,y) = -// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / -// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * -// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ -// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) -// -// factoring out n*n -// -// ssim(x,y) = -// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / -// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * -// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) -// -// Replace c1 with n*n * c1 for the final step that leads to this code: -// The final step scales by 12 bits so we don't lose precision in the constants. - -static double ssimv_similarity(const Ssimv *sv, int64_t n) { - // Scale the constants by number of pixels. 
- const int64_t c1 = (cc1 * n * n) >> 12; - const int64_t c2 = (cc2 * n * n) >> 12; - - const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / - (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); - - // Since these variables are unsigned sums, convert to double so - // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / - (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); - - return l * v; -} - -// The first term of the ssim metric is a luminance factor. -// -// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) -// -// This luminance factor is super sensitive to the dark side of luminance -// values and completely insensitive on the white side. check out 2 sets -// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 -// 2*250*252/ (250^2+252^2) => .99999997 -// -// As a result in this tweaked version of the calculation in which the -// luminance is taken as percentage off from peak possible. -// -// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count -// -static double ssimv_similarity2(const Ssimv *sv, int64_t n) { - // Scale the constants by number of pixels. - const int64_t c1 = (cc1 * n * n) >> 12; - const int64_t c2 = (cc2 * n * n) >> 12; - - const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; - const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); - - // Since these variables are unsigned, sums convert to double so - // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / - (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); - - return l * v; -} -static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, Ssimv *sv) { - ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, - &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); -} - -double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, int width, int height, Ssimv *sv2, - Metrics *m, int do_inconsistency) { - double dssim_total = 0; - double ssim_total = 0; - double ssim2_total = 0; - double inconsistency_total = 0; - int i, j; - int c = 0; - double norm; - double old_ssim_total = 0; - - // We can sample points as frequently as we like start with 1 per 4x4. - for (i = 0; i < height; - i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { - for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0, 0, 0, 0, 0, 0 }; - double ssim; - double ssim2; - double dssim; - uint32_t var_new; - uint32_t var_old; - uint32_t mean_new; - uint32_t mean_old; - double ssim_new; - double ssim_old; - - // Not sure there's a great way to handle the edge pixels - // in ssim when using a window. Seems biased against edge pixels - // however you handle this. This uses only samples that are - // fully in the frame. - if (j + 8 <= width && i + 8 <= height) { - ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); - } - - ssim = ssimv_similarity(&sv, 64); - ssim2 = ssimv_similarity2(&sv, 64); - - sv.ssim = ssim2; - - // dssim is calculated to use as an actual error metric and - // is scaled up to the same range as sum square error. - // Since we are subsampling every 16th point maybe this should be - // *16 ? - dssim = 255 * 255 * (1 - ssim2) / 2; - - // Here I introduce a new error metric: consistency-weighted - // SSIM-inconsistency. This metric isolates frames where the - // SSIM 'suddenly' changes, e.g. 
if one frame in every 8 is much - // sharper or blurrier than the others. Higher values indicate a - // temporally inconsistent SSIM. There are two ideas at work: - // - // 1) 'SSIM-inconsistency': the total inconsistency value - // reflects how much SSIM values are changing between this - // source / reference frame pair and the previous pair. - // - // 2) 'consistency-weighted': weights de-emphasize areas in the - // frame where the scene content has changed. Changes in scene - // content are detected via changes in local variance and local - // mean. - // - // Thus the overall measure reflects how inconsistent the SSIM - // values are, over consistent regions of the frame. - // - // The metric has three terms: - // - // term 1 -> uses change in scene Variance to weight error score - // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) - // larger changes from one frame to the next mean we care - // less about consistency. - // - // term 2 -> uses change in local scene luminance to weight error - // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) - // larger changes from one frame to the next mean we care - // less about consistency. - // - // term3 -> measures inconsistency in ssim scores between frames - // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). - // - // This term compares the ssim score for the same location in 2 - // subsequent frames. - var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; - var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; - mean_new = sv.sum_s; - mean_old = sv2[c].sum_s; - ssim_new = sv.ssim; - ssim_old = sv2[c].ssim; - - if (do_inconsistency) { - // We do the metric once for every 4x4 block in the image. Since - // we are scaling the error to SSE for use in a psnr calculation - // 1.0 = 4x4x255x255 the worst error we can possibly have. - static const double kScaling = 4. * 4 * 255 * 255; - - // The constants have to be non 0 to avoid potential divide by 0 - // issues other than that they affect kind of a weighting between - // the terms. No testing of what the right terms should be has been - // done. - static const double c1 = 1, c2 = 1, c3 = 1; - - // This measures how much consistent variance is in two consecutive - // source frames. 1.0 means they have exactly the same variance. - const double variance_term = - (2.0 * var_old * var_new + c1) / - (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); - - // This measures how consistent the local mean are between two - // consecutive frames. 1.0 means they have exactly the same mean. - const double mean_term = - (2.0 * mean_old * mean_new + c2) / - (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); - - // This measures how consistent the ssims of two - // consecutive frames is. 1.0 means they are exactly the same. - double ssim_term = - pow((2.0 * ssim_old * ssim_new + c3) / - (ssim_old * ssim_old + ssim_new * ssim_new + c3), - 5); - - double this_inconsistency; - - // Floating point math sometimes makes this > 1 by a tiny bit. - // We want the metric to scale between 0 and 1.0 so we can convert - // it to an snr scaled value. - if (ssim_term > 1) ssim_term = 1; - - // This converts the consistency metric to an inconsistency metric - // ( so we can scale it like psnr to something like sum square error. - // The reason for the variance and mean terms is the assumption that - // if there are big changes in the source we shouldn't penalize - // inconsistency in ssim scores a bit less as it will be less visible - // to the user. 
- this_inconsistency = (1 - ssim_term) * variance_term * mean_term; - - this_inconsistency *= kScaling; - inconsistency_total += this_inconsistency; - } - sv2[c] = sv; - ssim_total += ssim; - ssim2_total += ssim2; - dssim_total += dssim; - - old_ssim_total += ssim_old; - } - old_ssim_total += 0; - } - - norm = 1. / (width / 4) / (height / 4); - ssim_total *= norm; - ssim2_total *= norm; - m->ssim2 = ssim2_total; - m->ssim = ssim_total; - if (old_ssim_total == 0) inconsistency_total = 0; - - m->ssimc = inconsistency_total; - - m->dssim = dssim_total; - return inconsistency_total; -} - -double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - uint32_t bd, uint32_t in_bd) { - double a, b, c; - double ssimv; - uint32_t shift = 0; - - assert(bd >= in_bd); - shift = bd - in_bd; - - a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, - dest->y_stride, source->y_crop_width, source->y_crop_height, - in_bd, shift); - - b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - ssimv = a * .8 + .1 * (b + c); - - *weight = 1; - - return ssimv; -} +#endif // CONFIG_VP9_HIGHBITDEPTH int main(int argc, char *argv[]) { FILE *framestats = NULL; @@ -583,13 +326,14 @@ int main(int argc, char *argv[]) { input_file_t in[2]; double peak = 255.0; + memset(in, 0, sizeof(in)); + if (argc < 2) { fprintf(stderr, "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" "[WxH tl_skip={0,1,3} frame_stats_file bits]\n", argv[0]); - return_value = 1; - goto clean_up; + return 1; } if (argc > 3) { @@ -601,7 +345,7 @@ int main(int argc, char *argv[]) { } if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { - fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[1]); goto clean_up; } @@ -613,7 +357,7 @@ int main(int argc, char *argv[]) { } if (bit_depth == 10) peak = 1023.0; - if (bit_depth == 12) peak = 4095; + if (bit_depth == 12) peak = 4095.0; if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); @@ -628,9 +372,19 @@ int main(int argc, char *argv[]) { goto clean_up; } - // Number of frames to skip from file1.yuv for every frame used. Normal values - // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding - // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + if (in[0].bit_depth != in[1].bit_depth) { + fprintf(stderr, + "Failing: Image bit depths don't match or are unspecified!\n"); + return_value = 1; + goto clean_up; + } + + bit_depth = in[0].bit_depth; + + // Number of frames to skip from file1.yuv for every frame used. Normal + // values 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL + // encoding in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer + // encoding. 
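+  // For example, tl_skip=3 compares every fourth frame of file1 (frames 0,
+  // 4, 8, ...) against consecutive frames of file2.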
 if (argc > 4) {
   sscanf(argv[4], "%d", &tl_skip);
   if (argc > 5) {
@@ -644,12 +398,6 @@
     }
   }
 
-  if (w & 1 || h & 1) {
-    fprintf(stderr, "Invalid size %dx%d\n", w, h);
-    return_value = 1;
-    goto clean_up;
-  }
-
   while (1) {
     size_t r1, r2;
     unsigned char *y[2], *u[2], *v[2];
@@ -683,7 +431,7 @@
     psnr = calc_plane_error(buf0, w, buf1, w, w, h);                          \
   } else {                                                                    \
     ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \
-                        w, w, h, bit_depth, bit_depth - 8);                   \
+                        w, w, h, bit_depth);                                  \
     psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w,                      \
                               CAST_TO_SHORTPTR(buf1), w, w, h);               \
   }
@@ -691,7 +439,7 @@
 #define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
   ssim = ssim2(buf0, buf1, w, w, w, h);             \
   psnr = calc_plane_error(buf0, w, buf1, w, w, h);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   if (n_frames == allocated_frames) {
     allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2;
@@ -703,8 +451,10 @@
     psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv));
   }
   psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h);
-  psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], w / 2, h / 2);
-  psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], w / 2, h / 2);
+  psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2,
+                (h + 1) / 2);
+  psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], (w + 1) / 2,
+                (h + 1) / 2);
 
   n_frames++;
 }
diff --git a/libs/libvpx/tools_common.c b/libs/libvpx/tools_common.c
index 6f14c25561..59978b7f93 100644
--- a/libs/libvpx/tools_common.c
+++ b/libs/libvpx/tools_common.c
@@ -46,6 +46,14 @@
     va_end(ap);                          \
   } while (0)
 
+#if CONFIG_ENCODERS
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+  return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+#endif
+
 FILE *set_binary_mode(FILE *stream) {
   (void)stream;
 #if defined(_WIN32) || defined(__OS2__)
@@ -200,8 +208,6 @@ const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) {
 
 #endif  // CONFIG_DECODERS
 
-// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
-// of vpx_image_t support
 int vpx_img_plane_width(const vpx_image_t *img, int plane) {
   if (plane > 0 && img->x_chroma_shift > 0)
     return (img->d_w + 1) >> img->x_chroma_shift;
@@ -266,6 +272,88 @@ double sse_to_psnr(double samples, double peak, double sse) {
   }
 }
 
+#if CONFIG_ENCODERS
+int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
+  FILE *f = input_ctx->file;
+  y4m_input *y4m = &input_ctx->y4m;
+  int shortread = 0;
+
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+  } else {
+    shortread = read_yuv_frame(input_ctx, img);
+  }
+
+  return !shortread;
+}
+
+int file_is_y4m(const char detect[4]) {
+  if (memcmp(detect, "YUV4", 4) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
+int fourcc_is_ivf(const char detect[4]) {
+  if (memcmp(detect, "DKIF", 4) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
+void open_input_file(struct VpxInputContext *input) {
+  /* Parse certain options from the input file, if possible */
+  input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb")
+                                             : set_binary_mode(stdin);
+
+  if (!input->file) fatal("Failed to open input file");
+
+  if (!fseeko(input->file, 0, SEEK_END)) {
+    /* Input file is seekable. Figure out how long it is, so we can get
+     * progress info.
+     */
+    input->length = ftello(input->file);
+    rewind(input->file);
+  }
+
+  /* Default to 1:1 pixel aspect ratio. */
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
+  /* For RAW input sources, these bytes will be applied on the first frame
+   * in read_frame().
+   */
+  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+  input->detect.position = 0;
+
+  if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
+      input->file_type = FILE_TYPE_Y4M;
+      input->width = input->y4m.pic_w;
+      input->height = input->y4m.pic_h;
+      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+      input->framerate.numerator = input->y4m.fps_n;
+      input->framerate.denominator = input->y4m.fps_d;
+      input->fmt = input->y4m.vpx_fmt;
+      input->bit_depth = input->y4m.bit_depth;
+    } else {
+      fatal("Unsupported Y4M stream.");
+    }
+  } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
+    fatal("IVF is not supported as input.");
+  } else {
+    input->file_type = FILE_TYPE_RAW;
+  }
+}
+
+void close_input_file(struct VpxInputContext *input) {
+  fclose(input->file);
+  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+#endif
+
 // TODO(debargha): Consolidate the functions below into a separate file.
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
@@ -459,3 +547,225 @@ void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) {
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) {
+  uint32_t l_w = img1->d_w;
+  uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  uint32_t i;
+  int match = 1;
+
+  match &= (img1->fmt == img2->fmt);
+  match &= (img1->d_w == img2->d_w);
+  match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    l_w *= 2;
+    c_w *= 2;
+  }
+#endif
+
+  for (i = 0; i < img1->d_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     l_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     c_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     c_w) == 0);
+
+  return match;
+}
+
+#define mmin(a, b) ((a) < (b) ? (a) : (b))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void find_mismatch_high(const vpx_image_t *const img1,
+                        const vpx_image_t *const img2, int yloc[4], int uloc[4],
+                        int vloc[4]) {
+  uint16_t *plane1, *plane2;
+  uint32_t stride1, stride2;
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y];
+  plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y];
+  stride1 = img1->stride[VPX_PLANE_Y] / 2;
+  stride2 = img2->stride[VPX_PLANE_Y] / 2;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  plane1 = (uint16_t *)img1->planes[VPX_PLANE_U];
+  plane2 = (uint16_t *)img2->planes[VPX_PLANE_U];
+  stride1 = img1->stride[VPX_PLANE_U] / 2;
+  stride2 = img2->stride[VPX_PLANE_U] / 2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  plane1 = (uint16_t *)img1->planes[VPX_PLANE_V];
+  plane2 = (uint16_t *)img2->planes[VPX_PLANE_V];
+  stride1 = img1->stride[VPX_PLANE_V] / 2;
+  stride2 = img2->stride[VPX_PLANE_V] / 2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2,
+                   int yloc[4], int uloc[4], int vloc[4]) {
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_Y] +
+                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
+              *(img2->planes[VPX_PLANE_Y] +
+                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
+                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
+            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
+                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_U] +
+                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
+              *(img2->planes[VPX_PLANE_U] +
+                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(img1->planes[VPX_PLANE_U] +
+                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
+            uloc[3] = *(img2->planes[VPX_PLANE_U] +
+                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_V] +
+                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
+              *(img2->planes[VPX_PLANE_V] +
+                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(img1->planes[VPX_PLANE_V] +
+                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
+            vloc[3] = *(img2->planes[VPX_PLANE_V] +
+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/libs/libvpx/tools_common.h b/libs/libvpx/tools_common.h
index e41de3195f..4526d9f165 100644
--- a/libs/libvpx/tools_common.h
+++ b/libs/libvpx/tools_common.h
@@ -7,8 +7,8 @@
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TOOLS_COMMON_H_
-#define TOOLS_COMMON_H_
+#ifndef VPX_TOOLS_COMMON_H_
+#define VPX_TOOLS_COMMON_H_
 
 #include <stdio.h>
 
@@ -33,6 +33,7 @@ typedef int64_t FileOffset;
 #define ftello ftello64
 typedef off64_t FileOffset;
 #elif CONFIG_OS_SUPPORT
+#include <sys/types.h> /* NOLINT */
 typedef off_t FileOffset;
 /* Use 32-bit file operations in WebM file format when building ARM
  * executables (.axf) with RVCT.
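 * (RVCT: ARM's RealView Compilation Tools.)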
*/ @@ -144,8 +145,6 @@ const VpxInterface *get_vpx_decoder_by_index(int i); const VpxInterface *get_vpx_decoder_by_name(const char *name); const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc); -// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part -// of vpx_image_t support int vpx_img_plane_width(const vpx_image_t *img, int plane); int vpx_img_plane_height(const vpx_image_t *img, int plane); void vpx_img_write(const vpx_image_t *img, FILE *file); @@ -153,14 +152,31 @@ int vpx_img_read(vpx_image_t *img, FILE *file); double sse_to_psnr(double samples, double peak, double mse); +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img); +int file_is_y4m(const char detect[4]); +int fourcc_is_ivf(const char detect[4]); +void open_input_file(struct VpxInputContext *input); +void close_input_file(struct VpxInputContext *input); +#endif + #if CONFIG_VP9_HIGHBITDEPTH void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift); void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift); void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src); #endif +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2); +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); +#endif +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]); + #ifdef __cplusplus } /* extern "C" */ #endif -#endif // TOOLS_COMMON_H_ +#endif // VPX_TOOLS_COMMON_H_ diff --git a/libs/libvpx/usage_cx.dox b/libs/libvpx/usage_cx.dox index 92b0d34ef4..b2220cfdde 100644 --- a/libs/libvpx/usage_cx.dox +++ b/libs/libvpx/usage_cx.dox @@ -8,6 +8,8 @@ \ref usage_deadline. + \if samples \ref samples + \endif */ diff --git a/libs/libvpx/usage_dx.dox b/libs/libvpx/usage_dx.dox index 883ce24926..85063f705b 100644 --- a/libs/libvpx/usage_dx.dox +++ b/libs/libvpx/usage_dx.dox @@ -11,7 +11,9 @@ \ref usage_postproc based on the amount of free CPU time. For more information on the deadline parameter, see \ref usage_deadline. + \if samples \ref samples + \endif \section usage_cb Callback Based Decoding diff --git a/libs/libvpx/video_common.h b/libs/libvpx/video_common.h index 44b27a8390..77eb9fac0c 100644 --- a/libs/libvpx/video_common.h +++ b/libs/libvpx/video_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VIDEO_COMMON_H_ -#define VIDEO_COMMON_H_ +#ifndef VPX_VIDEO_COMMON_H_ +#define VPX_VIDEO_COMMON_H_ #include "./tools_common.h" @@ -20,4 +20,4 @@ typedef struct { struct VpxRational time_base; } VpxVideoInfo; -#endif // VIDEO_COMMON_H_ +#endif // VPX_VIDEO_COMMON_H_ diff --git a/libs/libvpx/video_reader.c b/libs/libvpx/video_reader.c index a0ba2521c6..16822eff3c 100644 --- a/libs/libvpx/video_reader.c +++ b/libs/libvpx/video_reader.c @@ -30,17 +30,37 @@ VpxVideoReader *vpx_video_reader_open(const char *filename) { char header[32]; VpxVideoReader *reader = NULL; FILE *const file = fopen(filename, "rb"); - if (!file) return NULL; // Can't open file + if (!file) { + fprintf(stderr, "%s can't be opened.\n", filename); // Can't open file + return NULL; + } - if (fread(header, 1, 32, file) != 32) return NULL; // Can't read file header + if (fread(header, 1, 32, file) != 32) { + fprintf(stderr, "File header on %s can't be read.\n", + filename); // Can't read file header + return NULL; + } + if (memcmp(kIVFSignature, header, 4) != 0) { + fprintf(stderr, "The IVF signature on %s is wrong.\n", + filename); // Wrong IVF signature - if (memcmp(kIVFSignature, header, 4) != 0) - return NULL; // Wrong IVF signature + return NULL; + } + if (mem_get_le16(header + 4) != 0) { + fprintf(stderr, "%s uses the wrong IVF version.\n", + filename); // Wrong IVF version - if (mem_get_le16(header + 4) != 0) return NULL; // Wrong IVF version + return NULL; + } reader = calloc(1, sizeof(*reader)); - if (!reader) return NULL; // Can't allocate VpxVideoReader + if (!reader) { + fprintf( + stderr, + "Can't allocate VpxVideoReader\n"); // Can't allocate VpxVideoReader + + return NULL; + } reader->file = file; reader->info.codec_fourcc = mem_get_le32(header + 8); diff --git a/libs/libvpx/video_reader.h b/libs/libvpx/video_reader.h index 73c25b00a7..1f5c8088bb 100644 --- a/libs/libvpx/video_reader.h +++ b/libs/libvpx/video_reader.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VIDEO_READER_H_ -#define VIDEO_READER_H_ +#ifndef VPX_VIDEO_READER_H_ +#define VPX_VIDEO_READER_H_ #include "./video_common.h" @@ -48,4 +48,4 @@ const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader); } // extern "C" #endif -#endif // VIDEO_READER_H_ +#endif // VPX_VIDEO_READER_H_ diff --git a/libs/libvpx/video_writer.c b/libs/libvpx/video_writer.c index 56d428b072..6e9a848bc3 100644 --- a/libs/libvpx/video_writer.c +++ b/libs/libvpx/video_writer.c @@ -37,11 +37,15 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, if (container == kContainerIVF) { VpxVideoWriter *writer = NULL; FILE *const file = fopen(filename, "wb"); - if (!file) return NULL; - + if (!file) { + fprintf(stderr, "%s can't be written to.\n", filename); + return NULL; + } writer = malloc(sizeof(*writer)); - if (!writer) return NULL; - + if (!writer) { + fprintf(stderr, "Can't allocate VpxVideoWriter.\n"); + return NULL; + } writer->frame_count = 0; writer->info = *info; writer->file = file; @@ -50,7 +54,7 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, return writer; } - + fprintf(stderr, "VpxVideoWriter supports only IVF.\n"); return NULL; } diff --git a/libs/libvpx/video_writer.h b/libs/libvpx/video_writer.h index a769811c44..b4d242b920 100644 --- a/libs/libvpx/video_writer.h +++ b/libs/libvpx/video_writer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VIDEO_WRITER_H_ -#define VIDEO_WRITER_H_ +#ifndef VPX_VIDEO_WRITER_H_ +#define VPX_VIDEO_WRITER_H_ #include "./video_common.h" @@ -41,4 +41,4 @@ int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, } // extern "C" #endif -#endif // VIDEO_WRITER_H_ +#endif // VPX_VIDEO_WRITER_H_ diff --git a/libs/libvpx/vp8/common/alloccommon.h b/libs/libvpx/vp8/common/alloccommon.h index 5d0840c670..2d376bbac3 100644 --- a/libs/libvpx/vp8/common/alloccommon.h +++ b/libs/libvpx/vp8/common/alloccommon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ALLOCCOMMON_H_ -#define VP8_COMMON_ALLOCCOMMON_H_ +#ifndef VPX_VP8_COMMON_ALLOCCOMMON_H_ +#define VPX_VP8_COMMON_ALLOCCOMMON_H_ #include "onyxc_int.h" @@ -21,10 +21,10 @@ void vp8_create_common(VP8_COMMON *oci); void vp8_remove_common(VP8_COMMON *oci); void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); -void vp8_setup_version(VP8_COMMON *oci); +void vp8_setup_version(VP8_COMMON *cm); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_ALLOCCOMMON_H_ +#endif // VPX_VP8_COMMON_ALLOCCOMMON_H_ diff --git a/libs/libvpx/vp8/common/arm/loopfilter_arm.c b/libs/libvpx/vp8/common/arm/loopfilter_arm.c index e12f65a042..48a1972048 100644 --- a/libs/libvpx/vp8/common/arm/loopfilter_arm.c +++ b/libs/libvpx/vp8/common/arm/loopfilter_arm.c @@ -8,28 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vp8/common/arm/loopfilter_arm.h" #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" -typedef void loopfilter_y_neon(unsigned char *src, int pitch, - unsigned char blimit, unsigned char limit, - unsigned char thresh); -typedef void loopfilter_uv_neon(unsigned char *u, int pitch, - unsigned char blimit, unsigned char limit, - unsigned char thresh, unsigned char *v); - -extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; -extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; - -extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; -extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; - /* NEON loopfilter functions */ /* Horizontal MB filtering */ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, diff --git a/libs/libvpx/vp8/common/arm/loopfilter_arm.h b/libs/libvpx/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 0000000000..6cf660d228 --- /dev/null +++ b/libs/libvpx/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+                               unsigned char blimit, unsigned char limit,
+                               unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+                                unsigned char blimit, unsigned char limit,
+                                unsigned char thresh, unsigned char *v);
+
+loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+
+loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+
+#endif // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
diff --git a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
index 8520ab5ca0..590956dde1 100644
--- a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,9 @@
 #include <arm_neon.h>
 #include <string.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
 static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
diff --git a/libs/libvpx/vp8/common/arm/neon/copymem_neon.c b/libs/libvpx/vp8/common/arm/neon/copymem_neon.c
index c1d293b58d..c89b47d628 100644
--- a/libs/libvpx/vp8/common/arm/neon/copymem_neon.c
+++ b/libs/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -10,6 +10,8 @@
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride) {
   uint8x8_t vtmp;
diff --git a/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
index 6edff3c69f..791aaea2ae 100644
--- a/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
+++ b/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -10,6 +10,7 @@
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 
 void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
diff --git a/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c
index d61dde86cf..5c26ce67a4 100644
--- a/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c
+++ b/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -8,15 +8,226 @@
 * be found in the AUTHORS file in the root of the source tree.
*/ -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include -/* place these declarations here because we don't want to maintain them - * outside of this scope - */ -void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst, - int stride); -void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride); +#include "./vp8_rtcd.h" + +static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } +} + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, + unsigned char *dst, int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 
= vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); +} void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs) { @@ -43,42 +254,42 @@ void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int 
stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { if (((short *)(eobs))[0]) { if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstu, stride); + idct_dequant_full_2x_neon(q, dq, dst_u, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstu, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); } q += 32; - dstu += 4 * stride; + dst_u += 4 * stride; if (((short *)(eobs))[1]) { if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstu, stride); + idct_dequant_full_2x_neon(q, dq, dst_u, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstu, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); } q += 32; if (((short *)(eobs))[2]) { if (((short *)eobs)[2] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstv, stride); + idct_dequant_full_2x_neon(q, dq, dst_v, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstv, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); } q += 32; - dstv += 4 * stride; + dst_v += 4 * stride; if (((short *)(eobs))[3]) { if (((short *)eobs)[3] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstv, stride); + idct_dequant_full_2x_neon(q, dq, dst_v, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstv, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); } } diff --git a/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c deleted file mode 100644 index c83102a5cc..0000000000 --- a/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include - -void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, - int stride) { - unsigned char *dst0; - int i, a0, a1; - int16x8x2_t q2Add; - int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); - uint8x8_t d2u8, d4u8; - uint16x8_t q1u16, q2u16; - - a0 = ((q[0] * dq) + 4) >> 3; - a1 = ((q[16] * dq) + 4) >> 3; - q[0] = q[16] = 0; - q2Add.val[0] = vdupq_n_s16((int16_t)a0); - q2Add.val[1] = vdupq_n_s16((int16_t)a1); - - for (i = 0; i < 2; i++, dst += 4) { - dst0 = dst; - d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); - dst0 += stride; - d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); - dst0 += stride; - d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); - dst0 += stride; - d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); - - q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), - vreinterpret_u8_s32(d2s32)); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), - vreinterpret_u8_s32(d4s32)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); - d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - - d2s32 = vreinterpret_s32_u8(d2u8); - d4s32 = vreinterpret_s32_u8(d4u8); - - dst0 = dst; - vst1_lane_s32((int32_t *)dst0, d2s32, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d2s32, 1); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d4s32, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d4s32, 1); - } - return; -} diff --git a/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c deleted file mode 100644 index f30671cc3f..0000000000 --- a/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include - -static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 17734; -// because the lowest bit in 0x8a8c is 0, we can pre-shift this - -void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, unsigned char *dst, - int stride) { - unsigned char *dst0, *dst1; - int32x2_t d28, d29, d30, d31; - int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; - int16x8_t qEmpty = vdupq_n_s16(0); - int32x4x2_t q2tmp0, q2tmp1; - int16x8x2_t q2tmp2, q2tmp3; - int16x4_t dLow0, dLow1, dHigh0, dHigh1; - - d28 = d29 = d30 = d31 = vdup_n_s32(0); - - // load dq - q0 = vld1q_s16(dq); - dq += 8; - q1 = vld1q_s16(dq); - - // load q - q2 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q3 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q4 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q5 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - - // load src from dst - dst0 = dst; - dst1 = dst + 4; - d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); - dst0 += stride; - d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); - dst1 += stride; - d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); - dst0 += stride; - d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); - dst1 += stride; - - d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); - dst0 += stride; - d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); - dst1 += stride; - d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); - d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); - - q2 = vmulq_s16(q2, q0); - q3 = vmulq_s16(q3, q1); - q4 = vmulq_s16(q4, q0); - q5 = vmulq_s16(q5, q1); - - // vswp - dLow0 = vget_low_s16(q2); - dHigh0 = vget_high_s16(q2); - dLow1 = vget_low_s16(q4); - dHigh1 = vget_high_s16(q4); - q2 = vcombine_s16(dLow0, dLow1); - q4 = vcombine_s16(dHigh0, dHigh1); - - dLow0 = vget_low_s16(q3); - dHigh0 = vget_high_s16(q3); - dLow1 = vget_low_s16(q5); - dHigh1 = vget_high_s16(q5); - q3 = vcombine_s16(dLow0, dLow1); - q5 = vcombine_s16(dHigh0, dHigh1); - - q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); - q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); - q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); - q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); - - q10 = vqaddq_s16(q2, q3); - q11 = vqsubq_s16(q2, q3); - - q8 = vshrq_n_s16(q8, 1); - q9 = vshrq_n_s16(q9, 1); - - q4 = vqaddq_s16(q4, q8); - q5 = vqaddq_s16(q5, q9); - - q2 = vqsubq_s16(q6, q5); - q3 = vqaddq_s16(q7, q4); - - q4 = vqaddq_s16(q10, q3); - q5 = vqaddq_s16(q11, q2); - q6 = vqsubq_s16(q11, q2); - q7 = vqsubq_s16(q10, q3); - - q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); - q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); - q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), - vreinterpretq_s16_s32(q2tmp1.val[0])); - q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), - vreinterpretq_s16_s32(q2tmp1.val[1])); - - // loop 2 - q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); - q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); - q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); - q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); - - q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); - q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); - - q10 = vshrq_n_s16(q10, 1); - q11 = vshrq_n_s16(q11, 1); - - q10 = vqaddq_s16(q2tmp2.val[1], q10); - q11 = vqaddq_s16(q2tmp3.val[1], q11); - - q8 = vqsubq_s16(q8, q11); - q9 = vqaddq_s16(q9, q10); - - q4 = vqaddq_s16(q2, q9); - q5 = vqaddq_s16(q3, q8); - q6 = vqsubq_s16(q3, q8); - q7 = vqsubq_s16(q2, q9); - - q4 = vrshrq_n_s16(q4, 3); - q5 = vrshrq_n_s16(q5, 3); - q6 = 
vrshrq_n_s16(q6, 3); - q7 = vrshrq_n_s16(q7, 3); - - q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); - q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); - q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), - vreinterpretq_s16_s32(q2tmp1.val[0])); - q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), - vreinterpretq_s16_s32(q2tmp1.val[1])); - - q4 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); - q5 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); - q6 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); - q7 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); - - d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); - d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); - d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); - d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); - - dst0 = dst; - dst1 = dst + 4; - vst1_lane_s32((int32_t *)dst0, d28, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d28, 1); - dst1 += stride; - vst1_lane_s32((int32_t *)dst0, d29, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d29, 1); - dst1 += stride; - - vst1_lane_s32((int32_t *)dst0, d30, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d30, 1); - dst1 += stride; - vst1_lane_s32((int32_t *)dst0, d31, 0); - vst1_lane_s32((int32_t *)dst1, d31, 1); - return; -} diff --git a/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c index 6c4bcc134b..91600bfc00 100644 --- a/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) { int16x8_t q0s16, q1s16, q2s16, q3s16; int16x4_t d4s16, d5s16, d6s16, d7s16; diff --git a/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c index a168219705..df983b23a3 100644 --- a/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( unsigned char *s, int p, const unsigned char *blimit) { diff --git a/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c index 80a222d248..fbc83ae290 100644 --- a/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx_ports/arm.h" #ifdef VPX_INCOMPATIBLE_GCC diff --git a/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c index 65eec300ff..fafaf2d451 100644 --- a/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit, // mblimit uint8x16_t qlimit, // limit diff --git a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c 
diff --git a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index aa2567df79..48e86d3278 100644 --- a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include <string.h> #include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_ports/mem.h" diff --git a/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c index d7286739da..ebc004a048 100644 --- a/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c @@ -9,7 +9,9 @@ */ #include <arm_neon.h> + #include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" #include "vpx_ports/arm.h" static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit, // flimit diff --git a/libs/libvpx/vp8/common/blockd.c b/libs/libvpx/vp8/common/blockd.c index f47c5bae15..22905c10a6 100644 --- a/libs/libvpx/vp8/common/blockd.c +++ b/libs/libvpx/vp8/common/blockd.c @@ -11,9 +11,9 @@ #include "blockd.h" #include "vpx_mem/vpx_mem.h" -const unsigned char vp8_block2left[25] = { - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; -const unsigned char vp8_block2above[25] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 -}; +const unsigned char vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, + 2, 2, 2, 3, 3, 3, 3, 4, 4, + 5, 5, 6, 6, 7, 7, 8 }; +const unsigned char vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, + 1, 2, 3, 0, 1, 2, 3, 4, 5, + 4, 5, 6, 7, 6, 7, 8 };
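
Two details in the blockd.h hunk just below are worth noting. The vp8_subpix_fn_t parameters are renamed (src_ptr, src_pixels_per_line, xoffset, yoffset) to match the prototypes in rtcd_defs.pl, and MACROBLOCKD gains a struct vpx_internal_error_info, so decode paths that hit corrupt data can report through libvpx's setjmp/longjmp error machinery instead of crashing. A minimal sketch of that error path, assuming libvpx's vpx_internal_error() and the new member (the surrounding flow is illustrative, not the patch's code):

    #include <setjmp.h>
    #include "vp8/common/blockd.h"
    #include "vpx/internal/vpx_codec_internal.h"

    static int decode_one_mb(MACROBLOCKD *xd) {
      if (setjmp(xd->error_info.jmp)) { /* longjmp lands here on error */
        xd->error_info.setjmp = 0;
        return -1; /* unwind cleanly instead of crashing the worker */
      }
      xd->error_info.setjmp = 1;
      /* ... deep in the call tree, corrupt input would trigger: */
      vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME,
                         "Truncated packet or corrupt partition");
      return 0; /* not reached in this sketch */
    }
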
diff --git a/libs/libvpx/vp8/common/blockd.h b/libs/libvpx/vp8/common/blockd.h index 1a3aad16af..f8d1539739 100644 --- a/libs/libvpx/vp8/common/blockd.h +++ b/libs/libvpx/vp8/common/blockd.h @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_BLOCKD_H_ -#define VP8_COMMON_BLOCKD_H_ +#ifndef VPX_VP8_COMMON_BLOCKD_H_ +#define VPX_VP8_COMMON_BLOCKD_H_ void vpx_log(const char *format, ...); +#include "vpx/internal/vpx_codec_internal.h" #include "vpx_config.h" #include "vpx_scale/yv12config.h" #include "mv.h" @@ -37,7 +38,9 @@ extern "C" { #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 -typedef struct { int r, c; } POS; +typedef struct { + int r, c; +} POS; #define PLANE_TYPE_Y_NO_DC 0 #define PLANE_TYPE_Y2 1 @@ -180,6 +183,9 @@ typedef struct { unsigned int low_res_ref_frames[MAX_REF_FRAMES]; // The video frame counter value for the key frame, for lowest resolution. unsigned int key_frame_counter_value; + // Flags to signal skipped encoding of previous and base layer stream. + unsigned int skip_encoding_prev_stream; + unsigned int skip_encoding_base_stream; LOWER_RES_MB_INFO *mb_info; } LOWER_RES_FRAME_INFO; #endif @@ -196,8 +202,9 @@ typedef struct blockd { union b_mode_info bmi; } BLOCKD; -typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, - int yofst, unsigned char *dst, int dst_pitch); +typedef void (*vp8_subpix_fn_t)(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch); typedef struct macroblockd { DECLARE_ALIGNED(16, unsigned char, predictor[384]); @@ -283,6 +290,8 @@ typedef struct macroblockd { int corrupted; + struct vpx_internal_error_info error_info; + #if ARCH_X86 || ARCH_X86_64 /* This is an intermediate buffer currently used in sub-pixel motion search * to keep a copy of the reference area. This buffer can be used for other @@ -299,4 +308,4 @@ extern void vp8_setup_block_dptrs(MACROBLOCKD *x); } // extern "C" #endif -#endif // VP8_COMMON_BLOCKD_H_ +#endif // VPX_VP8_COMMON_BLOCKD_H_ diff --git a/libs/libvpx/vp8/common/coefupdateprobs.h b/libs/libvpx/vp8/common/coefupdateprobs.h index 9b01bba312..b342096b55 100644 --- a/libs/libvpx/vp8/common/coefupdateprobs.h +++ b/libs/libvpx/vp8/common/coefupdateprobs.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_COEFUPDATEPROBS_H_ -#define VP8_COMMON_COEFUPDATEPROBS_H_ +#ifndef VPX_VP8_COMMON_COEFUPDATEPROBS_H_ +#define VPX_VP8_COMMON_COEFUPDATEPROBS_H_ #ifdef __cplusplus extern "C" { @@ -194,4 +194,4 @@ const vp8_prob vp8_coef_update_probs } // extern "C" #endif -#endif // VP8_COMMON_COEFUPDATEPROBS_H_ +#endif // VPX_VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/libs/libvpx/vp8/common/common.h b/libs/libvpx/vp8/common/common.h index bbfc4f3934..2c30e8d6c5 100644 --- a/libs/libvpx/vp8/common/common.h +++ b/libs/libvpx/vp8/common/common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_COMMON_H_ -#define VP8_COMMON_COMMON_H_ +#ifndef VPX_VP8_COMMON_COMMON_H_ +#define VPX_VP8_COMMON_COMMON_H_ #include <assert.h> @@ -31,18 +31,18 @@ extern "C" { /* Use this for variably-sized arrays. */ -#define vp8_copy_array(Dest, Src, N) \ - { \ - assert(sizeof(*Dest) == sizeof(*Src)); \ - memcpy(Dest, Src, N * sizeof(*Src)); \ +#define vp8_copy_array(Dest, Src, N) \ + { \ + assert(sizeof(*(Dest)) == sizeof(*(Src))); \ + memcpy(Dest, Src, (N) * sizeof(*(Src))); \ } -#define vp8_zero(Dest) memset(&Dest, 0, sizeof(Dest)); +#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)); -#define vp8_zero_array(Dest, N) memset(Dest, 0, N * sizeof(*Dest)); +#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_COMMON_H_ +#endif // VPX_VP8_COMMON_COMMON_H_
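
The common.h change above is a classic macro-hygiene fix: Dest, Src, and N are now parenthesized inside vp8_copy_array, vp8_zero, and vp8_zero_array so operator precedence cannot distort the expansion when a compound expression is passed as an argument. An illustrative example using the patched vp8_zero_array:

    static void zero_example(void) {
      short buf[8];
      int n = 7;
      /* old expansion: memset(buf, 0, n + 1 * sizeof(*buf));   zeroes 9 bytes  */
      /* new expansion: memset(buf, 0, (n + 1) * sizeof(*buf)); zeroes 16 bytes */
      vp8_zero_array(buf, n + 1);
    }
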
diff --git a/libs/libvpx/vp8/common/default_coef_probs.h b/libs/libvpx/vp8/common/default_coef_probs.h index 8c861ac876..b25e4a45a3 100644 --- a/libs/libvpx/vp8/common/default_coef_probs.h +++ b/libs/libvpx/vp8/common/default_coef_probs.h @@ -6,10 +6,10 @@ * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. -*/ + */ -#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ -#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#ifndef VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ #ifdef __cplusplus extern "C" { @@ -157,4 +157,4 @@ static const vp8_prob default_coef_probs } // extern "C" #endif -#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#endif // VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/libs/libvpx/vp8/common/entropy.c b/libs/libvpx/vp8/common/entropy.c index f61fa9e8e4..fc4a3539fd 100644 --- a/libs/libvpx/vp8/common/entropy.c +++ b/libs/libvpx/vp8/common/entropy.c @@ -28,9 +28,9 @@ DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = { - 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 -}; +DECLARE_ALIGNED(16, const unsigned char, + vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, + 6, 6, 6, 6, 6, 6, 6, 7 }; DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { @@ -41,9 +41,9 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15, }; -DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = { - 1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16 -}; +DECLARE_ALIGNED(16, const short, + vp8_default_inv_zig_zag[16]) = { 1, 2, 6, 7, 3, 5, 8, 13, + 4, 9, 12, 14, 10, 11, 15, 16 }; /* vp8_default_zig_zag_mask generated with: @@ -129,9 +129,9 @@ static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; -static const vp8_tree_index cat6[22] = { - 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 -}; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, + 10, 10, 12, 12, 14, 14, 16, 16, + 18, 18, 20, 20, 0, 0 }; const vp8_extra_bit_struct vp8_extra_bits[12] = { { 0, 0, 0, 0 }, { 0, 0, 0, 1 }, { 0, 0, 0, 2 }, diff --git a/libs/libvpx/vp8/common/entropy.h b/libs/libvpx/vp8/common/entropy.h index d088560011..fbdb7bcfca 100644 --- a/libs/libvpx/vp8/common/entropy.h +++ b/libs/libvpx/vp8/common/entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_ENTROPY_H_ -#define VP8_COMMON_ENTROPY_H_ +#ifndef VPX_VP8_COMMON_ENTROPY_H_ +#define VPX_VP8_COMMON_ENTROPY_H_ #include "treecoder.h" #include "blockd.h" @@ -105,4 +105,4 @@ void vp8_coef_tree_initialize(void); } // extern "C" #endif -#endif // VP8_COMMON_ENTROPY_H_ +#endif // VPX_VP8_COMMON_ENTROPY_H_ diff --git a/libs/libvpx/vp8/common/entropymode.c b/libs/libvpx/vp8/common/entropymode.c index 239492a8cb..f61e0c2e2b 100644 --- a/libs/libvpx/vp8/common/entropymode.c +++ b/libs/libvpx/vp8/common/entropymode.c @@ -75,9 +75,9 @@ const vp8_tree_index vp8_ymode_tree[8] = { -DC_PRED, 2, 4, 6, -V_PRED, -H_PRED, -TM_PRED, -B_PRED }; -const vp8_tree_index vp8_kf_ymode_tree[8] = { - -B_PRED, 2, 4, 6, -DC_PRED, -V_PRED, -H_PRED, -TM_PRED -}; +const vp8_tree_index vp8_kf_ymode_tree[8] = { -B_PRED, 2, 4, + 6, -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED }; const vp8_tree_index vp8_uv_mode_tree[6] = { -DC_PRED, 2, -V_PRED, 4, -H_PRED, -TM_PRED }; @@ -99,6 +99,6 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) { memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); } -void vp8_default_bmode_probs(vp8_prob p[VP8_BINTRAMODES - 1]) { - memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]) { + memcpy(dest, vp8_bmode_prob, sizeof(vp8_bmode_prob)); } diff --git a/libs/libvpx/vp8/common/entropymode.h b/libs/libvpx/vp8/common/entropymode.h index b3fad19be0..c772cece57 100644 --- a/libs/libvpx/vp8/common/entropymode.h +++ b/libs/libvpx/vp8/common/entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPYMODE_H_ -#define VP8_COMMON_ENTROPYMODE_H_ +#ifndef VPX_VP8_COMMON_ENTROPYMODE_H_ +#define VPX_VP8_COMMON_ENTROPYMODE_H_ #include "onyxc_int.h" #include "treecoder.h" @@ -85,4 +85,4 @@ void vp8_kf_default_bmode_probs( } // extern "C" #endif -#endif // VP8_COMMON_ENTROPYMODE_H_ +#endif // VPX_VP8_COMMON_ENTROPYMODE_H_ diff --git a/libs/libvpx/vp8/common/entropymv.h b/libs/libvpx/vp8/common/entropymv.h index 6373000903..40039f5b2c 100644 --- a/libs/libvpx/vp8/common/entropymv.h +++ b/libs/libvpx/vp8/common/entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPYMV_H_ -#define VP8_COMMON_ENTROPYMV_H_ +#ifndef VPX_VP8_COMMON_ENTROPYMV_H_ +#define VPX_VP8_COMMON_ENTROPYMV_H_ #include "treecoder.h" @@ -46,4 +46,4 @@ extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; } // extern "C" #endif -#endif // VP8_COMMON_ENTROPYMV_H_ +#endif // VPX_VP8_COMMON_ENTROPYMV_H_ diff --git a/libs/libvpx/vp8/common/extend.c b/libs/libvpx/vp8/common/extend.c index 2d67b516be..f4dbce2cd5 100644 --- a/libs/libvpx/vp8/common/extend.c +++ b/libs/libvpx/vp8/common/extend.c @@ -20,8 +20,7 @@ static void copy_and_extend_plane(unsigned char *s, /* source */ int et, /* extend top border */ int el, /* extend left border */ int eb, /* extend bottom border */ - int er /* extend right border */ - ) { + int er) { /* extend right border */ int i; unsigned char *src_ptr1, *src_ptr2; unsigned char *dest_ptr1, *dest_ptr2; diff --git a/libs/libvpx/vp8/common/extend.h b/libs/libvpx/vp8/common/extend.h index 7da5ce31da..586a38a4f3 100644 --- a/libs/libvpx/vp8/common/extend.h +++ b/libs/libvpx/vp8/common/extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_EXTEND_H_ -#define VP8_COMMON_EXTEND_H_ +#ifndef VPX_VP8_COMMON_EXTEND_H_ +#define VPX_VP8_COMMON_EXTEND_H_ #include "vpx_scale/yv12config.h" @@ -29,4 +29,4 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // VP8_COMMON_EXTEND_H_ +#endif // VPX_VP8_COMMON_EXTEND_H_ diff --git a/libs/libvpx/vp8/common/filter.h b/libs/libvpx/vp8/common/filter.h index f1d5ece4a5..6acee22b21 100644 --- a/libs/libvpx/vp8/common/filter.h +++ b/libs/libvpx/vp8/common/filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_FILTER_H_ -#define VP8_COMMON_FILTER_H_ +#ifndef VPX_VP8_COMMON_FILTER_H_ +#define VPX_VP8_COMMON_FILTER_H_ #include "vpx_ports/mem.h" @@ -28,4 +28,4 @@ extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); } // extern "C" #endif -#endif // VP8_COMMON_FILTER_H_ +#endif // VPX_VP8_COMMON_FILTER_H_ diff --git a/libs/libvpx/vp8/common/findnearmv.c b/libs/libvpx/vp8/common/findnearmv.c index f40d2c6bde..6889fdedde 100644 --- a/libs/libvpx/vp8/common/findnearmv.c +++ b/libs/libvpx/vp8/common/findnearmv.c @@ -21,19 +21,20 @@ const unsigned char vp8_mbsplit_offset[4][16] = { Note that we only consider one 4x4 subblock from each candidate 16x16 macroblock. */ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, - int_mv *nearby, int_mv *best_mv, int cnt[4], + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], int refframe, int *ref_frame_sign_bias) { const MODE_INFO *above = here - xd->mode_info_stride; const MODE_INFO *left = here - 1; const MODE_INFO *aboveleft = above - 1; int_mv near_mvs[4]; int_mv *mv = near_mvs; - int *cntx = cnt; + int *cntx = near_mv_ref_cnts; enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; /* Zero accumulators */ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; - cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + near_mv_ref_cnts[0] = near_mv_ref_cnts[1] = near_mv_ref_cnts[2] = + near_mv_ref_cnts[3] = 0; /* Process above */ if (above->mbmi.ref_frame != INTRA_FRAME) { @@ -63,7 +64,7 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, *cntx += 2; } else { - cnt[CNT_INTRA] += 2; + near_mv_ref_cnts[CNT_INTRA] += 2; } } @@ -83,33 +84,34 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, *cntx += 1; } else { - cnt[CNT_INTRA] += 1; + near_mv_ref_cnts[CNT_INTRA] += 1; } } /* If we have three distinct MV's ... 
*/ - if (cnt[CNT_SPLITMV]) { + if (near_mv_ref_cnts[CNT_SPLITMV]) { /* See if above-left MV can be merged with NEAREST */ - if (mv->as_int == near_mvs[CNT_NEAREST].as_int) cnt[CNT_NEAREST] += 1; + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + near_mv_ref_cnts[CNT_NEAREST] += 1; } - cnt[CNT_SPLITMV] = + near_mv_ref_cnts[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * 2 + (aboveleft->mbmi.mode == SPLITMV); /* Swap near and nearest if necessary */ - if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) { + if (near_mv_ref_cnts[CNT_NEAR] > near_mv_ref_cnts[CNT_NEAREST]) { int tmp; - tmp = cnt[CNT_NEAREST]; - cnt[CNT_NEAREST] = cnt[CNT_NEAR]; - cnt[CNT_NEAR] = tmp; + tmp = near_mv_ref_cnts[CNT_NEAREST]; + near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; + near_mv_ref_cnts[CNT_NEAR] = tmp; tmp = near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; near_mvs[CNT_NEAR].as_int = tmp; } /* Use near_mvs[0] to store the "best" MV */ - if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) { + if (near_mv_ref_cnts[CNT_NEAREST] >= near_mv_ref_cnts[CNT_INTRA]) { near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; } diff --git a/libs/libvpx/vp8/common/findnearmv.h b/libs/libvpx/vp8/common/findnearmv.h index c1eaa26980..d7db9544aa 100644 --- a/libs/libvpx/vp8/common/findnearmv.h +++ b/libs/libvpx/vp8/common/findnearmv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_FINDNEARMV_H_ -#define VP8_COMMON_FINDNEARMV_H_ +#ifndef VPX_VP8_COMMON_FINDNEARMV_H_ +#define VPX_VP8_COMMON_FINDNEARMV_H_ #include "./vpx_config.h" #include "mv.h" @@ -70,7 +70,7 @@ static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, } void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, - int_mv *nearby, int_mv *best, int near_mv_ref_cts[4], + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], int refframe, int *ref_frame_sign_bias); int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here, @@ -148,4 +148,4 @@ static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, } // extern "C" #endif -#endif // VP8_COMMON_FINDNEARMV_H_ +#endif // VPX_VP8_COMMON_FINDNEARMV_H_ diff --git a/libs/libvpx/vp8/common/header.h b/libs/libvpx/vp8/common/header.h index 1df01fc6fa..e64e241908 100644 --- a/libs/libvpx/vp8/common/header.h +++ b/libs/libvpx/vp8/common/header.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_HEADER_H_ -#define VP8_COMMON_HEADER_H_ +#ifndef VPX_VP8_COMMON_HEADER_H_ +#define VPX_VP8_COMMON_HEADER_H_ #ifdef __cplusplus extern "C" { @@ -45,4 +45,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_COMMON_HEADER_H_ +#endif // VPX_VP8_COMMON_HEADER_H_ diff --git a/libs/libvpx/vp8/common/idct_blk.c b/libs/libvpx/vp8/common/idct_blk.c index ff9f3eb7f2..ebe1774f56 100644 --- a/libs/libvpx/vp8/common/idct_blk.c +++ b/libs/libvpx/vp8/common/idct_blk.c @@ -12,12 +12,6 @@ #include "vp8_rtcd.h" #include "vpx_mem/vpx_mem.h" -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, - int stride); -void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred, - int pred_stride, unsigned char *dst_ptr, - int dst_stride); - void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs) { int i, j; @@ -39,40 +33,40 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, } } -void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dstu, - unsigned char *dstv, int stride, +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) { - vp8_dequant_idct_add_c(q, dq, dstu, stride); + vp8_dequant_idct_add_c(q, dq, dst_u, stride); } else { - vp8_dc_only_idct_add_c(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_u, stride, dst_u, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) { - vp8_dequant_idct_add_c(q, dq, dstv, stride); + vp8_dequant_idct_add_c(q, dq, dst_v, stride); } else { - vp8_dc_only_idct_add_c(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_v, stride, dst_v, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/libs/libvpx/vp8/common/invtrans.h b/libs/libvpx/vp8/common/invtrans.h index c7af32fb67..aed7bb0600 100644 --- a/libs/libvpx/vp8/common/invtrans.h +++ b/libs/libvpx/vp8/common/invtrans.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_INVTRANS_H_ -#define VP8_COMMON_INVTRANS_H_ +#ifndef VPX_VP8_COMMON_INVTRANS_H_ +#define VPX_VP8_COMMON_INVTRANS_H_ #include "./vpx_config.h" #include "vp8_rtcd.h" @@ -54,4 +54,4 @@ static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) { } // extern "C" #endif -#endif // VP8_COMMON_INVTRANS_H_ +#endif // VPX_VP8_COMMON_INVTRANS_H_ diff --git a/libs/libvpx/vp8/common/loopfilter.h b/libs/libvpx/vp8/common/loopfilter.h index 7484563e06..0733046e5a 100644 --- a/libs/libvpx/vp8/common/loopfilter.h +++ b/libs/libvpx/vp8/common/loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_LOOPFILTER_H_ -#define VP8_COMMON_LOOPFILTER_H_ +#ifndef VPX_VP8_COMMON_LOOPFILTER_H_ +#define VPX_VP8_COMMON_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "vpx_config.h" @@ -100,4 +100,4 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm, } // extern "C" #endif -#endif // VP8_COMMON_LOOPFILTER_H_ +#endif // VPX_VP8_COMMON_LOOPFILTER_H_ diff --git a/libs/libvpx/vp8/common/loopfilter_filters.c b/libs/libvpx/vp8/common/loopfilter_filters.c index 188e290ca7..61a55d3c92 100644 --- a/libs/libvpx/vp8/common/loopfilter_filters.c +++ b/libs/libvpx/vp8/common/loopfilter_filters.c @@ -270,28 +270,32 @@ static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, *op0 = u ^ 0x80; } -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *s, int p, +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, + int y_stride, const unsigned char *blimit) { signed char mask = 0; int i = 0; do { - mask = vp8_simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], s[0 * p], - s[1 * p]); - vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2 * y_stride], + y_ptr[-1 * y_stride], y_ptr[0 * y_stride], + y_ptr[1 * y_stride]); + vp8_simple_filter(mask, y_ptr - 2 * y_stride, y_ptr - 1 * y_stride, y_ptr, + y_ptr + 1 * y_stride); + ++y_ptr; } while (++i < 16); } -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { signed char mask = 0; int i = 0; do { - mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2], y_ptr[-1], y_ptr[0], + y_ptr[1]); + vp8_simple_filter(mask, y_ptr - 2, y_ptr - 1, y_ptr, y_ptr + 1); + y_ptr += y_stride; } while (++i < 16); } diff --git a/libs/libvpx/vp8/common/mfqe.c b/libs/libvpx/vp8/common/mfqe.c index b6f8146b84..1fe7363f17 100644 --- a/libs/libvpx/vp8/common/mfqe.c +++ b/libs/libvpx/vp8/common/mfqe.c @@ -18,6 +18,7 @@ #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "vp8/common/common.h" #include "vp8/common/postproc.h" #include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" @@ -211,6 +212,7 @@ static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) { { 0, 1, 4, 5 }, { 2, 3, 6, 7 }, { 8, 9, 12, 13 }, { 10, 11, 14, 15 } }; int i, j; + vp8_zero(*map); for (i = 0; i < 4; ++i) { map[i] = 1; for (j = 0; j < 4 && map[j]; ++j) { @@ -233,7 +235,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { FRAME_TYPE frame_type = cm->frame_type; /* Point at base of Mb MODE_INFO list has motion vectors etc */ - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mb_row; int mb_col; int totmap, map[4]; diff --git a/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c index 899dc10ad9..eae852d592 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c @@ -35,41 +35,41 @@ void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) - 
vp8_dequant_idct_add_dspr2(q, dq, dstu, stride); + vp8_dequant_idct_add_dspr2(q, dq, dst_u, stride); else { - vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_u, stride, dst_u, stride); ((int *)q)[0] = 0; } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) - vp8_dequant_idct_add_dspr2(q, dq, dstv, stride); + vp8_dequant_idct_add_dspr2(q, dq, dst_v, stride); else { - vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_v, stride, dst_v, stride); ((int *)q)[0] = 0; } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index d2c3442515..21446fb413 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -934,8 +934,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, s4 = s3 + p; /* load quad-byte vectors - * memory is 4 byte aligned - */ + * memory is 4 byte aligned + */ p2 = *((uint32_t *)(s1 - 4)); p6 = *((uint32_t *)(s1)); p1 = *((uint32_t *)(s2 - 4)); @@ -990,8 +990,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, :); /* if (p1 - p4 == 0) and (p2 - p3 == 0) - * mask will be zero and filtering is not needed - */ + * mask will be zero and filtering is not needed + */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask); @@ -2102,8 +2102,8 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, s4 = s3 + p; /* load quad-byte vectors - * memory is 4 byte aligned - */ + * memory is 4 byte aligned + */ p2 = *((uint32_t *)(s1 - 4)); p6 = *((uint32_t *)(s1)); p1 = *((uint32_t *)(s2 - 4)); diff --git a/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c index f6020ab468..4fd6854c52 100644 --- a/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c +++ b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -12,7 +12,7 @@ #include "vpx_mem/vpx_mem.h" void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, - int stride, int8_t *eobs) { + int stride, char *eobs) { int i, j; for (i = 0; i < 4; i++) { @@ -32,40 +32,39 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, } } -void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int stride, - int8_t *eobs) { +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (*eobs++ > 1) { - vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + vp8_dequant_idct_add_mmi(q, dq, dst_u, stride); } else { - vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (*eobs++ > 1) { - vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + vp8_dequant_idct_add_mmi(q, dq, dst_v, stride); } else { - 
vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/libs/libvpx/vp8/common/mips/msa/idct_msa.c b/libs/libvpx/vp8/common/mips/msa/idct_msa.c index 3d516d0f81..efad0c29f8 100644 --- a/libs/libvpx/vp8/common/mips/msa/idct_msa.c +++ b/libs/libvpx/vp8/common/mips/msa/idct_msa.c @@ -134,7 +134,7 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); } -void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { +void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dqcoeff) { v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1; const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -157,22 +157,22 @@ void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { ADD2(tmp0, 3, tmp1, 3, out0, out1); out0 >>= 3; out1 >>= 3; - mb_dq_coeff[0] = __msa_copy_s_h(out0, 0); - mb_dq_coeff[16] = __msa_copy_s_h(out0, 4); - mb_dq_coeff[32] = __msa_copy_s_h(out1, 0); - mb_dq_coeff[48] = __msa_copy_s_h(out1, 4); - mb_dq_coeff[64] = __msa_copy_s_h(out0, 1); - mb_dq_coeff[80] = __msa_copy_s_h(out0, 5); - mb_dq_coeff[96] = __msa_copy_s_h(out1, 1); - mb_dq_coeff[112] = __msa_copy_s_h(out1, 5); - mb_dq_coeff[128] = __msa_copy_s_h(out0, 2); - mb_dq_coeff[144] = __msa_copy_s_h(out0, 6); - mb_dq_coeff[160] = __msa_copy_s_h(out1, 2); - mb_dq_coeff[176] = __msa_copy_s_h(out1, 6); - mb_dq_coeff[192] = __msa_copy_s_h(out0, 3); - mb_dq_coeff[208] = __msa_copy_s_h(out0, 7); - mb_dq_coeff[224] = __msa_copy_s_h(out1, 3); - mb_dq_coeff[240] = __msa_copy_s_h(out1, 7); + mb_dqcoeff[0] = __msa_copy_s_h(out0, 0); + mb_dqcoeff[16] = __msa_copy_s_h(out0, 4); + mb_dqcoeff[32] = __msa_copy_s_h(out1, 0); + mb_dqcoeff[48] = __msa_copy_s_h(out1, 4); + mb_dqcoeff[64] = __msa_copy_s_h(out0, 1); + mb_dqcoeff[80] = __msa_copy_s_h(out0, 5); + mb_dqcoeff[96] = __msa_copy_s_h(out1, 1); + mb_dqcoeff[112] = __msa_copy_s_h(out1, 5); + mb_dqcoeff[128] = __msa_copy_s_h(out0, 2); + mb_dqcoeff[144] = __msa_copy_s_h(out0, 6); + mb_dqcoeff[160] = __msa_copy_s_h(out1, 2); + mb_dqcoeff[176] = __msa_copy_s_h(out1, 6); + mb_dqcoeff[192] = __msa_copy_s_h(out0, 3); + mb_dqcoeff[208] = __msa_copy_s_h(out0, 7); + mb_dqcoeff[224] = __msa_copy_s_h(out1, 3); + mb_dqcoeff[240] = __msa_copy_s_h(out1, 7); } static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, @@ -359,27 +359,27 @@ void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq, uint8_t *dst, } } -void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int32_t stride, +void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, char *eobs) { int16_t *eobs_h = (int16_t *)eobs; if (eobs_h[0]) { if (eobs_h[0] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstu, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); } } q += 32; - dstu += (stride * 4); + dst_u += (stride * 4); if (eobs_h[1]) { if (eobs_h[1] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstu, stride); + 
dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); } } @@ -387,20 +387,20 @@ void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu, if (eobs_h[2]) { if (eobs_h[2] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstv, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); } } q += 32; - dstv += (stride * 4); + dst_v += (stride * 4); if (eobs_h[3]) { if (eobs_h[3] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstv, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); } } } diff --git a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index 6bec3adec3..14f83799ff 100644 --- a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ -#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ #include <msa.h> @@ -1757,4 +1757,4 @@ \ tmp1_m; \ }) -#endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */ +#endif // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ diff --git a/libs/libvpx/vp8/common/modecont.c b/libs/libvpx/vp8/common/modecont.c index d6ad9bb99a..bab410374f 100644 --- a/libs/libvpx/vp8/common/modecont.c +++ b/libs/libvpx/vp8/common/modecont.c @@ -11,28 +11,16 @@ #include "entropy.h" const int vp8_mode_contexts[6][4] = { - { - /* 0 */ - 7, 1, 1, 143, - }, - { - /* 1 */ - 14, 18, 14, 107, - }, - { - /* 2 */ - 135, 64, 57, 68, - }, - { - /* 3 */ - 60, 56, 128, 65, - }, - { - /* 4 */ - 159, 134, 128, 34, - }, - { - /* 5 */ - 234, 188, 128, 28, - }, + { /* 0 */ + 7, 1, 1, 143 }, + { /* 1 */ + 14, 18, 14, 107 }, + { /* 2 */ + 135, 64, 57, 68 }, + { /* 3 */ + 60, 56, 128, 65 }, + { /* 4 */ + 159, 134, 128, 34 }, + { /* 5 */ + 234, 188, 128, 28 }, }; diff --git a/libs/libvpx/vp8/common/modecont.h b/libs/libvpx/vp8/common/modecont.h index b58c7dc2d3..031f74f2ff 100644 --- a/libs/libvpx/vp8/common/modecont.h +++ b/libs/libvpx/vp8/common/modecont.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MODECONT_H_ -#define VP8_COMMON_MODECONT_H_ +#ifndef VPX_VP8_COMMON_MODECONT_H_ +#define VPX_VP8_COMMON_MODECONT_H_ #ifdef __cplusplus extern "C" { @@ -21,4 +21,4 @@ extern const int vp8_mode_contexts[6][4]; } // extern "C" #endif -#endif // VP8_COMMON_MODECONT_H_ +#endif // VPX_VP8_COMMON_MODECONT_H_ diff --git a/libs/libvpx/vp8/common/mv.h b/libs/libvpx/vp8/common/mv.h index b6d2147af8..4cde12f201 100644 --- a/libs/libvpx/vp8/common/mv.h +++ b/libs/libvpx/vp8/common/mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_MV_H_ -#define VP8_COMMON_MV_H_ +#ifndef VPX_VP8_COMMON_MV_H_ +#define VPX_VP8_COMMON_MV_H_ #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -30,4 +30,4 @@ typedef union int_mv { } // extern "C" #endif -#endif // VP8_COMMON_MV_H_ +#endif // VPX_VP8_COMMON_MV_H_ diff --git a/libs/libvpx/vp8/common/onyx.h b/libs/libvpx/vp8/common/onyx.h index 72fba2ec56..05c72df3fa 100644 --- a/libs/libvpx/vp8/common/onyx.h +++ b/libs/libvpx/vp8/common/onyx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYX_H_ -#define VP8_COMMON_ONYX_H_ +#ifndef VPX_VP8_COMMON_ONYX_H_ +#define VPX_VP8_COMMON_ONYX_H_ #ifdef __cplusplus extern "C" { @@ -247,38 +247,38 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); -int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags, +int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); -int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags, + int64_t end_time); +int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags, size_t *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); -int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest, +int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); -int vp8_use_as_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_update_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_get_reference(struct VP8_COMP *comp, +int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_get_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_set_reference(struct VP8_COMP *comp, +int vp8_set_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_update_entropy(struct VP8_COMP *comp, int update); -int vp8_set_roimap(struct VP8_COMP *comp, unsigned char *map, unsigned int rows, +int vp8_update_entropy(struct VP8_COMP *cpi, int update); +int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); -int vp8_set_active_map(struct VP8_COMP *comp, unsigned char *map, +int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *comp, VPX_SCALING horiz_mode, +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); -int vp8_get_quantizer(struct VP8_COMP *c); +int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus } #endif -#endif // VP8_COMMON_ONYX_H_ +#endif // VPX_VP8_COMMON_ONYX_H_ diff --git a/libs/libvpx/vp8/common/onyxc_int.h b/libs/libvpx/vp8/common/onyxc_int.h index 9a12c7fb67..ef8d007620 100644 --- a/libs/libvpx/vp8/common/onyxc_int.h +++ b/libs/libvpx/vp8/common/onyxc_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_ONYXC_INT_H_ -#define VP8_COMMON_ONYXC_INT_H_ +#ifndef VPX_VP8_COMMON_ONYXC_INT_H_ +#define VPX_VP8_COMMON_ONYXC_INT_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -174,4 +174,4 @@ typedef struct VP8Common { } // extern "C" #endif -#endif // VP8_COMMON_ONYXC_INT_H_ +#endif // VPX_VP8_COMMON_ONYXC_INT_H_ diff --git a/libs/libvpx/vp8/common/onyxd.h b/libs/libvpx/vp8/common/onyxd.h index d3c1b0e972..801ef87b20 100644 --- a/libs/libvpx/vp8/common/onyxd.h +++ b/libs/libvpx/vp8/common/onyxd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYXD_H_ -#define VP8_COMMON_ONYXD_H_ +#ifndef VPX_VP8_COMMON_ONYXD_H_ +#define VPX_VP8_COMMON_ONYXD_H_ /* Create/destroy static data structures. */ #ifdef __cplusplus @@ -41,23 +41,23 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x); int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst); -int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, - const uint8_t *dest, int64_t time_stamp); -int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, +int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi, size_t size, + const uint8_t *source, int64_t time_stamp); +int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); -vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, +vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *comp, +vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8dx_get_quantizer(const struct VP8D_COMP *c); +int vp8dx_get_quantizer(const struct VP8D_COMP *pbi); #ifdef __cplusplus } #endif -#endif // VP8_COMMON_ONYXD_H_ +#endif // VPX_VP8_COMMON_ONYXD_H_ diff --git a/libs/libvpx/vp8/common/postproc.c b/libs/libvpx/vp8/common/postproc.c index d67ee8a57d..2ed19c4fd5 100644 --- a/libs/libvpx/vp8/common/postproc.c +++ b/libs/libvpx/vp8/common/postproc.c @@ -65,7 +65,7 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mbr, mbc; /* The pixel thresholds are adjusted according to if or not the macroblock @@ -151,124 +151,6 @@ void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, } #endif // CONFIG_POSTPROC -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. 
- */ -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; ++i) { - for (j = 0; j < 12; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; ++i) { - for (j = 0; j < 6; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. - */ -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 16; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; ++i) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 16; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; ++i) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } -} - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 2; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - #if CONFIG_POSTPROC int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags) { @@ -325,7 +207,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vpx_clear_system_state(); if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid && - oci->current_video_frame >= 2 && + oci->current_video_frame > 10 && oci->postproc_state.last_base_qindex < 60 && oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) { vp8_multiframe_quality_enhance(oci); diff --git a/libs/libvpx/vp8/common/postproc.h b/libs/libvpx/vp8/common/postproc.h index 7be112b163..a14f5f1df1 100644 --- a/libs/libvpx/vp8/common/postproc.h +++ b/libs/libvpx/vp8/common/postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_POSTPROC_H_ -#define VP8_COMMON_POSTPROC_H_ +#ifndef VPX_VP8_COMMON_POSTPROC_H_ +#define VPX_VP8_COMMON_POSTPROC_H_ #include "vpx_ports/mem.h" struct postproc_state { @@ -27,13 +27,13 @@ extern "C" { #endif int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, - vp8_ppflags_t *flags); + vp8_ppflags_t *ppflags); -void vp8_de_noise(struct VP8Common *oci, YV12_BUFFER_CONFIG *source, +void vp8_de_noise(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag, int uvfilter); -void vp8_deblock(struct VP8Common *oci, YV12_BUFFER_CONFIG *source, +void vp8_deblock(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag); #define MFQE_PRECISION 4 @@ -43,4 +43,4 @@ void vp8_multiframe_quality_enhance(struct VP8Common *cm); } // extern "C" #endif -#endif // VP8_COMMON_POSTPROC_H_ +#endif // VPX_VP8_COMMON_POSTPROC_H_ diff --git a/libs/libvpx/vp8/common/ppflags.h b/libs/libvpx/vp8/common/ppflags.h index 96e3af6c9c..bdf08734b9 100644 --- a/libs/libvpx/vp8/common/ppflags.h +++ b/libs/libvpx/vp8/common/ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_PPFLAGS_H_ -#define VP8_COMMON_PPFLAGS_H_ +#ifndef VPX_VP8_COMMON_PPFLAGS_H_ +#define VPX_VP8_COMMON_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -36,4 +36,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_COMMON_PPFLAGS_H_ +#endif // VPX_VP8_COMMON_PPFLAGS_H_ diff --git a/libs/libvpx/vp8/common/quant_common.h b/libs/libvpx/vp8/common/quant_common.h index ff4203df87..049840a272 100644 --- a/libs/libvpx/vp8/common/quant_common.h +++ b/libs/libvpx/vp8/common/quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_QUANT_COMMON_H_ -#define VP8_COMMON_QUANT_COMMON_H_ +#ifndef VPX_VP8_COMMON_QUANT_COMMON_H_ +#define VPX_VP8_COMMON_QUANT_COMMON_H_ #include "string.h" #include "blockd.h" @@ -30,4 +30,4 @@ extern int vp8_ac_uv_quant(int QIndex, int Delta); } // extern "C" #endif -#endif // VP8_COMMON_QUANT_COMMON_H_ +#endif // VPX_VP8_COMMON_QUANT_COMMON_H_ diff --git a/libs/libvpx/vp8/common/reconinter.c b/libs/libvpx/vp8/common/reconinter.c index 48892c9b8e..2cb0709318 100644 --- a/libs/libvpx/vp8/common/reconinter.c +++ b/libs/libvpx/vp8/common/reconinter.c @@ -333,6 +333,13 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, _16x16mv.as_mv.row &= x->fullpixel_mask; _16x16mv.as_mv.col &= x->fullpixel_mask; + if (2 * _16x16mv.as_mv.col < (x->mb_to_left_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.col > x->mb_to_right_edge + (18 << 3) || + 2 * _16x16mv.as_mv.row < (x->mb_to_top_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.row > x->mb_to_bottom_edge + (18 << 3)) { + return; + } + pre_stride >>= 1; offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); uptr = x->pre.u_buffer + offset;
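
The reconinter.c hunk above is a defensive bounds check: before the chroma prediction pointers are derived, each component of the 16x16 motion vector is tested against the macroblock's distance to the corresponding frame edge plus a fixed margin, and the function returns early, before uptr and vptr are computed, if the vector would read outside the reference frame's extended border. A compact restatement of the new condition (helper name illustrative, not from the patch):

    /* mirrors the early-return test added above; mv2 is 2 * the MV
     * component, to_near/to_far are the x->mb_to_*_edge values */
    static int mv_outside_border(int mv2, int to_near_edge, int to_far_edge) {
      return mv2 < to_near_edge - (19 << 3) || mv2 > to_far_edge + (18 << 3);
    }

vp8_build_inter16x16_predictors_mb() bails out when either component fails this test, skipping the chroma prediction for a motion vector a valid bitstream should never produce.
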
diff --git a/libs/libvpx/vp8/common/reconinter.h b/libs/libvpx/vp8/common/reconinter.h index 4cdd4fee0f..974e7ce754 100644 --- a/libs/libvpx/vp8/common/reconinter.h +++ b/libs/libvpx/vp8/common/reconinter.h @@ -8,30 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_RECONINTER_H_ -#define VP8_COMMON_RECONINTER_H_ +#ifndef VPX_VP8_COMMON_RECONINTER_H_ +#define VPX_VP8_COMMON_RECONINTER_H_ #ifdef __cplusplus extern "C" { #endif -extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); -extern void vp8_build_inter16x16_predictors_mb( - MACROBLOCKD *x, unsigned char *dst_y, unsigned char *dst_u, - unsigned char *dst_v, int dst_ystride, int dst_uvstride); +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, int dst_ystride, + int dst_uvstride); -extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, - unsigned char *dst_y, - int dst_ystride); -extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, - unsigned char *base_pre, - int pre_stride, vp8_subpix_fn_t sppf); +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y, + int dst_ystride); +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, + int pre_stride, vp8_subpix_fn_t sppf); -extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); -extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_RECONINTER_H_ +#endif // VPX_VP8_COMMON_RECONINTER_H_ diff --git a/libs/libvpx/vp8/common/reconintra.h b/libs/libvpx/vp8/common/reconintra.h index fd7c725f35..029ac00a24 100644 --- a/libs/libvpx/vp8/common/reconintra.h +++ b/libs/libvpx/vp8/common/reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_RECONINTRA_H_ -#define VP8_COMMON_RECONINTRA_H_ +#ifndef VPX_VP8_COMMON_RECONINTRA_H_ +#define VPX_VP8_COMMON_RECONINTRA_H_ #include "vp8/common/blockd.h" @@ -32,4 +32,4 @@ void vp8_init_intra_predictors(void); } // extern "C" #endif -#endif // VP8_COMMON_RECONINTRA_H_ +#endif // VPX_VP8_COMMON_RECONINTRA_H_ diff --git a/libs/libvpx/vp8/common/reconintra4x4.h b/libs/libvpx/vp8/common/reconintra4x4.h index e17fc58c01..3618ec5cbe 100644 --- a/libs/libvpx/vp8/common/reconintra4x4.h +++ b/libs/libvpx/vp8/common/reconintra4x4.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_RECONINTRA4X4_H_ -#define VP8_COMMON_RECONINTRA4X4_H_ +#ifndef VPX_VP8_COMMON_RECONINTRA4X4_H_ +#define VPX_VP8_COMMON_RECONINTRA4X4_H_ #include "vp8/common/blockd.h" #ifdef __cplusplus @@ -31,7 +31,7 @@ static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd, *dst_ptr2 = *src_ptr; } -void vp8_intra4x4_predict(unsigned char *Above, unsigned char *yleft, +void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); @@ -42,4 +42,4 @@ void vp8_init_intra4x4_predictors_internal(void); } // extern "C" #endif -#endif // VP8_COMMON_RECONINTRA4X4_H_ +#endif // VPX_VP8_COMMON_RECONINTRA4X4_H_ diff --git a/libs/libvpx/vp8/common/rtcd_defs.pl b/libs/libvpx/vp8/common/rtcd_defs.pl index 3df745f75a..8452b5e854 100644 --- a/libs/libvpx/vp8/common/rtcd_defs.pl +++ b/libs/libvpx/vp8/common/rtcd_defs.pl @@ -31,10 +31,10 @@ forward_decls qw/vp8_common_forward_decls/; # # Dequant # -add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; +add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *DQC"; specialize qw/vp8_dequantize_b mmx neon msa mmi/; -add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; +add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *dest, int stride"; specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; @@ -46,20 +46,20 @@ specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/; # # Loopfilter # -add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/; 
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; @@ -67,7 +67,7 @@ $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; $vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi; -add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/; $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; @@ -75,7 +75,7 @@ $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; $vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi; -add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/; $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; @@ -83,7 +83,7 @@ $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; $vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi; -add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/; $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; @@ -95,31 +95,31 @@ $vp8_loop_filter_simple_bh_mmi=vp8_loop_filter_bhs_mmi; # IDCT # #idct16 -add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; +add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride"; specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/; #iwalsh1 -add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output"; +add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *mb_dqcoeff"; specialize qw/vp8_short_inv_walsh4x4_1 dspr2/; #iwalsh16 -add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; +add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *mb_dqcoeff"; specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/; #idct1_scalar_add -add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; +add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride"; specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/; # # RECON # -add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_copy_mem8x8/, 
"unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/; -add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # @@ -127,11 +127,11 @@ specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; specialize qw/vp8_filter_by_weight16x16 sse2 msa/; @@ -145,29 +145,29 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { # # Subpixel # -add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize 
qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; -add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/; -add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict8x4 mmx neon msa/; +add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; +specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/; -add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict4x4 mmx neon msa/; +add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; +specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/; # # Encoder functions below this point. @@ -177,10 +177,8 @@ if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") { # # Block copy # -if ($opts{arch} =~ /x86/) { - add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n"; - specialize qw/vp8_copy32xn sse2 sse3/; -} +add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height"; +specialize qw/vp8_copy32xn sse2 sse3/; # # Forward DCT @@ -223,7 +221,7 @@ specialize qw/vp8_full_search_sad sse3 sse4_1/; $vp8_full_search_sad_sse3=vp8_full_search_sadx3; $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; -add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; +add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_refining_search_sad sse2 msa/; $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; $vp8_refining_search_sad_msa=vp8_refining_search_sadx4; diff --git a/libs/libvpx/vp8/common/setupintrarecon.h b/libs/libvpx/vp8/common/setupintrarecon.h index f3ffa16607..903a536aed 100644 --- a/libs/libvpx/vp8/common/setupintrarecon.h +++ b/libs/libvpx/vp8/common/setupintrarecon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_SETUPINTRARECON_H_ -#define VP8_COMMON_SETUPINTRARECON_H_ +#ifndef VPX_VP8_COMMON_SETUPINTRARECON_H_ +#define VPX_VP8_COMMON_SETUPINTRARECON_H_ #include "./vpx_config.h" #include "vpx_scale/yv12config.h" @@ -37,4 +37,4 @@ static INLINE void setup_intra_recon_left(unsigned char *y_buffer, } // extern "C" #endif -#endif // VP8_COMMON_SETUPINTRARECON_H_ +#endif // VPX_VP8_COMMON_SETUPINTRARECON_H_ diff --git a/libs/libvpx/vp8/common/swapyv12buffer.h b/libs/libvpx/vp8/common/swapyv12buffer.h index 0ee9a52ceb..e37c471f63 100644 --- a/libs/libvpx/vp8/common/swapyv12buffer.h +++ b/libs/libvpx/vp8/common/swapyv12buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_SWAPYV12BUFFER_H_ -#define VP8_COMMON_SWAPYV12BUFFER_H_ +#ifndef VPX_VP8_COMMON_SWAPYV12BUFFER_H_ +#define VPX_VP8_COMMON_SWAPYV12BUFFER_H_ #include "vpx_scale/yv12config.h" @@ -24,4 +24,4 @@ void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, } // extern "C" #endif -#endif // VP8_COMMON_SWAPYV12BUFFER_H_ +#endif // VPX_VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/libs/libvpx/vp8/common/systemdependent.h b/libs/libvpx/vp8/common/systemdependent.h index 3d44e37cf2..83a5513aae 100644 --- a/libs/libvpx/vp8/common/systemdependent.h +++ b/libs/libvpx/vp8/common/systemdependent.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_ -#define VP8_COMMON_SYSTEMDEPENDENT_H_ +#ifndef VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ +#define VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ #include "vpx_config.h" @@ -24,4 +24,4 @@ void vp8_machine_specific_config(struct VP8Common *); } // extern "C" #endif -#endif // VP8_COMMON_SYSTEMDEPENDENT_H_ +#endif // VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ diff --git a/libs/libvpx/vp8/common/threading.h b/libs/libvpx/vp8/common/threading.h index c89cf9bad7..58b9013726 100644 --- a/libs/libvpx/vp8/common/threading.h +++ b/libs/libvpx/vp8/common/threading.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_THREADING_H_ -#define VP8_COMMON_THREADING_H_ +#ifndef VPX_VP8_COMMON_THREADING_H_ +#define VPX_VP8_COMMON_THREADING_H_ #include "./vpx_config.h" @@ -171,11 +171,15 @@ static inline int sem_destroy(sem_t *sem) { #define sem_wait(sem) (semaphore_wait(*sem)) #define sem_post(sem) semaphore_signal(*sem) #define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) -#define thread_sleep(nms) { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} +#define thread_sleep(nms) +/* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = + 1000*nms;nanosleep(&ts, NULL);} */ #else #include #include -#define thread_sleep(nms) {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} +#define thread_sleep(nms) sched_yield(); +/* {struct timespec ts;ts.tv_sec=0; + ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ #endif /* Not Windows. 
Assume pthreads */ @@ -195,7 +199,7 @@ static INLINE void vp8_atomic_spin_wait( const int nsync) { while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { x86_pause_hint(); - thread_sleep(1); + thread_sleep(0); } } @@ -205,4 +209,4 @@ static INLINE void vp8_atomic_spin_wait( } // extern "C" #endif -#endif // VP8_COMMON_THREADING_H_ +#endif // VPX_VP8_COMMON_THREADING_H_ diff --git a/libs/libvpx/vp8/common/treecoder.c b/libs/libvpx/vp8/common/treecoder.c index 9feb40a5a7..f1e78f4321 100644 --- a/libs/libvpx/vp8/common/treecoder.c +++ b/libs/libvpx/vp8/common/treecoder.c @@ -12,6 +12,7 @@ #include #include "vp8/common/treecoder.h" +#include "vpx/vpx_integer.h" static void tree2tok(struct vp8_token_struct *const p, vp8_tree t, int i, int v, int L) { @@ -79,7 +80,7 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ vp8_prob probs[/* n-1 */], unsigned int branch_ct[/* n-1 */][2], const unsigned int num_events[/* n */], - unsigned int Pfac, int rd) { + unsigned int Pfactor, int Round) { const int tree_len = n - 1; int t = 0; @@ -89,10 +90,10 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ const unsigned int *const c = branch_ct[t]; const unsigned int tot = c[0] + c[1]; - assert(tot < (1 << 24)); /* no overflow below */ - if (tot) { - const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot; + const unsigned int p = + (unsigned int)(((uint64_t)c[0] * Pfactor) + (Round ? tot >> 1 : 0)) / + tot; probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */ } else { probs[t] = vp8_prob_half; diff --git a/libs/libvpx/vp8/common/treecoder.h b/libs/libvpx/vp8/common/treecoder.h index d8503cf3f8..d7d8d0ead0 100644 --- a/libs/libvpx/vp8/common/treecoder.h +++ b/libs/libvpx/vp8/common/treecoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_TREECODER_H_ -#define VP8_COMMON_TREECODER_H_ +#ifndef VPX_VP8_COMMON_TREECODER_H_ +#define VPX_VP8_COMMON_TREECODER_H_ #ifdef __cplusplus extern "C" { @@ -32,7 +32,7 @@ typedef const bool_coder_spec c_bool_coder_spec; typedef const bool_writer c_bool_writer; typedef const bool_reader c_bool_reader; -#define vp8_complement(x) (255 - x) +#define vp8_complement(x) (255 - (x)) /* We build coding trees compactly in arrays. Each node of the tree is a pair of vp8_tree_indices. @@ -79,4 +79,4 @@ void vp8bc_tree_probs_from_distribution(int n, /* n = size of alphabet */ } // extern "C" #endif -#endif // VP8_COMMON_TREECODER_H_ +#endif // VPX_VP8_COMMON_TREECODER_H_ diff --git a/libs/libvpx/vp8/common/vp8_entropymodedata.h b/libs/libvpx/vp8/common/vp8_entropymodedata.h index 9a81ebfe62..3fc942e050 100644 --- a/libs/libvpx/vp8/common/vp8_entropymodedata.h +++ b/libs/libvpx/vp8/common/vp8_entropymodedata.h @@ -6,10 +6,10 @@ * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
-*/ + */ -#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ -#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#ifndef VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ #ifdef __cplusplus extern "C" { @@ -169,4 +169,4 @@ const vp8_prob } // extern "C" #endif -#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#endif // VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/libs/libvpx/vp8/common/vp8_skin_detection.h b/libs/libvpx/vp8/common/vp8_skin_detection.h index 4d27f5eb2e..ef0e4ae4fe 100644 --- a/libs/libvpx/vp8/common/vp8_skin_detection.h +++ b/libs/libvpx/vp8/common/vp8_skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_SKIN_DETECTION_H_ -#define VP8_COMMON_SKIN_DETECTION_H_ +#ifndef VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ +#define VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ #include "vp8/encoder/onyx_int.h" #include "vpx/vpx_integer.h" @@ -44,4 +44,4 @@ void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); } // extern "C" #endif -#endif // VP8_COMMON_SKIN_DETECTION_H_ +#endif // VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c new file mode 100644 index 0000000000..9bf65d8045 --- /dev/null +++ b/libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "vp8/common/filter.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void horizontal_16x16(uint8_t *src, const int stride, + uint16_t *dst, const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_lo); + _mm_store_si128((__m128i *)(dst + 8), a_hi); + src += stride; + dst += 16; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0); + const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0); + + const __m128i b = _mm_loadu_si128((__m128i *)(src + 1)); + const __m128i b_lo = _mm_unpacklo_epi8(b, zero); + const __m128i b_hi = _mm_unpackhi_epi8(b, zero); + const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1); + const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1); + + const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered); + const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + _mm_store_si128((__m128i *)dst, shifted_lo); + _mm_store_si128((__m128i *)(dst + 8), shifted_hi); + src += stride; + dst += 16; + } + } +} + +static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 16; ++h) { + const __m128i row_lo = _mm_load_si128((__m128i *)src); + const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i packed = _mm_packus_epi16(row_lo, row_hi); + _mm_store_si128((__m128i *)dst, packed); + src += 16; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0_lo = _mm_load_si128((__m128i *)src); + __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8)); + src += 16; + for (h = 0; h < 16; ++h) { + const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0); + const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0); + + const __m128i row_1_lo = _mm_load_si128((__m128i *)src); + const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1); + const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1); + + const __m128i sum_lo = + _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered); + const __m128i sum_hi = + _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered); + + const 
__m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi); + _mm_store_si128((__m128i *)dst, packed); + row_0_lo = row_1_lo; + row_0_hi = row_1_hi; + src += 16; + dst += stride; + } + } +} + +void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]); + + assert((xoffset | yoffset) != 0); + + horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_16x16(FData, dst_ptr, dst_pitch, yoffset); +} + +static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset, const int height) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadl_epi64((__m128i *)src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_u16); + src += stride; + dst += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + // Filter horizontally. Rather than load the whole array and transpose, load + // 16 values (overreading) and shift to set up the second value. Do an + // "extra" 9th line so the vertical pass has the necessary context. + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i b = _mm_srli_si128(a, 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_store_si128((__m128i *)dst, shifted); + src += stride; + dst += 8; + } + } +} + +static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset, const int height) { + int h; + + if (yoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i row = _mm_load_si128((__m128i *)src); + const __m128i packed = _mm_packus_epi16(row, row); + _mm_storel_epi64((__m128i *)dst, packed); + src += 8; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0 = _mm_load_si128((__m128i *)src); + src += 8; + for (h = 0; h < height; ++h) { + const __m128i row_1 = _mm_load_si128((__m128i *)src); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + const __m128i packed = 
_mm_packus_epi16(shifted, shifted); + _mm_storel_epi64((__m128i *)dst, packed); + row_0 = row_1; + src += 8; + dst += stride; + } + } +} + +void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8); +} + +void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4); +} + +static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_storel_epi64((__m128i *)dst, a_u16); + src += stride; + dst += 4; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i b = load_unaligned_u32(src + 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_storel_epi64((__m128i *)dst, shifted); + src += stride; + dst += 4; + } + } +} + +static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 4; h += 2) { + const __m128i row = _mm_load_si128((__m128i *)src); + __m128i packed = _mm_packus_epi16(row, row); + store_unaligned_u32(dst, packed); + dst += stride; + packed = _mm_srli_si128(packed, 4); + store_unaligned_u32(dst, packed); + dst += stride; + src += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + for (h = 0; h < 4; h += 2) { + const __m128i row_0 = _mm_load_si128((__m128i *)src); + const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4)); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + __m128i packed = _mm_packus_epi16(shifted, shifted); + storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + packed = _mm_srli_si128(packed, 4); + dst += stride; + storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + dst += stride; + src += 8; + } + } +} 
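The vp8_bilinear_predict*_sse2 wrappers in this new file (the 4x4 one follows below) share a two-pass structure: a horizontal pass writes 16-bit intermediates into the aligned FData buffer, producing one extra row, and a vertical pass blends adjacent intermediate rows and packs the result back to bytes. A minimal scalar sketch of that arithmetic, for orientation only and not part of the patch (the function name and the explicit width/height parameters are illustrative):

/* Scalar reference for the two-pass bilinear prediction implemented by the
 * SSE2 paths above. vp8_bilinear_filters[k] = { 128 - 16 * k, 16 * k }: the
 * taps sum to 128 (VP8_FILTER_WEIGHT) and the shift is 7 (VP8_FILTER_SHIFT),
 * so a filtered sum is at most 255 * 128 + 64 < (1 << 15) and fits the
 * signed 16-bit lanes that _mm_srai_epi16 shifts. */
static void bilinear_predict_ref(const unsigned char *src, int src_stride,
                                 int xoffset, int yoffset, unsigned char *dst,
                                 int dst_stride, int width, int height) {
  const int hf0 = 128 - 16 * xoffset, hf1 = 16 * xoffset;
  const int vf0 = 128 - 16 * yoffset, vf1 = 16 * yoffset;
  unsigned short tmp[16 * 17]; /* width x (height + 1), like FData */
  int r, c;
  /* First pass: horizontal filter into 16-bit intermediates, with one extra
   * row so the vertical pass has both rows it needs for the last output. */
  for (r = 0; r < height + 1; ++r) {
    for (c = 0; c < width; ++c) {
      const int a = src[r * src_stride + c];
      const int b = src[r * src_stride + c + 1];
      tmp[r * width + c] = (unsigned short)((a * hf0 + b * hf1 + 64) >> 7);
    }
  }
  /* Second pass: vertical filter between adjacent intermediate rows. The
   * result stays in [0, 255], matching the saturation packus performs. */
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int p = tmp[r * width + c] * vf0 + tmp[(r + 1) * width + c] * vf1;
      dst[r * dst_stride + c] = (unsigned char)((p + 64) >> 7);
    }
  }
}

The wrappers assert((xoffset | yoffset) != 0) because full-pel motion is served by the vp8_copy_mem* paths, so the intrinsics never see the pure-copy case; the xoffset == 0 and yoffset == 0 branches above handle the half-filtered cases.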
+ +void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_4x4(FData, dst_ptr, dst_pitch, yoffset); +} diff --git a/libs/libvpx/vp8/common/x86/filter_x86.c b/libs/libvpx/vp8/common/x86/filter_x86.c deleted file mode 100644 index 2405342f02..0000000000 --- a/libs/libvpx/vp8/common/x86/filter_x86.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp8/common/x86/filter_x86.h" - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = { - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; diff --git a/libs/libvpx/vp8/common/x86/filter_x86.h b/libs/libvpx/vp8/common/x86/filter_x86.h deleted file mode 100644 index d282841bee..0000000000 --- a/libs/libvpx/vp8/common/x86/filter_x86.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP8_COMMON_X86_FILTER_X86_H_ -#define VP8_COMMON_X86_FILTER_X86_H_ - -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with - * duplicated values */ - -/* duplicated 4x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); - -/* duplicated 8x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/libs/libvpx/vp8/common/x86/idct_blk_sse2.c b/libs/libvpx/vp8/common/x86/idct_blk_sse2.c index 8aefb27997..897ed5b652 100644 --- a/libs/libvpx/vp8/common/x86/idct_blk_sse2.c +++ b/libs/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -42,43 +42,43 @@ void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { if (((short *)(eobs))[0]) { if (((short *)(eobs))[0] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); } } q += 32; - dstu += stride * 4; + dst_u += stride * 4; if (((short *)(eobs))[1]) { if (((short *)(eobs))[1] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); } } q += 32; if (((short *)(eobs))[2]) { if (((short *)(eobs))[2] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); } } q += 32; - dstv += stride * 4; + dst_v += stride * 4; if (((short *)(eobs))[3]) { if (((short *)(eobs))[3] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); } } } diff --git a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm index 82d7bf91a6..0043e93b06 100644 --- a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -13,7 +13,7 @@ SECTION .text -;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) +;void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff) global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE sym(vp8_short_inv_walsh4x4_sse2): push rbp diff --git a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm index 1f3a2baca0..67bcd0cbd7 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -10,8 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) - %define BLOCK_HEIGHT_WIDTH 4 %define vp8_filter_weight 128 @@ -205,280 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx): ret -;void bilinear_predict8x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x4_mmx) PRIVATE 
-sym(vp8_bilinear_predict8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - mov rsi, arg(0) ;src_ptr ; - add rax, rcx - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x4: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 -%endif - cmp rdi, rcx ; - jne .next_row_8x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict4x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict4x4_mmx) PRIVATE -sym(vp8_bilinear_predict4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;ldst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, 
dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movq mm7, mm3 ; - packuswb mm7, mm0 ; - - add rsi, rdx ; next line -.next_row_4x4: - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - - movq mm5, mm7 ; - punpcklbw mm5, mm0 ; - - pmullw mm5, [rax] ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - movq mm7, mm3 ; - - packuswb mm7, mm0 ; - - pmullw mm3, [rax+16] ; - paddw mm3, mm5 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb mm3, mm0 - movd [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch ; - add rsi, rdx ; next line - add rdi, r8 -%endif - - cmp rdi, rcx ; - jne .next_row_4x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - SECTION_RODATA align 16 rd: diff --git a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm index 6e70f6d2e8..51c015e3df 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -10,7 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 %define VP8_FILTER_WEIGHT 128 @@ -958,419 +957,6 @@ sym(vp8_unpack_block1d16_h6_sse2): ret -;void vp8_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp8_bilinear_filters_x86_8) -global sym(vp8_bilinear_predict16x16_sse2) PRIVATE -sym(vp8_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw 
xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw 
xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_sse2) PRIVATE -sym(vp8_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret 
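The MMX and SSE2 assembly bilinear predictors deleted here are superseded by the intrinsics in the new vp8/common/x86/bilinear_filter_sse2.c, with the specialize lines changed earlier in vp8_rtcd_defs.pl retargeting the 8x4 and 4x4 entries from mmx to sse2. For orientation, this is roughly the dispatch that rtcd.pl generates into vp8_rtcd.h, a simplified sketch assumed from the RTCD convention rather than text taken from this patch:

#include "vpx_ports/x86.h" /* x86_simd_caps() and the HAS_SSE2 flag */

void vp8_bilinear_predict4x4_c(unsigned char *src_ptr,
                               int src_pixels_per_line, int xoffset,
                               int yoffset, unsigned char *dst_ptr,
                               int dst_pitch);
void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
                                  int dst_pitch);

/* The codec calls through this pointer; the generated header declares it
 * with RTCD_EXTERN, written plainly here for brevity. */
void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr,
                                int src_pixels_per_line, int xoffset,
                                int yoffset, unsigned char *dst_ptr,
                                int dst_pitch);

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps();
  /* Install the C fallback first, then let each detected extension win. */
  vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
  if (flags & HAS_SSE2)
    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2;
}

The practical effect of dropping the mmx specialization is confined to machines without SSE2, which now take the C fallback where they previously ran the MMX assembly; the predicted output is the same either way.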
- - SECTION_RODATA align 16 rd: diff --git a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c index b9d087e20d..7fb83c2d5e 100644 --- a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c +++ b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "vpx_ports/mem.h" -#include "filter_x86.h" extern const short vp8_six_tap_x86[8][6 * 8]; @@ -95,9 +94,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]); /* Temp data bufffer used in filtering */ @@ -236,9 +233,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); if (xoffset) { @@ -351,8 +346,8 @@ void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, yoffset); } else { /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ int r; for (r = 0; r < 4; ++r) { diff --git a/libs/libvpx/vp8/decoder/dboolhuff.h b/libs/libvpx/vp8/decoder/dboolhuff.h index 04c027cd78..f2a18f0d90 100644 --- a/libs/libvpx/vp8/decoder/dboolhuff.h +++ b/libs/libvpx/vp8/decoder/dboolhuff.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DBOOLHUFF_H_ -#define VP8_DECODER_DBOOLHUFF_H_ +#ifndef VPX_VP8_DECODER_DBOOLHUFF_H_ +#define VPX_VP8_DECODER_DBOOLHUFF_H_ #include #include @@ -76,7 +76,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { } { - register int shift = vp8_norm[range]; + const unsigned char shift = vp8_norm[(unsigned char)range]; range <<= shift; value <<= shift; count -= shift; @@ -127,4 +127,4 @@ static INLINE int vp8dx_bool_error(BOOL_DECODER *br) { } // extern "C" #endif -#endif // VP8_DECODER_DBOOLHUFF_H_ +#endif // VPX_VP8_DECODER_DBOOLHUFF_H_ diff --git a/libs/libvpx/vp8/decoder/decodeframe.c b/libs/libvpx/vp8/decoder/decodeframe.c index 077bd3da26..650d1d0408 100644 --- a/libs/libvpx/vp8/decoder/decodeframe.c +++ b/libs/libvpx/vp8/decoder/decodeframe.c @@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi, static int read_is_valid(const unsigned char *start, size_t len, const unsigned char *end) { - return (start + len > start && start + len <= end); + return len != 0 && end > start && len <= (size_t)(end - start); } static unsigned int read_available_partition_size( @@ -686,6 +686,12 @@ static unsigned int read_available_partition_size( const unsigned char *partition_size_ptr = token_part_sizes + i * 3; unsigned int partition_size = 0; ptrdiff_t bytes_left = fragment_end - fragment_start; + if (bytes_left < 0) { + vpx_internal_error( + &pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition. No bytes left %d.", + (int)bytes_left); + } /* Calculate the length of this partition. The last partition * size is implicit. 
If the partition size can't be read, then * either use the remaining data in the buffer (for EC mode) @@ -750,6 +756,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, ptrdiff_t ext_first_part_size = token_part_sizes - pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1); + if (fragment_size < (unsigned int)ext_first_part_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); fragment_size -= (unsigned int)ext_first_part_size; if (fragment_size > 0) { pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size; @@ -767,6 +776,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, first_fragment_end, fragment_end, fragment_idx - 1, num_token_partitions); pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size; + if (fragment_size < (unsigned int)partition_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); fragment_size -= (unsigned int)partition_size; assert(fragment_idx <= num_token_partitions); if (fragment_size > 0) { @@ -1208,7 +1220,11 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; - vp8mt_decode_mb_rows(pbi, xd); + if (vp8mt_decode_mb_rows(pbi, xd)) { + vp8_decoder_remove_threads(pbi); + pbi->restart_threads = 1; + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, NULL); + } vp8_yv12_extend_frame_borders(yv12_fb_new); for (thread = 0; thread < pbi->decoding_thread_count; ++thread) { corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted; diff --git a/libs/libvpx/vp8/decoder/decodemv.h b/libs/libvpx/vp8/decoder/decodemv.h index f33b07351d..504e943d85 100644 --- a/libs/libvpx/vp8/decoder/decodemv.h +++ b/libs/libvpx/vp8/decoder/decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DECODEMV_H_ -#define VP8_DECODER_DECODEMV_H_ +#ifndef VPX_VP8_DECODER_DECODEMV_H_ +#define VPX_VP8_DECODER_DECODEMV_H_ #include "onyxd_int.h" @@ -23,4 +23,4 @@ void vp8_decode_mode_mvs(VP8D_COMP *); } // extern "C" #endif -#endif // VP8_DECODER_DECODEMV_H_ +#endif // VPX_VP8_DECODER_DECODEMV_H_ diff --git a/libs/libvpx/vp8/decoder/decoderthreading.h b/libs/libvpx/vp8/decoder/decoderthreading.h index c563cf6e93..3d49bc8317 100644 --- a/libs/libvpx/vp8/decoder/decoderthreading.h +++ b/libs/libvpx/vp8/decoder/decoderthreading.h @@ -8,15 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_DECODER_DECODERTHREADING_H_ -#define VP8_DECODER_DECODERTHREADING_H_ +#ifndef VPX_VP8_DECODER_DECODERTHREADING_H_ +#define VPX_VP8_DECODER_DECODERTHREADING_H_ #ifdef __cplusplus extern "C" { #endif #if CONFIG_MULTITHREAD -void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); void vp8_decoder_remove_threads(VP8D_COMP *pbi); void vp8_decoder_create_threads(VP8D_COMP *pbi); void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); @@ -27,4 +27,4 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); } // extern "C" #endif -#endif // VP8_DECODER_DECODERTHREADING_H_ +#endif // VPX_VP8_DECODER_DECODERTHREADING_H_ diff --git a/libs/libvpx/vp8/decoder/detokenize.h b/libs/libvpx/vp8/decoder/detokenize.h index f0b125444f..410a431ba0 100644 --- a/libs/libvpx/vp8/decoder/detokenize.h +++ b/libs/libvpx/vp8/decoder/detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DETOKENIZE_H_ -#define VP8_DECODER_DETOKENIZE_H_ +#ifndef VPX_VP8_DECODER_DETOKENIZE_H_ +#define VPX_VP8_DECODER_DETOKENIZE_H_ #include "onyxd_int.h" @@ -24,4 +24,4 @@ int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); } // extern "C" #endif -#endif // VP8_DECODER_DETOKENIZE_H_ +#endif // VPX_VP8_DECODER_DETOKENIZE_H_ diff --git a/libs/libvpx/vp8/decoder/ec_types.h b/libs/libvpx/vp8/decoder/ec_types.h index 0ab08b649a..84feb269df 100644 --- a/libs/libvpx/vp8/decoder/ec_types.h +++ b/libs/libvpx/vp8/decoder/ec_types.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_EC_TYPES_H_ -#define VP8_DECODER_EC_TYPES_H_ +#ifndef VPX_VP8_DECODER_EC_TYPES_H_ +#define VPX_VP8_DECODER_EC_TYPES_H_ #ifdef __cplusplus extern "C" { @@ -34,7 +34,9 @@ typedef struct { /* Structure used to hold all the overlaps of a macroblock. The overlaps of a * macroblock is further divided into block overlaps. */ -typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP; +typedef struct { + B_OVERLAP overlaps[16]; +} MB_OVERLAP; /* Structure for keeping track of motion vectors and which reference frame they * refer to. Used for motion vector interpolation. 
@@ -48,4 +50,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_DECODER_EC_TYPES_H_ +#endif // VPX_VP8_DECODER_EC_TYPES_H_ diff --git a/libs/libvpx/vp8/decoder/error_concealment.c b/libs/libvpx/vp8/decoder/error_concealment.c index e22141492c..85982e4de3 100644 --- a/libs/libvpx/vp8/decoder/error_concealment.c +++ b/libs/libvpx/vp8/decoder/error_concealment.c @@ -147,8 +147,8 @@ static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi, } } -void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, - union b_mode_info *bmi, int b_row, int b_col) { +static void calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, + union b_mode_info *bmi, int b_row, int b_col) { MB_OVERLAP *mb_overlap; int row, col, rel_row, rel_col; int new_row, new_col; @@ -280,9 +280,9 @@ static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi, int sub_col; for (sub_row = 0; sub_row < 4; ++sub_row) { for (sub_col = 0; sub_col < 4; ++sub_col) { - vp8_calculate_overlaps(overlaps, mb_rows, mb_cols, - &(prev_mi->bmi[sub_row * 4 + sub_col]), - 4 * mb_row + sub_row, 4 * mb_col + sub_col); + calculate_overlaps(overlaps, mb_rows, mb_cols, + &(prev_mi->bmi[sub_row * 4 + sub_col]), + 4 * mb_row + sub_row, 4 * mb_col + sub_col); } } } diff --git a/libs/libvpx/vp8/decoder/error_concealment.h b/libs/libvpx/vp8/decoder/error_concealment.h index 89c78c1442..608a79f189 100644 --- a/libs/libvpx/vp8/decoder/error_concealment.h +++ b/libs/libvpx/vp8/decoder/error_concealment.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_ -#define VP8_DECODER_ERROR_CONCEALMENT_H_ +#ifndef VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ +#define VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ #include "onyxd_int.h" #include "ec_types.h" @@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, int mb_row, int mb_col, } // extern "C" #endif -#endif // VP8_DECODER_ERROR_CONCEALMENT_H_ +#endif // VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ diff --git a/libs/libvpx/vp8/decoder/onyxd_if.c b/libs/libvpx/vp8/decoder/onyxd_if.c index f516eb0c78..c6fb51d0cb 100644 --- a/libs/libvpx/vp8/decoder/onyxd_if.c +++ b/libs/libvpx/vp8/decoder/onyxd_if.c @@ -16,6 +16,7 @@ #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/alloccommon.h" +#include "vp8/common/common.h" #include "vp8/common/loopfilter.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" @@ -321,21 +322,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx]; pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx]; - if (setjmp(pbi->common.error.jmp)) { - /* We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - */ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - - if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) { - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - } - goto decode_exit; - } - - pbi->common.error.setjmp = 1; - retcode = vp8_decode_frame(pbi); if (retcode < 0) { @@ -344,6 +330,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, } pbi->common.error.error_code = VPX_CODEC_ERROR; + // Propagate the error info. 
+ if (pbi->mb.error_info.error_code != 0) { + pbi->common.error.error_code = pbi->mb.error_info.error_code; + memcpy(pbi->common.error.detail, pbi->mb.error_info.detail, + sizeof(pbi->mb.error_info.detail)); + } goto decode_exit; } @@ -382,7 +374,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, pbi->last_time_stamp = time_stamp; decode_exit: - pbi->common.error.setjmp = 0; vpx_clear_system_state(); return retcode; } @@ -445,7 +436,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (setjmp(fb->pbi[0]->common.error.jmp)) { vp8_remove_decoder_instances(fb); - memset(fb->pbi, 0, sizeof(fb->pbi)); + vp8_zero(fb->pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } @@ -471,6 +462,6 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) { return VPX_CODEC_OK; } -int vp8dx_get_quantizer(const VP8D_COMP *cpi) { - return cpi->common.base_qindex; +int vp8dx_get_quantizer(const VP8D_COMP *pbi) { + return pbi->common.base_qindex; } diff --git a/libs/libvpx/vp8/decoder/onyxd_int.h b/libs/libvpx/vp8/decoder/onyxd_int.h index 5ecacdbb97..cf2c066d9b 100644 --- a/libs/libvpx/vp8/decoder/onyxd_int.h +++ b/libs/libvpx/vp8/decoder/onyxd_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_ONYXD_INT_H_ -#define VP8_DECODER_ONYXD_INT_H_ +#ifndef VPX_VP8_DECODER_ONYXD_INT_H_ +#define VPX_VP8_DECODER_ONYXD_INT_H_ #include "vpx_config.h" #include "vp8/common/onyxd.h" @@ -31,7 +31,9 @@ typedef struct { void *ptr2; } DECODETHREAD_DATA; -typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC; +typedef struct { + MACROBLOCKD mbd; +} MB_ROW_DEC; typedef struct { int enabled; @@ -116,11 +118,17 @@ typedef struct VP8D_COMP { vpx_decrypt_cb decrypt_cb; void *decrypt_state; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. 
+ int restart_threads; +#endif } VP8D_COMP; void vp8cx_init_de_quantizer(VP8D_COMP *pbi); void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); -int vp8_decode_frame(VP8D_COMP *cpi); +int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); @@ -128,8 +136,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ @@ -137,8 +145,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) @@ -148,4 +156,4 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); } // extern "C" #endif -#endif // VP8_DECODER_ONYXD_INT_H_ +#endif // VPX_VP8_DECODER_ONYXD_INT_H_ diff --git a/libs/libvpx/vp8/decoder/threading.c b/libs/libvpx/vp8/decoder/threading.c index d0213f75c1..561922de32 100644 --- a/libs/libvpx/vp8/decoder/threading.c +++ b/libs/libvpx/vp8/decoder/threading.c @@ -15,8 +15,8 @@ #endif #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" +#include "vp8/common/common.h" #include "vp8/common/threading.h" - #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" @@ -400,16 +400,32 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; - xd->pre.y_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset; - xd->pre.u_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset; - xd->pre.v_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset; - /* propagate errors from reference frames */ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; + if (xd->corrupted) { + // Move current decoding marcoblock to the end of row for all rows + // assigned to this thread, such that other threads won't be waiting. + for (; mb_row < pc->mb_rows; + mb_row += (pbi->decoding_thread_count + 1)) { + current_mb_col = &pbi->mt_current_mb_col[mb_row]; + vpx_atomic_store_release(current_mb_col, pc->mb_cols + nsync); + } + vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Corrupted reference frame"); + } + + if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) { + const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame; + xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset; + } else { + // ref_frame is INTRA_FRAME, pre buffer should not be used. 
+ xd->pre.y_buffer = 0; + xd->pre.u_buffer = 0; + xd->pre.v_buffer = 0; + } mt_decode_macroblock(pbi, xd, 0); xd->left_available = 1; @@ -557,8 +573,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count; } - /* signal end of frame decoding if this thread processed the last mb_row */ - if (last_mb_row == (pc->mb_rows - 1)) sem_post(&pbi->h_event_end_decoding); + /* signal end of decoding of current thread for current frame */ + if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows) + sem_post(&pbi->h_event_end_decoding); } static THREAD_FUNCTION thread_decoding_proc(void *p_data) { @@ -576,7 +593,13 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { } else { MACROBLOCKD *xd = &mbrd->mbd; xd->left_context = &mb_row_left_context; - + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + // Signal the end of decoding for current thread. + sem_post(&pbi->h_event_end_decoding); + continue; + } + xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, ithread + 1); } } @@ -738,25 +761,28 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_yabove_row[i], - vpx_memalign( - 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (width + (VP8BORDERINPIXELS << 1)))); + vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); + } CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_uabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); + } CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_vabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); + } /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); @@ -812,7 +838,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { } } -void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { VP8_COMMON *pc = &pbi->common; unsigned int i; int j; @@ -858,7 +884,22 @@ void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { sem_post(&pbi->h_event_start_decoding[i]); } + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + xd->corrupted = 1; + // Wait for other threads to finish. This prevents other threads decoding + // the current frame while the main thread starts decoding the next frame, + // which causes a data race. 
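+    // The accounting that makes this wait safe: each of the
+    // decoding_thread_count workers posts h_event_end_decoding exactly once
+    // per frame, from its own setjmp handler on error or after its last
+    // assigned row on success. The main thread's +1 post only happens on the
+    // success path, which is why this loop waits for one post fewer than the
+    // success-path loop below. A condensed sketch of the pattern
+    // (illustrative names; assumes vpx_internal_error() longjmps here):
+    //
+    //   if (setjmp(err.jmp)) {                        /* error path */
+    //     for (i = 0; i < nworkers; ++i) sem_wait(&done);
+    //     return -1;
+    //   }
+    //   err.setjmp = 1;                 /* errors may longjmp from now on */
+    //   decode_rows();                  /* posts &done once itself */
+    //   for (i = 0; i < nworkers + 1; ++i) sem_wait(&done);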
+    for (i = 0; i < pbi->decoding_thread_count; ++i)
+      sem_wait(&pbi->h_event_end_decoding);
+    return -1;
+  }
+
+  xd->error_info.setjmp = 1;
   mt_decode_mb_rows(pbi, xd, 0);
 
-  sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+  for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
+    sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+
+  return 0;
 }
diff --git a/libs/libvpx/vp8/decoder/treereader.h b/libs/libvpx/vp8/decoder/treereader.h
index dd0f0986e9..4bf938a741 100644
--- a/libs/libvpx/vp8/decoder/treereader.h
+++ b/libs/libvpx/vp8/decoder/treereader.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_TREEREADER_H_
-#define VP8_DECODER_TREEREADER_H_
+#ifndef VPX_VP8_DECODER_TREEREADER_H_
+#define VPX_VP8_DECODER_TREEREADER_H_
 
 #include "./vpx_config.h"
 #include "vp8/common/treecoder.h"
@@ -30,7 +30,7 @@ typedef BOOL_DECODER vp8_reader;
 static INLINE int vp8_treed_read(
     vp8_reader *const r, /* !!! must return a 0 or 1 !!! */
     vp8_tree t, const vp8_prob *const p) {
-  register vp8_tree_index i = 0;
+  vp8_tree_index i = 0;
 
   while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0) {
   }
@@ -42,4 +42,4 @@ static INLINE int vp8_treed_read(
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_TREEREADER_H_
+#endif  // VPX_VP8_DECODER_TREEREADER_H_
diff --git a/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
index c42005df6c..6fc60805f6 100644
--- a/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -9,6 +9,8 @@
  */
 
 #include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
 
 static const uint16_t inv_zig_zag[16] = { 1, 2,  6,  7,  3,  5,  8,  13,
@@ -26,9 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                      zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
+#ifndef __aarch64__
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
+#endif  // __aarch64__
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +70,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
 
+#ifdef __aarch64__
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
   eob_q32 = vmovl_u16(eob_d16);
   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // __aarch64__
+
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
   vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +88,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* dqcoeff = x * dequant */
   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }
diff --git a/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
index 76853e6524..99dff6b520 100644
--- a/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
+++ b/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) {
   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
   int16x4_t d16s16, d17s16, d26s16, dEmptys16;
diff --git a/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
index
8d6ea4ccbe..02056f2f90 100644 --- a/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c +++ b/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c @@ -9,6 +9,8 @@ */ #include + +#include "./vp8_rtcd.h" #include "vpx_ports/arm.h" #ifdef VPX_INCOMPATIBLE_GCC diff --git a/libs/libvpx/vp8/encoder/bitstream.c b/libs/libvpx/vp8/encoder/bitstream.c index 8cacb64505..64bf0a79e9 100644 --- a/libs/libvpx/vp8/encoder/bitstream.c +++ b/libs/libvpx/vp8/encoder/bitstream.c @@ -41,13 +41,6 @@ const int vp8cx_base_skip_false_prob[128] = { unsigned __int64 Sectionbits[500]; #endif -#ifdef VP8_ENTROPY_STATS -int intra_mode_stats[10][10][10]; -static unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]; -extern unsigned int active_section; -#endif - #ifdef MODE_STATS int count_mb_seg[4] = { 0, 0, 0, 0 }; #endif @@ -428,10 +421,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { vp8_convert_rfct_to_prob(cpi); -#ifdef VP8_ENTROPY_STATS - active_section = 1; -#endif - if (pc->mb_no_coeff_skip) { int total_mbs = pc->mb_rows * pc->mb_cols; @@ -472,10 +461,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; -#ifdef VP8_ENTROPY_STATS - active_section = 9; -#endif - if (cpi->mb.e_mbd.update_mb_segmentation_map) { write_mb_features(w, mi, &cpi->mb.e_mbd); } @@ -486,9 +471,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { if (rf == INTRA_FRAME) { vp8_write(w, 0, cpi->prob_intra_coded); -#ifdef VP8_ENTROPY_STATS - active_section = 6; -#endif write_ymode(w, mode, pc->fc.ymode_prob); if (mode == B_PRED) { @@ -522,28 +504,13 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { vp8_clamp_mv2(&best_mv, xd); vp8_mv_ref_probs(mv_ref_p, ct); - -#ifdef VP8_ENTROPY_STATS - accum_mv_refs(mode, ct); -#endif } -#ifdef VP8_ENTROPY_STATS - active_section = 3; -#endif - write_mv_ref(w, mode, mv_ref_p); switch (mode) /* new, split require MVs */ { - case NEWMV: - -#ifdef VP8_ENTROPY_STATS - active_section = 5; -#endif - - write_mv(w, &mi->mv.as_mv, &best_mv, mvc); - break; + case NEWMV: write_mv(w, &mi->mv.as_mv, &best_mv, mvc); break; case SPLITMV: { int j = 0; @@ -574,9 +541,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2[mv_contz]); if (blockmode == NEW4X4) { -#ifdef VP8_ENTROPY_STATS - active_section = 11; -#endif write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *)mvc); } } while (++j < cpi->mb.partition_info->count); @@ -642,10 +606,6 @@ static void write_kfmodes(VP8_COMP *cpi) { const B_PREDICTION_MODE L = left_block_mode(m, i); const int bm = m->bmi[i].as_mode; -#ifdef VP8_ENTROPY_STATS - ++intra_mode_stats[A][L][bm]; -#endif - write_bmode(bc, bm, vp8_kf_bmode_prob[A][L]); } while (++i < 16); } @@ -973,10 +933,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { vp8_write(w, u, upd); #endif -#ifdef VP8_ENTROPY_STATS - ++tree_update_hist[i][j][k][t][u]; -#endif - if (u) { /* send/use new probability */ @@ -990,16 +946,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { } while (++t < ENTROPY_NODES); -/* Accum token counts for generation of default statistics */ -#ifdef VP8_ENTROPY_STATS - t = 0; - - do { - context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t]; - } while (++t < MAX_ENTROPY_TOKENS); - -#endif - } while (++k < PREV_COEF_CONTEXTS); } while (++j < COEF_BANDS); } while (++i < BLOCK_TYPES); @@ -1097,12 +1043,18 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned 
char *dest,
   cx_data[1] = 0x01;
   cx_data[2] = 0x2a;
 
+    /* Pack scale and frame size into 16 bits. Store it 8 bits at a time.
+     * https://tools.ietf.org/html/rfc6386
+     * 9.1. Uncompressed Data Chunk
+     * 16 bits : (2 bits Horizontal Scale << 14) | Width (14 bits)
+     * 16 bits : (2 bits Vertical Scale << 14) | Height (14 bits)
+     */
     v = (pc->horiz_scale << 14) | pc->Width;
-    cx_data[3] = v;
+    cx_data[3] = v & 0xff;
     cx_data[4] = v >> 8;
 
     v = (pc->vert_scale << 14) | pc->Height;
-    cx_data[5] = v;
+    cx_data[5] = v & 0xff;
     cx_data[6] = v >> 8;
 
     extra_bytes_packed = 7;
@@ -1286,15 +1238,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 
   if (pc->frame_type != KEY_FRAME) vp8_write_bit(bc, pc->refresh_last_frame);
 
-#ifdef VP8_ENTROPY_STATS
-
-  if (pc->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-
-#endif
-
   vpx_clear_system_state();
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -1308,25 +1251,13 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
   vp8_update_coef_probs(cpi);
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-  active_section = 2;
-#endif
-
   /* Write out the mb_no_coeff_skip flag */
   vp8_write_bit(bc, pc->mb_no_coeff_skip);
 
   if (pc->frame_type == KEY_FRAME) {
     write_kfmodes(cpi);
-
-#ifdef VP8_ENTROPY_STATS
-    active_section = 8;
-#endif
   } else {
     pack_inter_mode_mvs(cpi);
-
-#ifdef VP8_ENTROPY_STATS
-    active_section = 1;
-#endif
   }
 
   vp8_stop_encode(bc);
@@ -1337,11 +1268,30 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 
   /* update frame tag */
   {
+    /* Pack partition size, show frame, version and frame type into 24 bits.
+     * Store it 8 bits at a time.
+     * https://tools.ietf.org/html/rfc6386
+     * 9.1. Uncompressed Data Chunk
+     *  The uncompressed data chunk comprises a common (for key frames and
+     *  interframes) 3-byte frame tag that contains four fields, as follows:
+     *
+     *  1. A 1-bit frame type (0 for key frames, 1 for interframes).
+     *
+     *  2. A 3-bit version number (0 - 3 are defined as four different
+     *     profiles with different decoding complexity; other values may be
+     *     defined for future variants of the VP8 data format).
+     *
+     *  3. A 1-bit show_frame flag (0 when current frame is not for display,
+     *     1 when current frame is for display).
+     *
+     *  4. A 19-bit field containing the size of the first data partition in
+     *     bytes
+     */
     int v = (oh.first_partition_length_in_bytes << 5) | (oh.show_frame << 4) |
             (oh.version << 1) | oh.type;
 
-    dest[0] = v;
-    dest[1] = v >> 8;
+    dest[0] = v & 0xff;
+    dest[1] = (v >> 8) & 0xff;
     dest[2] = v >> 16;
   }
 
@@ -1431,50 +1381,3 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
   }
 #endif
 }
-
-#ifdef VP8_ENTROPY_STATS
-void print_tree_update_probs() {
-  int i, j, k, l;
-  FILE *f = fopen("context.c", "a");
-  int Sum;
-  fprintf(f, "\n/* Update probabilities for token entropy tree.
*/\n\n"); - fprintf(f, - "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] " - "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n"); - - for (i = 0; i < BLOCK_TYPES; ++i) { - fprintf(f, " { \n"); - - for (j = 0; j < COEF_BANDS; ++j) { - fprintf(f, " {\n"); - - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - fprintf(f, " {"); - - for (l = 0; l < ENTROPY_NODES; ++l) { - Sum = - tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1]; - - if (Sum > 0) { - if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0) - fprintf(f, "%3ld, ", - (tree_update_hist[i][j][k][l][0] * 255) / Sum); - else - fprintf(f, "%3ld, ", 1); - } else - fprintf(f, "%3ld, ", 128); - } - - fprintf(f, "},\n"); - } - - fprintf(f, " },\n"); - } - - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} -#endif diff --git a/libs/libvpx/vp8/encoder/bitstream.h b/libs/libvpx/vp8/encoder/bitstream.h index ed45bff9e2..ee3f3e4aab 100644 --- a/libs/libvpx/vp8/encoder/bitstream.h +++ b/libs/libvpx/vp8/encoder/bitstream.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_BITSTREAM_H_ -#define VP8_ENCODER_BITSTREAM_H_ +#ifndef VPX_VP8_ENCODER_BITSTREAM_H_ +#define VPX_VP8_ENCODER_BITSTREAM_H_ #ifdef __cplusplus extern "C" { @@ -29,4 +29,4 @@ void vp8_update_coef_probs(struct VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_BITSTREAM_H_ +#endif // VPX_VP8_ENCODER_BITSTREAM_H_ diff --git a/libs/libvpx/vp8/encoder/block.h b/libs/libvpx/vp8/encoder/block.h index 492af0e41f..1bc5ef75bc 100644 --- a/libs/libvpx/vp8/encoder/block.h +++ b/libs/libvpx/vp8/encoder/block.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_BLOCK_H_ -#define VP8_ENCODER_BLOCK_H_ +#ifndef VPX_VP8_ENCODER_BLOCK_H_ +#define VPX_VP8_ENCODER_BLOCK_H_ #include "vp8/common/onyx.h" #include "vp8/common/blockd.h" @@ -165,4 +165,4 @@ typedef struct macroblock { } // extern "C" #endif -#endif // VP8_ENCODER_BLOCK_H_ +#endif // VPX_VP8_ENCODER_BLOCK_H_ diff --git a/libs/libvpx/vp8/encoder/boolhuff.c b/libs/libvpx/vp8/encoder/boolhuff.c index 04f8db9331..819c2f22a0 100644 --- a/libs/libvpx/vp8/encoder/boolhuff.c +++ b/libs/libvpx/vp8/encoder/boolhuff.c @@ -15,10 +15,6 @@ unsigned __int64 Sectionbits[500]; #endif -#ifdef VP8_ENTROPY_STATS -unsigned int active_section = 0; -#endif - const unsigned int vp8_prob_cost[256] = { 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, @@ -42,26 +38,26 @@ const unsigned int vp8_prob_cost[256] = { 12, 10, 9, 7, 6, 4, 3, 1, 1 }; -void vp8_start_encode(BOOL_CODER *br, unsigned char *source, +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, unsigned char *source_end) { - br->lowvalue = 0; - br->range = 255; - br->count = -24; - br->buffer = source; - br->buffer_end = source_end; - br->pos = 0; + bc->lowvalue = 0; + bc->range = 255; + bc->count = -24; + bc->buffer = source; + bc->buffer_end = source_end; + bc->pos = 0; } -void vp8_stop_encode(BOOL_CODER *br) { +void vp8_stop_encode(BOOL_CODER *bc) { int i; - for (i = 0; i < 32; ++i) vp8_encode_bool(br, 0, 128); + for (i = 0; i < 32; ++i) vp8_encode_bool(bc, 0, 128); } -void vp8_encode_value(BOOL_CODER *br, int data, int bits) { +void vp8_encode_value(BOOL_CODER *bc, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) { - vp8_encode_bool(br, (1 & (data >> bit)), 0x80); + vp8_encode_bool(bc, (1 & (data >> bit)), 0x80); } } diff --git 
a/libs/libvpx/vp8/encoder/boolhuff.h b/libs/libvpx/vp8/encoder/boolhuff.h index d001eea9cd..8ac0a2cc4a 100644 --- a/libs/libvpx/vp8/encoder/boolhuff.h +++ b/libs/libvpx/vp8/encoder/boolhuff.h @@ -9,14 +9,14 @@ */ /**************************************************************************** -* -* Module Title : boolhuff.h -* -* Description : Bool Coder header file. -* -****************************************************************************/ -#ifndef VP8_ENCODER_BOOLHUFF_H_ -#define VP8_ENCODER_BOOLHUFF_H_ + * + * Module Title : boolhuff.h + * + * Description : Bool Coder header file. + * + ****************************************************************************/ +#ifndef VPX_VP8_ENCODER_BOOLHUFF_H_ +#define VPX_VP8_ENCODER_BOOLHUFF_H_ #include "vpx_ports/mem.h" #include "vpx/internal/vpx_codec_internal.h" @@ -35,11 +35,11 @@ typedef struct { struct vpx_internal_error_info *error; } BOOL_CODER; -extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, - unsigned char *buffer_end); +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, + unsigned char *source_end); -extern void vp8_encode_value(BOOL_CODER *br, int data, int bits); -extern void vp8_stop_encode(BOOL_CODER *bc); +void vp8_encode_value(BOOL_CODER *bc, int data, int bits); +void vp8_stop_encode(BOOL_CODER *bc); extern const unsigned int vp8_prob_cost[256]; DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); @@ -56,23 +56,12 @@ static int validate_buffer(const unsigned char *start, size_t len, return 0; } -static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { +static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) { unsigned int split; - int count = br->count; - unsigned int range = br->range; - unsigned int lowvalue = br->lowvalue; - register int shift; - -#ifdef VP8_ENTROPY_STATS -#if defined(SECTIONBITS_OUTPUT) - - if (bit) - Sectionbits[active_section] += vp8_prob_cost[255 - probability]; - else - Sectionbits[active_section] += vp8_prob_cost[probability]; - -#endif -#endif + int count = bc->count; + unsigned int range = bc->range; + unsigned int lowvalue = bc->lowvalue; + int shift; split = 1 + (((range - 1) * probability) >> 8); @@ -80,7 +69,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { if (bit) { lowvalue += split; - range = br->range - split; + range = bc->range - split; } shift = vp8_norm[range]; @@ -92,18 +81,18 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { int offset = shift - count; if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = br->pos - 1; + int x = bc->pos - 1; - while (x >= 0 && br->buffer[x] == 0xff) { - br->buffer[x] = (unsigned char)0; + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; x--; } - br->buffer[x] += 1; + bc->buffer[x] += 1; } - validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error); - br->buffer[br->pos++] = (lowvalue >> (24 - offset)); + validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error); + bc->buffer[bc->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -112,13 +101,13 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { } lowvalue <<= shift; - br->count = count; - br->lowvalue = lowvalue; - br->range = range; + bc->count = count; + bc->lowvalue = lowvalue; + bc->range = range; } #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_BOOLHUFF_H_ +#endif // VPX_VP8_ENCODER_BOOLHUFF_H_ diff --git a/libs/libvpx/vp8/common/copy_c.c 
b/libs/libvpx/vp8/encoder/copy_c.c similarity index 100% rename from libs/libvpx/vp8/common/copy_c.c rename to libs/libvpx/vp8/encoder/copy_c.c diff --git a/libs/libvpx/vp8/encoder/dct_value_cost.h b/libs/libvpx/vp8/encoder/dct_value_cost.h index 278dce73f4..0cd6cb4e65 100644 --- a/libs/libvpx/vp8/encoder/dct_value_cost.h +++ b/libs/libvpx/vp8/encoder/dct_value_cost.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DCT_VALUE_COST_H_ -#define VP8_ENCODER_DCT_VALUE_COST_H_ +#ifndef VPX_VP8_ENCODER_DCT_VALUE_COST_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_COST_H_ #ifdef __cplusplus extern "C" { @@ -341,4 +341,4 @@ static const short dct_value_cost[2048 * 2] = { } // extern "C" #endif -#endif // VP8_ENCODER_DCT_VALUE_COST_H_ +#endif // VPX_VP8_ENCODER_DCT_VALUE_COST_H_ diff --git a/libs/libvpx/vp8/encoder/dct_value_tokens.h b/libs/libvpx/vp8/encoder/dct_value_tokens.h index 0597deab2d..5cc4505f09 100644 --- a/libs/libvpx/vp8/encoder/dct_value_tokens.h +++ b/libs/libvpx/vp8/encoder/dct_value_tokens.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_ -#define VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#ifndef VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ #ifdef __cplusplus extern "C" { @@ -845,4 +845,4 @@ static const TOKENVALUE dct_value_tokens[2048 * 2] = { } // extern "C" #endif -#endif // VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#endif // VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ diff --git a/libs/libvpx/vp8/encoder/defaultcoefcounts.h b/libs/libvpx/vp8/encoder/defaultcoefcounts.h index 2976325dc5..a3ab34c8a0 100644 --- a/libs/libvpx/vp8/encoder/defaultcoefcounts.h +++ b/libs/libvpx/vp8/encoder/defaultcoefcounts.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ -#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#ifndef VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#define VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ #ifdef __cplusplus extern "C" { @@ -232,4 +232,4 @@ static const unsigned int default_coef_counts } // extern "C" #endif -#endif // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#endif // VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ diff --git a/libs/libvpx/vp8/encoder/denoising.c b/libs/libvpx/vp8/encoder/denoising.c index eb963b97e3..e54d1e9f4b 100644 --- a/libs/libvpx/vp8/encoder/denoising.c +++ b/libs/libvpx/vp8/encoder/denoising.c @@ -213,13 +213,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, return FILTER_BLOCK; } -int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, - int mc_avg_uv_stride, - unsigned char *running_avg_uv, int avg_uv_stride, +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, + unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { - unsigned char *running_avg_uv_start = running_avg_uv; + unsigned char *running_avg_start = running_avg; unsigned char *sig_start = sig; int sum_diff_thresh; int r, c; @@ -259,13 +258,13 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, int adjustment = 0; int absdiff = 0; - diff = mc_running_avg_uv[c] - sig[c]; + diff = mc_running_avg[c] - sig[c]; absdiff = abs(diff); // When |diff| <= |3 + shift_inc1|, use pixel value from // last denoised raw. 
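       // (Beyond that threshold, the code below applies only a small, capped
       // adjustment and saturates the result to [0, 255], so a single noisy
       // pixel cannot drag the running average far in one frame.)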
if (absdiff <= 3 + shift_inc1) { - running_avg_uv[c] = mc_running_avg_uv[c]; + running_avg[c] = mc_running_avg[c]; sum_diff += diff; } else { if (absdiff >= 4 && absdiff <= 7) { @@ -277,16 +276,16 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } if (diff > 0) { if ((sig[c] + adjustment) > 255) { - running_avg_uv[c] = 255; + running_avg[c] = 255; } else { - running_avg_uv[c] = sig[c] + adjustment; + running_avg[c] = sig[c] + adjustment; } sum_diff += adjustment; } else { if ((sig[c] - adjustment) < 0) { - running_avg_uv[c] = 0; + running_avg[c] = 0; } else { - running_avg_uv[c] = sig[c] - adjustment; + running_avg[c] = sig[c] - adjustment; } sum_diff -= adjustment; } @@ -294,8 +293,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } /* Update pointers for next iteration. */ sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; @@ -314,27 +313,27 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // Only apply the adjustment for max delta up to 3. if (delta < 4) { sig -= sig_stride * 8; - mc_running_avg_uv -= mc_avg_uv_stride * 8; - running_avg_uv -= avg_uv_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; for (r = 0; r < 8; ++r) { for (c = 0; c < 8; ++c) { - int diff = mc_running_avg_uv[c] - sig[c]; + int diff = mc_running_avg[c] - sig[c]; int adjustment = abs(diff); if (adjustment > delta) adjustment = delta; if (diff > 0) { // Bring denoised signal down. - if (running_avg_uv[c] - adjustment < 0) { - running_avg_uv[c] = 0; + if (running_avg[c] - adjustment < 0) { + running_avg[c] = 0; } else { - running_avg_uv[c] = running_avg_uv[c] - adjustment; + running_avg[c] = running_avg[c] - adjustment; } sum_diff -= adjustment; } else if (diff < 0) { // Bring denoised signal up. - if (running_avg_uv[c] + adjustment > 255) { - running_avg_uv[c] = 255; + if (running_avg[c] + adjustment > 255) { + running_avg[c] = 255; } else { - running_avg_uv[c] = running_avg_uv[c] + adjustment; + running_avg[c] = running_avg[c] + adjustment; } sum_diff += adjustment; } @@ -342,8 +341,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // TODO(marpan): Check here if abs(sum_diff) has gone below the // threshold sum_diff_thresh, and if so, we can exit the row loop. sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK; } else { @@ -351,7 +350,7 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } } - vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, sig_stride); + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); return FILTER_BLOCK; } diff --git a/libs/libvpx/vp8/encoder/denoising.h b/libs/libvpx/vp8/encoder/denoising.h index 91d87b3a1c..51ae3b0ab3 100644 --- a/libs/libvpx/vp8/encoder/denoising.h +++ b/libs/libvpx/vp8/encoder/denoising.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_DENOISING_H_ -#define VP8_ENCODER_DENOISING_H_ +#ifndef VPX_VP8_ENCODER_DENOISING_H_ +#define VPX_VP8_ENCODER_DENOISING_H_ #include "block.h" #include "vp8/common/loopfilter.h" @@ -100,4 +100,4 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x, } // extern "C" #endif -#endif // VP8_ENCODER_DENOISING_H_ +#endif // VPX_VP8_ENCODER_DENOISING_H_ diff --git a/libs/libvpx/vp8/encoder/encodeframe.c b/libs/libvpx/vp8/encoder/encodeframe.c index 9bb0df72d5..2b3d9564ce 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.c +++ b/libs/libvpx/vp8/encoder/encodeframe.c @@ -64,9 +64,9 @@ unsigned int b_modes[14] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; * Eventually this should be replaced by custom no-reference routines, * which will be faster. */ -static const unsigned char VP8_VAR_OFFS[16] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; +static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128 }; /* Original activity measure from Tim T's code. */ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { diff --git a/libs/libvpx/vp8/encoder/encodeframe.h b/libs/libvpx/vp8/encoder/encodeframe.h index 5274aba412..cc8cf4d713 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.h +++ b/libs/libvpx/vp8/encoder/encodeframe.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEFRAME_H_ -#define VP8_ENCODER_ENCODEFRAME_H_ +#ifndef VPX_VP8_ENCODER_ENCODEFRAME_H_ +#define VPX_VP8_ENCODER_ENCODEFRAME_H_ #include "vp8/encoder/tokenize.h" @@ -37,4 +37,4 @@ int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEFRAME_H_ +#endif // VPX_VP8_ENCODER_ENCODEFRAME_H_ diff --git a/libs/libvpx/vp8/encoder/encodeintra.h b/libs/libvpx/vp8/encoder/encodeintra.h index 3956cf5fb1..021dc5ed76 100644 --- a/libs/libvpx/vp8/encoder/encodeintra.h +++ b/libs/libvpx/vp8/encoder/encodeintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEINTRA_H_ -#define VP8_ENCODER_ENCODEINTRA_H_ +#ifndef VPX_VP8_ENCODER_ENCODEINTRA_H_ +#define VPX_VP8_ENCODER_ENCODEINTRA_H_ #include "onyx_int.h" #ifdef __cplusplus @@ -25,4 +25,4 @@ void vp8_encode_intra4x4block(MACROBLOCK *x, int ib); } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEINTRA_H_ +#endif // VPX_VP8_ENCODER_ENCODEINTRA_H_ diff --git a/libs/libvpx/vp8/encoder/encodemb.h b/libs/libvpx/vp8/encoder/encodemb.h index b55ba3ac3f..db577ddc10 100644 --- a/libs/libvpx/vp8/encoder/encodemb.h +++ b/libs/libvpx/vp8/encoder/encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_ENCODEMB_H_ -#define VP8_ENCODER_ENCODEMB_H_ +#ifndef VPX_VP8_ENCODER_ENCODEMB_H_ +#define VPX_VP8_ENCODER_ENCODEMB_H_ #include "onyx_int.h" @@ -37,4 +37,4 @@ void vp8_encode_inter16x16y(MACROBLOCK *x); } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEMB_H_ +#endif // VPX_VP8_ENCODER_ENCODEMB_H_ diff --git a/libs/libvpx/vp8/encoder/encodemv.c b/libs/libvpx/vp8/encoder/encodemv.c index ea93ccd710..04adf105b9 100644 --- a/libs/libvpx/vp8/encoder/encodemv.c +++ b/libs/libvpx/vp8/encoder/encodemv.c @@ -16,10 +16,6 @@ #include -#ifdef VP8_ENTROPY_STATS -extern unsigned int active_section; -#endif - static void encode_mvcomponent(vp8_writer *const w, const int v, const struct mv_context *mvc) { const vp8_prob *p = mvc->prob; @@ -309,9 +305,6 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { vp8_writer *const w = cpi->bc; MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = { 0, 0 }; -#ifdef VP8_ENTROPY_STATS - active_section = 4; -#endif write_component_probs(w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->mb.MVcount[0], 0, &flags[0]); @@ -323,8 +316,4 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { vp8_build_component_cost_table( cpi->mb.mvcost, (const MV_CONTEXT *)cpi->common.fc.mvc, flags); } - -#ifdef VP8_ENTROPY_STATS - active_section = 5; -#endif } diff --git a/libs/libvpx/vp8/encoder/encodemv.h b/libs/libvpx/vp8/encoder/encodemv.h index 87db30f310..347b9feffe 100644 --- a/libs/libvpx/vp8/encoder/encodemv.h +++ b/libs/libvpx/vp8/encoder/encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEMV_H_ -#define VP8_ENCODER_ENCODEMV_H_ +#ifndef VPX_VP8_ENCODER_ENCODEMV_H_ +#define VPX_VP8_ENCODER_ENCODEMV_H_ #include "onyx_int.h" @@ -26,4 +26,4 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEMV_H_ +#endif // VPX_VP8_ENCODER_ENCODEMV_H_ diff --git a/libs/libvpx/vp8/encoder/ethreading.h b/libs/libvpx/vp8/encoder/ethreading.h index 95bf73d182..598fe60559 100644 --- a/libs/libvpx/vp8/encoder/ethreading.h +++ b/libs/libvpx/vp8/encoder/ethreading.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_ETHREADING_H_ -#define VP8_ENCODER_ETHREADING_H_ +#ifndef VPX_VP8_ENCODER_ETHREADING_H_ +#define VPX_VP8_ENCODER_ETHREADING_H_ #include "vp8/encoder/onyx_int.h" @@ -29,4 +29,4 @@ void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); } #endif -#endif // VP8_ENCODER_ETHREADING_H_ +#endif // VPX_VP8_ENCODER_ETHREADING_H_ diff --git a/libs/libvpx/vp8/encoder/firstpass.c b/libs/libvpx/vp8/encoder/firstpass.c index 70f9243410..4ea991e524 100644 --- a/libs/libvpx/vp8/encoder/firstpass.c +++ b/libs/libvpx/vp8/encoder/firstpass.c @@ -989,11 +989,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, bits_per_mb_at_this_q = vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; - bits_per_mb_at_this_q = (int)(.5 + - err_correction_factor * speed_correction * - cpi->twopass.est_max_qcorrection_factor * - cpi->twopass.section_max_qfactor * - (double)bits_per_mb_at_this_q); + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + cpi->twopass.section_max_qfactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1086,9 +1086,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; bits_per_mb_at_this_q = - (int)(.5 + - err_correction_factor * speed_correction * clip_iifactor * - (double)bits_per_mb_at_this_q); + (int)(.5 + err_correction_factor * speed_correction * clip_iifactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1273,9 +1272,8 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
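   * For example, 300 first-pass frames whose durations sum to 100,000,000
   * ticks (10 seconds at the 10,000,000 ticks-per-second timebase used
   * throughout vp8) give 10000000.0 * 300 / 100000000.0 = 30 fps.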
*/ - vp8_new_framerate(cpi, - 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * @@ -1739,10 +1737,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Dont break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && - (!flash_detected) && ((mv_ratio_accumulator > 100.0) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < 2.0)))) { + (!flash_detected) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0)))) { boost_score = old_boost_score; break; } @@ -1815,8 +1814,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (next_frame.pcnt_inter > 0.75) && ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) && - (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <= - (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) + (cpi->gfu_boost > 100) && + (cpi->twopass.gf_decay_rate <= + (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) #endif { int Boost; diff --git a/libs/libvpx/vp8/encoder/firstpass.h b/libs/libvpx/vp8/encoder/firstpass.h index ac8a7b1bfb..f5490f1eff 100644 --- a/libs/libvpx/vp8/encoder/firstpass.h +++ b/libs/libvpx/vp8/encoder/firstpass.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_FIRSTPASS_H_ -#define VP8_ENCODER_FIRSTPASS_H_ +#ifndef VPX_VP8_ENCODER_FIRSTPASS_H_ +#define VPX_VP8_ENCODER_FIRSTPASS_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count); } // extern "C" #endif -#endif // VP8_ENCODER_FIRSTPASS_H_ +#endif // VPX_VP8_ENCODER_FIRSTPASS_H_ diff --git a/libs/libvpx/vp8/encoder/lookahead.h b/libs/libvpx/vp8/encoder/lookahead.h index a67f226946..bf0401190b 100644 --- a/libs/libvpx/vp8/encoder/lookahead.h +++ b/libs/libvpx/vp8/encoder/lookahead.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_LOOKAHEAD_H_ -#define VP8_ENCODER_LOOKAHEAD_H_ +#ifndef VPX_VP8_ENCODER_LOOKAHEAD_H_ +#define VPX_VP8_ENCODER_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -74,7 +74,7 @@ int vp8_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, struct lookahead_entry *vp8_lookahead_pop(struct lookahead_ctx *ctx, int drain); #define PEEK_FORWARD 1 -#define PEEK_BACKWARD -1 +#define PEEK_BACKWARD (-1) /**\brief Get a future source buffer to encode * * \param[in] ctx Pointer to the lookahead context @@ -96,4 +96,4 @@ unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP8_ENCODER_LOOKAHEAD_H_ +#endif // VPX_VP8_ENCODER_LOOKAHEAD_H_ diff --git a/libs/libvpx/vp8/encoder/mcomp.c b/libs/libvpx/vp8/encoder/mcomp.c index 970120f3b2..999d6e851a 100644 --- a/libs/libvpx/vp8/encoder/mcomp.c +++ b/libs/libvpx/vp8/encoder/mcomp.c @@ -21,11 +21,6 @@ #include "vp8/common/common.h" #include "vpx_dsp/vpx_dsp_common.h" -#ifdef VP8_ENTROPY_STATS -static int mv_ref_ct[31][4][2]; -static int mv_mode_cts[4][2]; -#endif - int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { /* MV costing is based on the distribution of vectors in the previous * frame and as such will tend to over state the cost of vectors. In @@ -34,19 +29,22 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { * NEAREST for subsequent blocks. The "Weight" parameter allows, to a * limited extent, for some account to be taken of these factors. */ - return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + - mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * - Weight) >> - 7; + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit) { /* Ignore mv costing if mvcost is NULL */ if (mvcost) { - return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + - mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * - error_per_bit + + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit + 128) >> 8; } @@ -1131,6 +1129,7 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSE2 || HAVE_MSA int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, int *num00, vp8_variance_fn_ptr_t *fn_ptr, @@ -1279,6 +1278,7 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSE2 || HAVE_MSA int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, @@ -1366,6 +1366,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSSE3 int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], @@ 
-1484,7 +1485,9 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSSE3 +#if HAVE_SSE4_1 int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], @@ -1630,6 +1633,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSE4_1 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, @@ -1709,6 +1713,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSE2 || HAVE_MSA int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, int search_range, vp8_variance_fn_ptr_t *fn_ptr, @@ -1818,96 +1823,4 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } - -#ifdef VP8_ENTROPY_STATS -void print_mode_context(void) { - FILE *f = fopen("modecont.c", "w"); - int i, j; - - fprintf(f, "#include \"entropy.h\"\n"); - fprintf(f, "const int vp8_mode_contexts[6][4] =\n"); - fprintf(f, "{\n"); - - for (j = 0; j < 6; ++j) { - fprintf(f, " { /* %d */\n", j); - fprintf(f, " "); - - for (i = 0; i < 4; ++i) { - int overal_prob; - int this_prob; - int count; - - /* Overall probs */ - count = mv_mode_cts[i][0] + mv_mode_cts[i][1]; - - if (count) - overal_prob = 256 * mv_mode_cts[i][0] / count; - else - overal_prob = 128; - - if (overal_prob == 0) overal_prob = 1; - - /* context probs */ - count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; - - if (count) - this_prob = 256 * mv_ref_ct[j][i][0] / count; - else - this_prob = 128; - - if (this_prob == 0) this_prob = 1; - - fprintf(f, "%5d, ", this_prob); - } - - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} - -/* MV ref count VP8_ENTROPY_STATS stats code */ -#ifdef VP8_ENTROPY_STATS -void init_mv_ref_counts() { - memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); - memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); -} - -void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) { - if (m == ZEROMV) { - ++mv_ref_ct[ct[0]][0][0]; - ++mv_mode_cts[0][0]; - } else { - ++mv_ref_ct[ct[0]][0][1]; - ++mv_mode_cts[0][1]; - - if (m == NEARESTMV) { - ++mv_ref_ct[ct[1]][1][0]; - ++mv_mode_cts[1][0]; - } else { - ++mv_ref_ct[ct[1]][1][1]; - ++mv_mode_cts[1][1]; - - if (m == NEARMV) { - ++mv_ref_ct[ct[2]][2][0]; - ++mv_mode_cts[2][0]; - } else { - ++mv_ref_ct[ct[2]][2][1]; - ++mv_mode_cts[2][1]; - - if (m == NEWMV) { - ++mv_ref_ct[ct[3]][3][0]; - ++mv_mode_cts[3][0]; - } else { - ++mv_ref_ct[ct[3]][3][1]; - ++mv_mode_cts[3][1]; - } - } - } - } -} - -#endif /* END MV ref count VP8_ENTROPY_STATS stats code */ - -#endif +#endif // HAVE_SSE2 || HAVE_MSA diff --git a/libs/libvpx/vp8/encoder/mcomp.h b/libs/libvpx/vp8/encoder/mcomp.h index b6228798ff..6c77995da4 100644 --- a/libs/libvpx/vp8/encoder/mcomp.h +++ b/libs/libvpx/vp8/encoder/mcomp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_MCOMP_H_ -#define VP8_ENCODER_MCOMP_H_ +#ifndef VPX_VP8_ENCODER_MCOMP_H_ +#define VPX_VP8_ENCODER_MCOMP_H_ #include "block.h" #include "vpx_dsp/variance.h" @@ -18,11 +18,6 @@ extern "C" { #endif -#ifdef VP8_ENTROPY_STATS -extern void init_mv_ref_counts(); -extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); -#endif - /* The maximum number of steps in a step search given the largest allowed * initial step */ @@ -34,15 +29,14 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); /* Maximum size of the first step in full pel units */ #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) -extern void print_mode_context(void); -extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); -extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); -extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); +int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); -extern int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int_mv *best_mv, int search_param, int error_per_bit, - const vp8_variance_fn_ptr_t *vf, int *mvsadcost[2], - int *mvcost[2], int_mv *center_mv); +int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], + int *mvcost[2], int_mv *center_mv); typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -51,10 +45,10 @@ typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int *mvcost[2], int *distortion, unsigned int *sse); -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; -extern fractional_mv_step_fp vp8_find_best_half_pixel_step; -extern fractional_mv_step_fp vp8_skip_fractional_mv_step; +fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; +fractional_mv_step_fp vp8_find_best_sub_pixel_step; +fractional_mv_step_fp vp8_find_best_half_pixel_step; +fractional_mv_step_fp vp8_skip_fractional_mv_step; typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, @@ -78,4 +72,4 @@ typedef int (*vp8_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } // extern "C" #endif -#endif // VP8_ENCODER_MCOMP_H_ +#endif // VPX_VP8_ENCODER_MCOMP_H_ diff --git a/libs/libvpx/vp8/encoder/modecosts.h b/libs/libvpx/vp8/encoder/modecosts.h index dfb8989f7f..09ee2b5520 100644 --- a/libs/libvpx/vp8/encoder/modecosts.h +++ b/libs/libvpx/vp8/encoder/modecosts.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_MODECOSTS_H_ -#define VP8_ENCODER_MODECOSTS_H_ +#ifndef VPX_VP8_ENCODER_MODECOSTS_H_ +#define VPX_VP8_ENCODER_MODECOSTS_H_ #ifdef __cplusplus extern "C" { @@ -17,10 +17,10 @@ extern "C" { struct VP8_COMP; -void vp8_init_mode_costs(struct VP8_COMP *x); +void vp8_init_mode_costs(struct VP8_COMP *c); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_MODECOSTS_H_ +#endif // VPX_VP8_ENCODER_MODECOSTS_H_ diff --git a/libs/libvpx/vp8/encoder/mr_dissim.h b/libs/libvpx/vp8/encoder/mr_dissim.h index da36628afa..58f5a97623 100644 --- a/libs/libvpx/vp8/encoder/mr_dissim.h +++ b/libs/libvpx/vp8/encoder/mr_dissim.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_MR_DISSIM_H_ -#define VP8_ENCODER_MR_DISSIM_H_ +#ifndef VPX_VP8_ENCODER_MR_DISSIM_H_ +#define VPX_VP8_ENCODER_MR_DISSIM_H_ #include "vpx_config.h" #ifdef __cplusplus @@ -24,4 +24,4 @@ extern void vp8_store_drop_frame_info(VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_MR_DISSIM_H_ +#endif // VPX_VP8_ENCODER_MR_DISSIM_H_ diff --git a/libs/libvpx/vp8/encoder/onyx_if.c b/libs/libvpx/vp8/encoder/onyx_if.c index 2243182425..4fd1574924 100644 --- a/libs/libvpx/vp8/encoder/onyx_if.c +++ b/libs/libvpx/vp8/encoder/onyx_if.c @@ -65,9 +65,7 @@ extern int vp8_update_coef_context(VP8_COMP *cpi); extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); -extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); -extern void print_tree_update_probs(); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); @@ -101,10 +99,6 @@ extern int skip_true_count; extern int skip_false_count; #endif -#ifdef VP8_ENTROPY_STATS -extern int intra_mode_stats[10][10][10]; -#endif - #ifdef SPEEDSTATS unsigned int frames_at_speed[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -224,6 +218,8 @@ static void save_layer_context(VP8_COMP *cpi) { lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; + lc->last_q[0] = cpi->last_q[0]; + lc->last_q[1] = cpi->last_q[1]; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, sizeof(cpi->mb.count_mb_ref_frame_usage)); @@ -261,6 +257,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; + cpi->last_q[0] = lc->last_q[0]; + cpi->last_q[1] = lc->last_q[1]; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, sizeof(cpi->mb.count_mb_ref_frame_usage)); @@ -689,8 +687,8 @@ static void set_default_lf_deltas(VP8_COMP *cpi) { /* Convenience macros for mapping speed and mode into a continuous * range */ -#define GOOD(x) (x + 1) -#define RT(x) (x + 7) +#define GOOD(x) ((x) + 1) +#define RT(x) ((x) + 7) static int speed_map(int speed, const int *map) { int res; @@ -743,9 +741,9 @@ static const int mode_check_freq_map_zn2[] = { 0, RT(10), 1 << 1, RT(11), 1 << 2, RT(12), 1 << 3, INT_MAX }; -static const int mode_check_freq_map_vhbpred[] = { - 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX -}; +static const int mode_check_freq_map_vhbpred[] = { 0, GOOD(5), 2, RT(0), + 0, RT(3), 2, RT(5), + 4, INT_MAX }; static const int mode_check_freq_map_near2[] = { 0, 
GOOD(5), 2, RT(0), 0, RT(3), 2, @@ -761,13 +759,13 @@ static const int mode_check_freq_map_new2[] = { 0, GOOD(5), 4, RT(0), 1 << 3, RT(11), 1 << 4, RT(12), 1 << 5, INT_MAX }; -static const int mode_check_freq_map_split1[] = { - 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX -}; +static const int mode_check_freq_map_split1[] = { 0, GOOD(2), 2, GOOD(3), + 7, RT(1), 2, RT(2), + 7, INT_MAX }; -static const int mode_check_freq_map_split2[] = { - 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX -}; +static const int mode_check_freq_map_split2[] = { 0, GOOD(1), 2, GOOD(2), + 4, GOOD(3), 15, RT(1), + 4, RT(2), 15, INT_MAX }; void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; @@ -1534,6 +1532,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { } } + cpi->ext_refresh_frame_flags_pending = 0; + cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; @@ -1893,10 +1893,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); -#ifdef VP8_ENTROPY_STATS - init_context_counters(); -#endif - /*Initialize the feed-forward activity masking.*/ cpi->activity_avg = 90 << 12; @@ -2005,10 +2001,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->mb.rd_thresh_mult[i] = 128; } -#ifdef VP8_ENTROPY_STATS - init_mv_ref_counts(); -#endif - #if CONFIG_MULTITHREAD if (vp8cx_create_encoder_threads(cpi)) { vp8_remove_compressor(&cpi); @@ -2106,8 +2098,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { return cpi; } -void vp8_remove_compressor(VP8_COMP **ptr) { - VP8_COMP *cpi = *ptr; +void vp8_remove_compressor(VP8_COMP **comp) { + VP8_COMP *cpi = *comp; if (!cpi) return; @@ -2120,12 +2112,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #endif -#ifdef VP8_ENTROPY_STATS - print_context_counters(); - print_tree_update_probs(); - print_mode_context(); -#endif - #if CONFIG_INTERNAL_STATS if (cpi->pass != 1) { @@ -2252,40 +2238,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { } #endif -#ifdef VP8_ENTROPY_STATS - { - int i, j, k; - FILE *fmode = fopen("modecontext.c", "w"); - - fprintf(fmode, "\n#include \"entropymode.h\"\n\n"); - fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts "); - fprintf(fmode, - "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n"); - - for (i = 0; i < 10; ++i) { - fprintf(fmode, " { /* Above Mode : %d */\n", i); - - for (j = 0; j < 10; ++j) { - fprintf(fmode, " {"); - - for (k = 0; k < 10; ++k) { - if (!intra_mode_stats[i][j][k]) - fprintf(fmode, " %5d, ", 1); - else - fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]); - } - - fprintf(fmode, "}, /* left_mode %d */\n", j); - } - - fprintf(fmode, " },\n"); - } - - fprintf(fmode, "};\n"); - fclose(fmode); - } -#endif - #if defined(SECTIONBITS_OUTPUT) if (0) { @@ -2326,7 +2278,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { vp8_remove_common(&cpi->common); vpx_free(cpi); - *ptr = 0; + *comp = 0; #ifdef OUTPUT_YUV_SRC fclose(yuv_file); @@ -2464,6 +2416,7 @@ int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) { if (ref_frame_flags & VP8_ALTR_FRAME) cpi->common.refresh_alt_ref_frame = 1; + cpi->ext_refresh_frame_flags_pending = 1; return 0; } @@ -2862,7 +2815,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) fclose(yframe); } #endif -/* return of 0 means drop frame */ #if !CONFIG_REALTIME_ONLY /* Function to test for conditions that indeicate we should loop @@ 
-3364,11 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; if (cpi->oxcf.mr_encoder_id) { - // TODO(marpan): This constraint shouldn't be needed, as we would like - // to allow for key frame setting (forced or periodic) defined per - // spatial layer. For now, keep this in. - cm->frame_type = low_res_frame_info->frame_type; - // Check if lower resolution is available for motion vector reuse. if (cm->frame_type != KEY_FRAME) { cpi->mr_low_res_mv_avail = 1; @@ -3393,7 +3340,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); */ } + // Disable motion vector reuse (i.e., disable any usage of the low_res) + // if the previous lower stream is skipped/disabled. + if (low_res_frame_info->skip_encoding_prev_stream) { + cpi->mr_low_res_mv_avail = 0; + } } + // This stream is not skipped (i.e., it's being encoded), so set this skip + // flag to 0. This is needed for the next stream (i.e., which is the next + // frame to be encoded). + low_res_frame_info->skip_encoding_prev_stream = 0; // On a key frame: For the lowest resolution, keep track of the key frame // counter value. For the higher resolutions, reset the current video @@ -3559,6 +3515,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cm->current_video_frame++; cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; @@ -3600,6 +3557,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif cm->current_video_frame++; cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; return; @@ -3799,7 +3757,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. - */ + */ if (cpi->cyclic_refresh_mode_enabled) { // Special case for screen_content_mode with golden frame updates. int disable_cr_gf = @@ -4001,6 +3959,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; + if (cm->frame_type != KEY_FRAME) + cpi->last_pred_err_mb = + (int)(cpi->mb.prediction_error / cpi->common.MBs); } cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); @@ -4283,6 +4244,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->common.current_video_frame++; cpi->frames_since_key++; cpi->drop_frame_count++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; return; @@ -4391,8 +4353,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* For inter frames the current default behavior is that when * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer * This is purely an encoder decision at present. + * Avoid this behavior when refresh flags are set by the user. 
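+   * (Concretely: vp8_update_reference() now sets
+   * ext_refresh_frame_flags_pending, and the extra term in the condition
+   * below keeps a user-requested golden-frame refresh from also triggering
+   * the implicit GF-to-ARF copy.)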
*/ - if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame) { + if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame && + !cpi->ext_refresh_frame_flags_pending) { cm->copy_buffer_to_arf = 2; } else { cm->copy_buffer_to_arf = 0; @@ -4699,6 +4663,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif + cpi->ext_refresh_frame_flags_pending = 0; + if (cm->refresh_golden_frame == 1) { cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; } else { @@ -4782,8 +4748,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->temporal_pattern_counter++; } -/* reset to normal state now that we are done. */ - #if 0 { char filename[512]; @@ -4866,14 +4830,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, cm = &cpi->common; - if (setjmp(cpi->common.error.jmp)) { - cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - - cpi->common.error.setjmp = 1; - vpx_usec_timer_start(&cmptimer); cpi->source = NULL; @@ -4999,10 +4955,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, // be received for that high layer, which will yield an incorrect // frame rate (from time-stamp adjustment in above calculation). if (cpi->oxcf.mr_encoder_id) { - cpi->ref_framerate = low_res_frame_info->low_res_framerate; + if (!low_res_frame_info->skip_encoding_base_stream) + cpi->ref_framerate = low_res_frame_info->low_res_framerate; } else { // Keep track of frame rate for lowest resolution. low_res_frame_info->low_res_framerate = cpi->ref_framerate; + // The base stream is being encoded so set skip flag to 0. + low_res_frame_info->skip_encoding_base_stream = 0; } } #endif diff --git a/libs/libvpx/vp8/encoder/onyx_int.h b/libs/libvpx/vp8/encoder/onyx_int.h index c489b46c2d..50a750da31 100644 --- a/libs/libvpx/vp8/encoder/onyx_int.h +++ b/libs/libvpx/vp8/encoder/onyx_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ONYX_INT_H_ -#define VP8_ENCODER_ONYX_INT_H_ +#ifndef VPX_VP8_ENCODER_ONYX_INT_H_ +#define VPX_VP8_ENCODER_ONYX_INT_H_ #include #include "vpx_config.h" @@ -57,6 +57,9 @@ extern "C" { #define VP8_TEMPORAL_ALT_REF !CONFIG_REALTIME_ONLY +/* vp8 uses 10,000,000 ticks/second as time stamp */ +#define TICKS_PER_SEC 10000000 + typedef struct { int kf_indicated; unsigned int frames_since_key; @@ -257,6 +260,7 @@ typedef struct { int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + int last_q[2]; } LAYER_CONTEXT; typedef struct VP8_COMP { @@ -510,6 +514,7 @@ typedef struct VP8_COMP { int force_maxqp; int frames_since_last_drop_overshoot; + int last_pred_err_mb; // GF update for 1 pass cbr. int gf_update_onepass_cbr; @@ -695,6 +700,8 @@ typedef struct VP8_COMP { // Use the static threshold from ROI settings. 
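   // (vpx_roi_map_t carries a per-segment static_threshold[] array; this
   // flag presumably tells the encoder to honor those values instead of the
   // single global threshold.)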
int use_roi_static_threshold; + + int ext_refresh_frame_flags_pending; } VP8_COMP; void vp8_initialize_enc(void); @@ -714,8 +721,8 @@ void vp8_set_speed_features(VP8_COMP *cpi); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ @@ -723,8 +730,8 @@ void vp8_set_speed_features(VP8_COMP *cpi); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) @@ -733,4 +740,4 @@ void vp8_set_speed_features(VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_ONYX_INT_H_ +#endif // VPX_VP8_ENCODER_ONYX_INT_H_ diff --git a/libs/libvpx/vp8/encoder/pickinter.c b/libs/libvpx/vp8/encoder/pickinter.c index a9943eb6ab..dc72eed88c 100644 --- a/libs/libvpx/vp8/encoder/pickinter.c +++ b/libs/libvpx/vp8/encoder/pickinter.c @@ -173,9 +173,8 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) { static int pick_intra4x4block(MACROBLOCK *x, int ib, B_PREDICTION_MODE *best_mode, - const int *mode_costs, - - int *bestrate, int *bestdistortion) { + const int *mode_costs, int *bestrate, + int *bestdistortion) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be = &x->block[ib]; int dst_stride = x->e_mbd.dst.y_stride; @@ -564,7 +563,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO best_mbmode; - int_mv best_ref_mv_sb[2]; + int_mv best_ref_mv_sb[2] = { { 0 }, { 0 } }; int_mv mode_mv_sb[2][MB_MODE_COUNT]; int_mv best_ref_mv; int_mv *mode_mv; @@ -602,7 +601,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* search range got from mv_pred(). It uses step_param levels. (0-7) */ int sr = 0; - unsigned char *plane[4][3]; + unsigned char *plane[4][3] = { { 0, 0 } }; int ref_frame_map[4]; int sign_bias = 0; int dot_artifact_candidate = 0; @@ -631,13 +630,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, } } #endif + assert(plane[LAST_FRAME][0] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_y, stride, plane[LAST_FRAME][0], mb_row, mb_col, 0); // If not found in Y channel, check UV channel. if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][1] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_u, stride_uv, plane[LAST_FRAME][1], mb_row, mb_col, 1); if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][2] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_v, stride_uv, plane[LAST_FRAME][2], mb_row, mb_col, 2); @@ -741,10 +743,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; /* If the frame has big static background and current MB is in low - * motion area, its mode decision is biased to ZEROMV mode. - * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). - * At such speed settings, ZEROMV is already heavily favored. - */ + * motion area, its mode decision is biased to ZEROMV mode. + * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). + * At such speed settings, ZEROMV is already heavily favored. 
+ */ if (cpi->Speed < 12) { calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); } @@ -1068,10 +1070,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); } + // fall through case NEARESTMV: case NEARMV: if (mode_mv[this_mode].as_int == 0) continue; + // fall through case ZEROMV: @@ -1301,9 +1305,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, update_mvcount(x, &best_ref_mv); } -void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { +void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16 = INT_MAX; - int rate, best_rate = 0, distortion, best_sse; + int rate_, best_rate = 0, distortion, best_sse; MB_PREDICTION_MODE mode, best_mode = DC_PRED; int this_rd; unsigned int sse; @@ -1321,23 +1325,23 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { xd->predictor, 16); distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor, 16, &sse); - rate = x->mbmode_cost[xd->frame_type][mode]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + rate_ = x->mbmode_cost[xd->frame_type][mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion); if (error16x16 > this_rd) { error16x16 = this_rd; best_mode = mode; best_sse = sse; - best_rate = rate; + best_rate = rate_; } } xd->mode_info_context->mbmi.mode = best_mode; - error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse); + error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse); if (error4x4 < error16x16) { xd->mode_info_context->mbmi.mode = B_PRED; - best_rate = rate; + best_rate = rate_; } - *rate_ = best_rate; + *rate = best_rate; } diff --git a/libs/libvpx/vp8/encoder/pickinter.h b/libs/libvpx/vp8/encoder/pickinter.h index bf1d0c9749..392fb41593 100644 --- a/libs/libvpx/vp8/encoder/pickinter.h +++ b/libs/libvpx/vp8/encoder/pickinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_PICKINTER_H_ -#define VP8_ENCODER_PICKINTER_H_ +#ifndef VPX_VP8_ENCODER_PICKINTER_H_ +#define VPX_VP8_ENCODER_PICKINTER_H_ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" @@ -30,4 +30,4 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb, } // extern "C" #endif -#endif // VP8_ENCODER_PICKINTER_H_ +#endif // VPX_VP8_ENCODER_PICKINTER_H_ diff --git a/libs/libvpx/vp8/encoder/picklpf.h b/libs/libvpx/vp8/encoder/picklpf.h index e6ad0dbf26..03597e5427 100644 --- a/libs/libvpx/vp8/encoder/picklpf.h +++ b/libs/libvpx/vp8/encoder/picklpf.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_PICKLPF_H_ -#define VP8_ENCODER_PICKLPF_H_ +#ifndef VPX_VP8_ENCODER_PICKLPF_H_ +#define VPX_VP8_ENCODER_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -27,4 +27,4 @@ void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi); } #endif -#endif // VP8_ENCODER_PICKLPF_H_ +#endif // VPX_VP8_ENCODER_PICKLPF_H_ diff --git a/libs/libvpx/vp8/encoder/quantize.h b/libs/libvpx/vp8/encoder/quantize.h index 267150f99f..78746c0c20 100644 --- a/libs/libvpx/vp8/encoder/quantize.h +++ b/libs/libvpx/vp8/encoder/quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_QUANTIZE_H_ -#define VP8_ENCODER_QUANTIZE_H_ +#ifndef VPX_VP8_ENCODER_QUANTIZE_H_ +#define VPX_VP8_ENCODER_QUANTIZE_H_ #ifdef __cplusplus extern "C" { @@ -31,4 +31,4 @@ extern void vp8cx_init_quantizer(struct VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_QUANTIZE_H_ +#endif // VPX_VP8_ENCODER_QUANTIZE_H_ diff --git a/libs/libvpx/vp8/encoder/ratectrl.c b/libs/libvpx/vp8/encoder/ratectrl.c index e58c310980..dbd76edad0 100644 --- a/libs/libvpx/vp8/encoder/ratectrl.c +++ b/libs/libvpx/vp8/encoder/ratectrl.c @@ -996,7 +996,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { * bits on this frame even if it is a contructed arf. * The active maximum quantizer insures that an appropriate * number of bits will be spent if needed for contstructed ARFs. - */ + */ cpi->this_frame_target = 0; } @@ -1052,9 +1052,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { * overflow when values are large */ projected_size_based_on_q = - (int)(((.5 + - rate_correction_factor * - vp8_bits_per_mb[cpi->common.frame_type][Q]) * + (int)(((.5 + rate_correction_factor * + vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); @@ -1126,6 +1125,14 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { } } +static int limit_q_cbr_inter(int last_q, int current_q) { + int limit_down = 12; + if (last_q - current_q > limit_down) + return (last_q - limit_down); + else + return current_q; +} + int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { int Q = cpi->active_worst_quality; @@ -1265,6 +1272,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { } } + // Limit decrease in Q for 1 pass CBR screen content mode. + if (cpi->common.frame_type != KEY_FRAME && cpi->pass == 0 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.screen_content_mode) + Q = limit_q_cbr_inter(cpi->last_q[1], Q); + return Q; } @@ -1465,7 +1478,7 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { (cpi->oxcf.screen_content_mode == 2 || (cpi->drop_frames_allowed && (force_drop_overshoot || - (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) && + (cpi->rate_correction_factor < (8.0f * MIN_BPB_FACTOR) && cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require @@ -1485,7 +1498,8 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) thresh_rate = thresh_rate >> 3; if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) || + pred_err_mb > thresh_pred_err_mb && + pred_err_mb > 2 * cpi->last_pred_err_mb) || force_drop_overshoot) { unsigned int i; double new_correction_factor; diff --git a/libs/libvpx/vp8/encoder/ratectrl.h b/libs/libvpx/vp8/encoder/ratectrl.h index 249de4e706..844c72cb86 100644 --- a/libs/libvpx/vp8/encoder/ratectrl.h +++ b/libs/libvpx/vp8/encoder/ratectrl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/
-#ifndef VP8_ENCODER_RATECTRL_H_
-#define VP8_ENCODER_RATECTRL_H_
+#ifndef VPX_VP8_ENCODER_RATECTRL_H_
+#define VPX_VP8_ENCODER_RATECTRL_H_
#include "onyx_int.h"
@@ -37,4 +37,4 @@ extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q);
} // extern "C"
#endif
-#endif // VP8_ENCODER_RATECTRL_H_
+#endif // VPX_VP8_ENCODER_RATECTRL_H_
diff --git a/libs/libvpx/vp8/encoder/rdopt.c b/libs/libvpx/vp8/encoder/rdopt.c
index e210b44105..79a858e437 100644
--- a/libs/libvpx/vp8/encoder/rdopt.c
+++ b/libs/libvpx/vp8/encoder/rdopt.c
@@ -770,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
vp8_quantize_mbuv(x);
rate_to = rd_cost_mbuv(x);
- this_rate = rate_to +
- x->intra_uv_mode_cost[xd->frame_type]
- [xd->mode_info_context->mbmi.uv_mode];
+ this_rate =
+ rate_to + x->intra_uv_mode_cost[xd->frame_type]
+ [xd->mode_info_context->mbmi.uv_mode];
this_distortion = vp8_mbuverror(x) / 4;
@@ -989,7 +989,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
br += rate;
for (i = 0; i < label_count; ++i) {
- int_mv mode_mv[B_MODE_COUNT];
+ int_mv mode_mv[B_MODE_COUNT] = { { 0 }, { 0 } };
int best_label_rd = INT_MAX;
B_PREDICTION_MODE mode_selected = ZERO4X4;
int bestlabelyrate = 0;
@@ -1767,7 +1767,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
/* search range got from mv_pred(). It uses step_param levels. (0-7) */
int sr = 0;
- unsigned char *plane[4][3];
+ unsigned char *plane[4][3] = { { 0, 0 } };
int ref_frame_map[4];
int sign_bias = 0;
@@ -1779,6 +1779,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
best_rd_sse = UINT_MAX;
#endif
+ // _uv variables are not set consistently before calling update_best_mode.
+ rd.rate_uv = 0;
+ rd.distortion_uv = 0;
+
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = 0;
best_mode.rd = INT_MAX;
@@ -1846,6 +1850,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
/* everything but intra */
if (x->e_mbd.mode_info_context->mbmi.ref_frame) {
+ assert(plane[this_ref_frame][0] != NULL &&
+ plane[this_ref_frame][1] != NULL &&
+ plane[this_ref_frame][2] != NULL);
x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
@@ -1940,6 +1947,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
rd.distortion2 += distortion;
if (tmp_rd < best_mode.yrd) {
+ assert(uv_intra_done);
rd.rate2 += uv_intra_rate;
rd.rate_uv = uv_intra_rate_tokenonly;
rd.distortion2 += uv_intra_distortion;
@@ -2000,6 +2008,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
rd.distortion2 += distortion;
rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type]
[x->e_mbd.mode_info_context->mbmi.mode];
+ assert(uv_intra_done);
rd.rate2 += uv_intra_rate;
rd.rate_uv = uv_intra_rate_tokenonly;
rd.distortion2 += uv_intra_distortion;
@@ -2131,6 +2140,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
}
+ // fall through
case NEARESTMV:
case NEARMV:
@@ -2147,6 +2157,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
(mode_mv[this_mode].as_int == 0)) {
continue;
}
+ // fall through
case ZEROMV:
@@ -2352,11 +2363,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
rd_update_mvcount(x, &best_ref_mv);
}
-void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
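The bare case labels in the NEWMV/NEARESTMV/ZEROMV switch above now carry explicit "// fall through" markers. GCC's -Wimplicit-fallthrough accepts such a comment as evidence that the missing break is deliberate; Clang generally wants an attribute instead. A reduced, self-contained illustration with hypothetical mode numbering:

#include <stdio.h>

static const char *cost_class(int mode) {
  switch (mode) {
    case 0: /* NEWMV-like: pay the motion-vector bit cost first */
      printf("adding mv bit cost\n");
      // fall through
    case 1: /* NEARESTMV/NEARMV-like: shared inter-mode handling */
      return "inter";
    default:
      return "intra";
  }
}

int main(void) {
  printf("%s\n", cost_class(0));
  return 0;
}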
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16; int rate4x4, rate16x16 = 0, rateuv; int dist4x4, dist16x16, distuv; - int rate; + int rate_; int rate4x4_tokenonly = 0; int rate16x16_tokenonly = 0; int rateuv_tokenonly = 0; @@ -2364,7 +2375,7 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv); - rate = rateuv; + rate_ = rateuv; error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly, &dist16x16); @@ -2374,10 +2385,10 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { if (error4x4 < error16x16) { x->e_mbd.mode_info_context->mbmi.mode = B_PRED; - rate += rate4x4; + rate_ += rate4x4; } else { - rate += rate16x16; + rate_ += rate16x16; } - *rate_ = rate; + *rate = rate_; } diff --git a/libs/libvpx/vp8/encoder/rdopt.h b/libs/libvpx/vp8/encoder/rdopt.h index 960bd8f1cd..cc3db8197c 100644 --- a/libs/libvpx/vp8/encoder/rdopt.h +++ b/libs/libvpx/vp8/encoder/rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_RDOPT_H_ -#define VP8_ENCODER_RDOPT_H_ +#ifndef VPX_VP8_ENCODER_RDOPT_H_ +#define VPX_VP8_ENCODER_RDOPT_H_ #include "./vpx_config.h" @@ -63,12 +63,12 @@ static INLINE void insertsortsad(int arr[], int idx[], int len) { } } -extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); -extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, - int *returnrate, int *returndistortion, - int *returnintra, int mb_row, int mb_col); -extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); +void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col); +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb, unsigned char *plane[3], @@ -110,9 +110,9 @@ static INLINE void get_reference_search_order(const VP8_COMP *cpi, for (; i < 4; ++i) ref_frame_map[i] = -1; } -extern void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, - int_mv *mvp, int refframe, int *ref_frame_sign_bias, - int *sr, int near_sadidx[]); +void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, + int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr, + int near_sadidx[]); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); int VP8_UVSSE(MACROBLOCK *x); @@ -123,4 +123,4 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); } // extern "C" #endif -#endif // VP8_ENCODER_RDOPT_H_ +#endif // VPX_VP8_ENCODER_RDOPT_H_ diff --git a/libs/libvpx/vp8/encoder/segmentation.h b/libs/libvpx/vp8/encoder/segmentation.h index 1395a34118..4ddbdbbd26 100644 --- a/libs/libvpx/vp8/encoder/segmentation.h +++ b/libs/libvpx/vp8/encoder/segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_SEGMENTATION_H_ -#define VP8_ENCODER_SEGMENTATION_H_ +#ifndef VPX_VP8_ENCODER_SEGMENTATION_H_ +#define VPX_VP8_ENCODER_SEGMENTATION_H_ #include "string.h" #include "vp8/common/blockd.h" @@ -26,4 +26,4 @@ extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, } // extern "C" #endif -#endif // VP8_ENCODER_SEGMENTATION_H_ +#endif // VPX_VP8_ENCODER_SEGMENTATION_H_ diff --git a/libs/libvpx/vp8/encoder/temporal_filter.c b/libs/libvpx/vp8/encoder/temporal_filter.c index 0a7d25fb0a..76f99a17d7 100644 --- a/libs/libvpx/vp8/encoder/temporal_filter.c +++ b/libs/libvpx/vp8/encoder/temporal_filter.c @@ -159,6 +159,7 @@ static int vp8_temporal_filter_find_matching_mb_c(VP8_COMP *cpi, bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv, step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], NULL, NULL, &best_ref_mv1); + (void)bestsme; // Ignore unused return value. #if ALT_REF_SUBPEL_ENABLED /* Try sub-pixel MC? */ diff --git a/libs/libvpx/vp8/encoder/temporal_filter.h b/libs/libvpx/vp8/encoder/temporal_filter.h index 865d909fb6..fd39f5cb87 100644 --- a/libs/libvpx/vp8/encoder/temporal_filter.h +++ b/libs/libvpx/vp8/encoder/temporal_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_ -#define VP8_ENCODER_TEMPORAL_FILTER_H_ +#ifndef VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { @@ -23,4 +23,4 @@ void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); } #endif -#endif // VP8_ENCODER_TEMPORAL_FILTER_H_ +#endif // VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/libs/libvpx/vp8/encoder/tokenize.c b/libs/libvpx/vp8/encoder/tokenize.c index ca5f0e3d89..c3d7026607 100644 --- a/libs/libvpx/vp8/encoder/tokenize.c +++ b/libs/libvpx/vp8/encoder/tokenize.c @@ -19,10 +19,6 @@ /* Global event counters used for accumulating statistics across several compressions, then generating context.c = initial stats. */ -#ifdef VP8_ENTROPY_STATS -_int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -#endif void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); void vp8_fix_contexts(MACROBLOCKD *x); @@ -383,72 +379,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) { tokenize1st_order_b(x, t, plane_type, cpi); } -#ifdef VP8_ENTROPY_STATS - -void init_context_counters(void) { - memset(context_counters, 0, sizeof(context_counters)); -} - -void print_context_counters() { - int type, band, pt, t; - - FILE *const f = fopen("context.c", "w"); - - fprintf(f, "#include \"entropy.h\"\n"); - - fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); - - fprintf(f, - "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] " - "[MAX_ENTROPY_TOKENS];\n\n"); - - fprintf(f, - "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] " - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {"); - -#define Comma(X) (X ? 
"," : "") - - type = 0; - - do { - fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); - - band = 0; - - do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - - pt = 0; - - do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - - do { - const _int64 x = context_counters[type][band][pt][t]; - const int y = (int)x; - - assert(x == (_int64)y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - - } while (++t < MAX_ENTROPY_TOKENS); - - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); - - fprintf(f, "\n }"); - - } while (++band < COEF_BANDS); - - fprintf(f, "\n }"); - } while (++type < BLOCK_TYPES); - - fprintf(f, "\n};\n"); - fclose(f); -} -#endif - static void stuff2nd_order_b(TOKENEXTRA **tp, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi, MACROBLOCK *x) { int pt; /* near block/prev token context index */ diff --git a/libs/libvpx/vp8/encoder/tokenize.h b/libs/libvpx/vp8/encoder/tokenize.h index e5dbdfc5af..47b5be17f1 100644 --- a/libs/libvpx/vp8/encoder/tokenize.h +++ b/libs/libvpx/vp8/encoder/tokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TOKENIZE_H_ -#define VP8_ENCODER_TOKENIZE_H_ +#ifndef VPX_VP8_ENCODER_TOKENIZE_H_ +#define VPX_VP8_ENCODER_TOKENIZE_H_ #include "vp8/common/entropy.h" #include "block.h" @@ -34,14 +34,6 @@ typedef struct { int rd_cost_mby(MACROBLOCKD *); -#ifdef VP8_ENTROPY_STATS -void init_context_counters(); -void print_context_counters(); - -extern _int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -#endif - extern const short *const vp8_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the @@ -53,4 +45,4 @@ extern const TOKENVALUE *const vp8_dct_value_tokens_ptr; } // extern "C" #endif -#endif // VP8_ENCODER_TOKENIZE_H_ +#endif // VPX_VP8_ENCODER_TOKENIZE_H_ diff --git a/libs/libvpx/vp8/encoder/treewriter.h b/libs/libvpx/vp8/encoder/treewriter.h index dadbbe3f80..c02683a58b 100644 --- a/libs/libvpx/vp8/encoder/treewriter.h +++ b/libs/libvpx/vp8/encoder/treewriter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TREEWRITER_H_ -#define VP8_ENCODER_TREEWRITER_H_ +#ifndef VPX_VP8_ENCODER_TREEWRITER_H_ +#define VPX_VP8_ENCODER_TREEWRITER_H_ /* Trees map alphabets into huffman-like codes suitable for an arithmetic bit coder. Timothy S Murphy 11 October 2004 */ @@ -56,8 +56,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], static void vp8_treed_write(vp8_writer *const w, vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ vp8_tree_index i = 0; do { @@ -73,8 +72,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t, } static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ int c = 0; vp8_tree_index i = 0; @@ -93,12 +91,12 @@ static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p, /* Fill array of costs for all possible token values. 
*/
-void vp8_cost_tokens(int *Costs, const vp8_prob *, vp8_tree);
+void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree);
-void vp8_cost_tokens2(int *Costs, const vp8_prob *, vp8_tree, int);
+void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // VP8_ENCODER_TREEWRITER_H_
+#endif // VPX_VP8_ENCODER_TREEWRITER_H_
diff --git a/libs/libvpx/vp8/encoder/x86/encodeopt.asm b/libs/libvpx/vp8/encoder/x86/block_error_sse2.asm
similarity index 100%
rename from libs/libvpx/vp8/encoder/x86/encodeopt.asm
rename to libs/libvpx/vp8/encoder/x86/block_error_sse2.asm
diff --git a/libs/libvpx/vp8/common/x86/copy_sse2.asm b/libs/libvpx/vp8/encoder/x86/copy_sse2.asm
similarity index 100%
rename from libs/libvpx/vp8/common/x86/copy_sse2.asm
rename to libs/libvpx/vp8/encoder/x86/copy_sse2.asm
diff --git a/libs/libvpx/vp8/common/x86/copy_sse3.asm b/libs/libvpx/vp8/encoder/x86/copy_sse3.asm
similarity index 100%
rename from libs/libvpx/vp8/common/x86/copy_sse3.asm
rename to libs/libvpx/vp8/encoder/x86/copy_sse3.asm
diff --git a/libs/libvpx/vp8/encoder/x86/quantize_sse4.c b/libs/libvpx/vp8/encoder/x86/quantize_sse4.c
index 6f2c163492..389c16705d 100644
--- a/libs/libvpx/vp8/encoder/x86/quantize_sse4.c
+++ b/libs/libvpx/vp8/encoder/x86/quantize_sse4.c
@@ -11,28 +11,29 @@
#include <smmintrin.h> /* SSE4.1 */
#include "./vp8_rtcd.h"
-#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+#include "vp8/encoder/block.h"
-#define SELECT_EOB(i, z, x, y, q) \
- do { \
- short boost = *zbin_boost_ptr; \
- short x_z = _mm_extract_epi16(x, z); \
- short y_z = _mm_extract_epi16(y, z); \
- int cmp = (x_z < boost) | (y_z == 0); \
- zbin_boost_ptr++; \
- if (cmp) break; \
- q = _mm_insert_epi16(q, y_z, z); \
- eob = i; \
- zbin_boost_ptr = b->zrun_zbin_boost; \
+#define SELECT_EOB(i, z, x, y, q) \
+ do { \
+ short boost = *zbin_boost_ptr; \
+ /* Technically _mm_extract_epi16() returns an int: */ \
+ /* https://bugs.llvm.org/show_bug.cgi?id=41657 */ \
+ short x_z = (short)_mm_extract_epi16(x, z); \
+ short y_z = (short)_mm_extract_epi16(y, z); \
+ int cmp = (x_z < boost) | (y_z == 0); \
+ zbin_boost_ptr++; \
+ if (cmp) break; \
+ q = _mm_insert_epi16(q, y_z, z); \
+ eob = i; \
+ zbin_boost_ptr = b->zrun_zbin_boost; \
} while (0)
void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
char eob = 0;
short *zbin_boost_ptr = b->zrun_zbin_boost;
- __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0,
- dqcoeff1;
+ __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
__m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
__m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
__m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
@@ -53,15 +54,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
- /* Sign of z: z >> 15 */
- sz0 = _mm_srai_epi16(z0, 15);
- sz1 = _mm_srai_epi16(z1, 15);
-
- /* x = abs(z): (z ^ sz) - sz */
- x0 = _mm_xor_si128(z0, sz0);
- x1 = _mm_xor_si128(z1, sz1);
- x0 = _mm_sub_epi16(x0, sz0);
- x1 = _mm_sub_epi16(x1, sz1);
+ /* x = abs(z) */
+ x0 = _mm_abs_epi16(z0);
+ x1 = _mm_abs_epi16(z1);
/* zbin[] + zbin_extra */
zbin0 = _mm_add_epi16(zbin0, zbin_extra);
@@ -89,11 +84,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
y0 = _mm_mulhi_epi16(y0, quant_shift0);
y1 = _mm_mulhi_epi16(y1, quant_shift1);
- /* Return the sign: (y ^ sz) - sz
*/ - y0 = _mm_xor_si128(y0, sz0); - y1 = _mm_xor_si128(y1, sz1); - y0 = _mm_sub_epi16(y0, sz0); - y1 = _mm_sub_epi16(y1, sz1); + /* Restore the sign. */ + y0 = _mm_sign_epi16(y0, z0); + y1 = _mm_sign_epi16(y1, z1); /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); diff --git a/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index d547450154..147c30cc35 100644 --- a/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c +++ b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -52,9 +52,9 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; - DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = { - 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 - }; + DECLARE_ALIGNED(16, const uint8_t, + pshufb_zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); /* sign of z: z >> 15 */ diff --git a/libs/libvpx/vp8/vp8_common.mk b/libs/libvpx/vp8/vp8_common.mk index 246fe6a677..3b442b1e4a 100644 --- a/libs/libvpx/vp8/vp8_common.mk +++ b/libs/libvpx/vp8/vp8_common.mk @@ -15,7 +15,6 @@ VP8_COMMON_SRCS-yes += common/onyxd.h VP8_COMMON_SRCS-yes += common/alloccommon.c VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h -VP8_COMMON_SRCS-yes += common/copy_c.c # VP8_COMMON_SRCS-yes += common/debugmodes.c VP8_COMMON_SRCS-yes += common/default_coef_probs.h VP8_COMMON_SRCS-yes += common/dequantize.c @@ -70,8 +69,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h VP8_COMMON_SRCS-yes += common/treecoder.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c @@ -82,14 +79,13 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) @@ -130,14 +126,13 @@ endif # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.h VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c 
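The quantize_sse4.c hunk above retires the shift/xor/subtract sign dance in favor of single SSSE3 instructions. A side-by-side sketch of the two forms, using illustrative helpers rather than patch code:

#include <tmmintrin.h> /* SSSE3 */

/* Old form: sz = z >> 15 yields 0 or -1 per lane, and (v ^ sz) - sz is
 * abs() on the way in and sign restoration on the way out. */
static __m128i resign_old(__m128i z, __m128i y_mag) {
  const __m128i sz = _mm_srai_epi16(z, 15);
  return _mm_sub_epi16(_mm_xor_si128(y_mag, sz), sz);
}

/* New form: one instruction each way. _mm_sign_epi16() also zeroes lanes
 * where z is zero, which is harmless because a zero coefficient
 * quantizes to zero anyway. */
static __m128i resign_new(__m128i z, __m128i y_mag) {
  return _mm_sign_epi16(y_mag, z);
}

On the input side, _mm_abs_epi16(z) replaces the xor/subtract pair the same way.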
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_loopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c diff --git a/libs/libvpx/vp8/vp8_cx_iface.c b/libs/libvpx/vp8/vp8_cx_iface.c index af6689fd97..eb04f67fa6 100644 --- a/libs/libvpx/vp8/vp8_cx_iface.c +++ b/libs/libvpx/vp8/vp8_cx_iface.c @@ -16,7 +16,9 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" +#include "vpx_util/vpx_timestamp.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -49,7 +51,7 @@ static struct vp8_extracfg default_extracfg = { #if !(CONFIG_REALTIME_ONLY) 0, /* cpu_used */ #else - 4, /* cpu_used */ + 4, /* cpu_used */ #endif 0, /* enable_auto_alt_ref */ 0, /* noise_sensitivity */ @@ -74,6 +76,9 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp8_extracfg vp8_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; VP8_CONFIG oxcf; struct VP8_COMP *cpi; unsigned char *cx_data; @@ -105,10 +110,10 @@ static vpx_codec_err_t update_error_state( return VPX_CODEC_INVALID_PARAM; \ } while (0) -#define RANGE_CHECK(p, memb, lo, hi) \ - do { \ - if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \ - ERROR(#memb " out of range [" #lo ".." #hi "]"); \ +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ @@ -126,6 +131,22 @@ static vpx_codec_err_t update_error_state( if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ } while (0) +#if defined(_MSC_VER) +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + char compile_time_assert[(boolexp) ? 1 : -1]; \ + (void)compile_time_assert; \ + } while (0) +#else /* !_MSC_VER */ +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + struct { \ + unsigned int compile_time_assert : (boolexp) ? 1 : -1; \ + } compile_time_assert; \ + (void)compile_time_assert; \ + } while (0) +#endif /* _MSC_VER */ + static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp8_extracfg *vp8_cfg, @@ -258,9 +279,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img) { switch (img->fmt) { case VPX_IMG_FMT_YV12: - case VPX_IMG_FMT_I420: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: break; + case VPX_IMG_FMT_I420: break; default: ERROR("Invalid image format. Only YV12 and I420 images are supported"); } @@ -484,6 +503,9 @@ static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp8_extracfg extra_cfg = ctx->vp8_cfg; extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + // Use fastest speed setting (speed 16 or -16) if it's set beyond the range. 
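RANGE_CHECK above now parenthesizes its lo and hi parameters, matching the CHECK_MEM_ERROR cleanup earlier in this patch. Unparenthesized macro parameters mis-expand as soon as an argument contains a lower-precedence operator; a two-macro demonstration:

#include <assert.h>

#define DOUBLE_BAD(x) (x * 2)
#define DOUBLE_GOOD(x) ((x) * 2)

int main(void) {
  assert(DOUBLE_BAD(1 + 2) == 5);  /* expands to (1 + 2 * 2) */
  assert(DOUBLE_GOOD(1 + 2) == 6); /* expands to ((1 + 2) * 2) */
  return 0;
}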
+ extra_cfg.cpu_used = VPXMIN(16, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-16, extra_cfg.cpu_used); return update_extracfg(ctx, &extra_cfg); } @@ -577,7 +599,7 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, void **mem_loc) { - vpx_codec_err_t res = 0; + vpx_codec_err_t res = VPX_CODEC_OK; #if CONFIG_MULTI_RES_ENCODING LOWER_RES_FRAME_INFO *shared_mem_loc; @@ -586,12 +608,13 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO)); if (!shared_mem_loc) { - res = VPX_CODEC_MEM_ERROR; + return VPX_CODEC_MEM_ERROR; } shared_mem_loc->mb_info = calloc(mb_rows * mb_cols, sizeof(LOWER_RES_MB_INFO)); if (!(shared_mem_loc->mb_info)) { + free(shared_mem_loc); res = VPX_CODEC_MEM_ERROR; } else { *mem_loc = (void *)shared_mem_loc; @@ -655,6 +678,12 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); if (!res) { + priv->pts_offset_initialized = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num; + priv->timestamp_ratio.num *= TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg); priv->cpi = vp8_create_compressor(&priv->oxcf); if (!priv->cpi) res = VPX_CODEC_MEM_ERROR; @@ -719,12 +748,14 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, new_qc = MODE_BESTQUALITY; if (deadline) { + /* Convert duration parameter from stream timebase to microseconds */ uint64_t duration_us; - /* Convert duration parameter from stream timebase to microseconds */ - duration_us = (uint64_t)duration * 1000000 * - (uint64_t)ctx->cfg.g_timebase.num / - (uint64_t)ctx->cfg.g_timebase.den; + COMPILE_TIME_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / + (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); /* If the deadline is more that the duration this frame is to be shown, * use good quality mode. Otherwise use realtime mode. @@ -798,16 +829,38 @@ static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, - vpx_enc_frame_flags_t flags, + vpx_enc_frame_flags_t enc_flags, unsigned long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res = VPX_CODEC_OK; + // Make a copy as volatile to avoid -Wclobbered with longjmp. 
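The volatile copies above exist because vp8e_encode() installs a setjmp() recovery point a few hunks below: automatic variables modified after setjmp() and read again after the matching longjmp() have indeterminate values unless they are volatile-qualified, which is what GCC's -Wclobbered warns about. A minimal standalone illustration:

#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

int main(void) {
  volatile int progress = 0; /* volatile: the value survives the longjmp */
  if (setjmp(env)) {
    printf("recovered at progress=%d\n", progress);
    return 0;
  }
  progress = 1;
  longjmp(env, 1); /* stands in for the codec's internal error path */
  return 1;        /* not reached */
}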
+ volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts_val = pts; - if (!ctx->cfg.rc_target_bitrate) return res; + if (!ctx->cfg.rc_target_bitrate) { +#if CONFIG_MULTI_RES_ENCODING + if (!ctx->cpi) return VPX_CODEC_ERROR; + if (ctx->cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info; + if (!low_res_frame_info) return VPX_CODEC_ERROR; + low_res_frame_info->skip_encoding_prev_stream = 1; + if (ctx->cpi->oxcf.mr_encoder_id == 0) + low_res_frame_info->skip_encoding_base_stream = 1; + } +#endif + return res; + } if (img) res = validate_img(ctx, img); if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts_val; + ctx->pts_offset_initialized = 1; + } + pts_val -= ctx->pts_offset; + pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -829,6 +882,12 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + return VPX_CODEC_CORRUPT_FRAME; + } + /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; @@ -851,11 +910,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Convert API flags to internal codec lib flags */ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - /* vp8 use 10,000,000 ticks/second as time stamp */ dst_time_stamp = - pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + dst_end_time_stamp = (pts_val + duration) * ctx->timestamp_ratio.num / + ctx->timestamp_ratio.den; if (img != NULL) { res = image2yuvconfig(img, &sd); @@ -875,6 +933,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, cx_data_end = ctx->cx_data + cx_data_sz; lib_flags = 0; + ctx->cpi->common.error.setjmp = 1; + while (cx_data_sz >= ctx->cx_data_sz / 2) { comp_data_state = vp8_get_compressed_data( ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, @@ -892,16 +952,21 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; /* Add the frame packet to the list of returned packets. */ - round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1; + round = (vpx_codec_pts_t)ctx->timestamp_ratio.num / 2; + if (round > 0) --round; delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000; + (dst_time_stamp * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num + + ctx->pts_offset; pkt.data.frame.duration = - (unsigned long)((delta * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000); + (unsigned long)((delta * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num); pkt.data.frame.flags = lib_flags << 16; + pkt.data.frame.width[0] = cpi->common.Width; + pkt.data.frame.height[0] = cpi->common.Height; + pkt.data.frame.spatial_layer_encoded[0] = 1; if (lib_flags & FRAMEFLAGS_KEY) { pkt.data.frame.flags |= VPX_FRAME_IS_KEY; @@ -916,9 +981,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, * Invisible frames have no duration. 
*/ pkt.data.frame.pts = - ((cpi->last_time_stamp_seen * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000) + - 1; + ((cpi->last_time_stamp_seen * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num) + + ctx->pts_offset + 1; pkt.data.frame.duration = 0; } @@ -1176,7 +1241,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, { - 0, /* g_usage */ + 0, /* g_usage (unused) */ 0, /* g_threads */ 0, /* g_profile */ @@ -1259,6 +1324,9 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = { vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ - vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem, + vp8e_set_config, + NULL, + vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/libs/libvpx/vp8/vp8_dx_iface.c b/libs/libvpx/vp8/vp8_dx_iface.c index f20283c1e1..f441ed46ff 100644 --- a/libs/libvpx/vp8/vp8_dx_iface.c +++ b/libs/libvpx/vp8/vp8_dx_iface.c @@ -38,13 +38,19 @@ typedef vpx_codec_stream_info_t vp8_stream_info_t; /* Structures for handling memory allocations */ typedef enum { VP8_SEG_ALG_PRIV = 256, VP8_SEG_MAX } mem_seg_id_t; -#define NELEMENTS(x) ((int)(sizeof(x) / sizeof(x[0]))) +#define NELEMENTS(x) ((int)(sizeof(x) / sizeof((x)[0]))) struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp8_stream_info_t si; int decoder_init; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. + int restart_threads; +#endif int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -200,9 +206,9 @@ static vpx_codec_err_t update_error_state( static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ img->fmt = VPX_IMG_FMT_I420; img->w = yv12->y_stride; img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; @@ -268,7 +274,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { volatile vpx_codec_err_t res; - unsigned int resolution_change = 0; + volatile unsigned int resolution_change = 0; unsigned int w, h; if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { @@ -298,6 +304,27 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; +#if CONFIG_MULTITHREAD + if (!res && ctx->restart_threads) { + struct frame_buffers *fb = &ctx->yv12_frame_buffers; + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; + if (setjmp(pbi->common.error.jmp)) { + vp8_remove_decoder_instances(fb); + vp8_zero(fb->pbi); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; + } + pbi->common.error.setjmp = 1; + pbi->max_threads = ctx->cfg.threads; + vp8_decoder_create_threads(pbi); + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_alloc_temp_buffers(pbi, pc->Width, pc->mb_rows); + } + 
ctx->restart_threads = 0; + pbi->common.error.setjmp = 0; + } +#endif /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { VP8D_CONFIG oxcf; @@ -335,8 +362,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res) { VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; if (resolution_change) { - VP8_COMMON *const pc = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; #if CONFIG_MULTITHREAD int i; @@ -428,9 +455,35 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pbi->common.fb_idx_ref_cnt[0] = 0; } + if (setjmp(pbi->common.error.jmp)) { + /* We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + pc->yv12_fb[pc->lst_fb_idx].corrupted = 1; + + if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) { + pc->fb_idx_ref_cnt[pc->new_fb_idx]--; + } + pc->error.setjmp = 0; +#if CONFIG_MULTITHREAD + if (pbi->restart_threads) { + ctx->si.w = 0; + ctx->si.h = 0; + ctx->restart_threads = 1; + } +#endif + res = update_error_state(ctx, &pbi->common.error); + return res; + } + + pbi->common.error.setjmp = 1; + /* update the pbi fragment data */ pbi->fragments = ctx->fragments; - +#if CONFIG_MULTITHREAD + pbi->restart_threads = 0; +#endif ctx->user_priv = user_priv; if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline)) { res = update_error_state(ctx, &pbi->common.error); diff --git a/libs/libvpx/vp8/vp8cx.mk b/libs/libvpx/vp8/vp8cx.mk index 0dac0169d5..3a8f8ea45a 100644 --- a/libs/libvpx/vp8/vp8cx.mk +++ b/libs/libvpx/vp8/vp8cx.mk @@ -23,6 +23,7 @@ VP8_CX_SRCS-yes += vp8_cx_iface.c VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h VP8_CX_SRCS-yes += encoder/bitstream.c VP8_CX_SRCS-yes += encoder/boolhuff.c +VP8_CX_SRCS-yes += encoder/copy_c.c VP8_CX_SRCS-yes += encoder/dct.c VP8_CX_SRCS-yes += encoder/encodeframe.c VP8_CX_SRCS-yes += encoder/encodeframe.h @@ -82,6 +83,8 @@ VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h endif +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse3.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c @@ -92,9 +95,9 @@ ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c endif +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/block_error_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c new file mode 100644 index 0000000000..219ff63cb8 --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Use macros to make sure argument lane is passed in as a constant integer.
+
+#define vmull_lane_s32_dual(in, c, lane, out) \
+ do { \
+ out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane); \
+ out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane); \
+ out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \
+ out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \
+ } while (0)
+
+#define vmlal_lane_s32_dual(in, c, lane, out) \
+ do { \
+ out[0].val[0] = \
+ vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \
+ out[0].val[1] = \
+ vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \
+ out[1].val[0] = \
+ vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \
+ out[1].val[1] = \
+ vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \
+ } while (0)
+
+#define vmlsl_lane_s32_dual(in, c, lane, out) \
+ do { \
+ out[0].val[0] = \
+ vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \
+ out[0].val[1] = \
+ vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \
+ out[1].val[0] = \
+ vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \
+ out[1].val[1] = \
+ vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \
+ } while (0)
+
+static INLINE int32x4x2_t
+highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) {
+ int32x4x2_t out;
+ out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS));
+ out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS),
+ vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS));
+ return out;
+}
+
+#define highbd_iadst_half_butterfly(in, c, lane, out) \
+ do { \
+ int64x2x2_t t[2]; \
+ vmull_lane_s32_dual(in, c, lane, t); \
+ out = highbd_dct_const_round_shift_low_8(t); \
+ } while (0)
+
+#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \
+ do { \
+ vmull_lane_s32_dual(in0, c, lane0, s0); \
+ vmull_lane_s32_dual(in0, c, lane1, s1); \
+ vmlal_lane_s32_dual(in1, c, lane1, s0); \
+ vmlsl_lane_s32_dual(in1, c, lane0, s1); \
+ } while (0)
+
+static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0,
+ const int32x4x2_t in1) {
+ int32x4x2_t out;
+ out.val[0] = vaddq_s32(in0.val[0], in1.val[0]);
+ out.val[1] = vaddq_s32(in0.val[1], in1.val[1]);
+ return out;
+}
+
+static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0,
+ const int64x2x2_t in1) {
+ int64x2x2_t out;
+ out.val[0] = vaddq_s64(in0.val[0], in1.val[0]);
+ out.val[1] = vaddq_s64(in0.val[1], in1.val[1]);
+ return out;
+}
+
+static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0,
+ const int32x4x2_t in1) {
+ int32x4x2_t out;
+ out.val[0] = vsubq_s32(in0.val[0], in1.val[0]);
+ out.val[1] = vsubq_s32(in0.val[1], in1.val[1]);
+ return out;
+}
+
+static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0,
+ const int64x2x2_t in1) {
+ int64x2x2_t out;
+ out.val[0] = vsubq_s64(in0.val[0], in1.val[0]);
+ out.val[1] = vsubq_s64(in0.val[1], in1.val[1]);
+ return out;
+}
+
+static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0,
+ const int32x2x2_t in1) {
+ int32x4x2_t out;
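+ /* As in the other _dual helpers above, the scalar operation (here
+ * vcombine_s32) is applied to both .val halves, so each call produces
+ * eight 32-bit lanes. */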
+ out.val[0] = vcombine_s32(in0.val[0], in1.val[0]); + out.val[1] = vcombine_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) { + int32x4x2_t out; + out.val[0] = vnegq_s32(in.val[0]); + out.val[1] = vnegq_s32(in.val[1]); + return out; +} + +static void highbd_iadst16_neon(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { + const int32x4_t c_1_31_5_27 = + create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int32x4_t c_9_23_13_19 = + create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int32x4_t c_17_15_21_11 = + create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int32x4_t c_25_7_29_3 = + create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int32x4_t c_4_28_20_12 = + create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int32x4_t c_16_n16_8_24 = + create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int32x4x2_t in[16], out[16]; + int32x4x2_t x[16], t[12]; + int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = 
vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1); + highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3); + highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5); + highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7); + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[12], x[13], vget_low_s32(c_25_7_29_3), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14, + s15); + + x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8); + x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9); + x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10); + x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13); + x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14); + x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15); + x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8); + x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9); + x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10); + x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11); + x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13, + s12); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[4]); + x[1] = vaddq_s32_dual(t[1], t[5]); + x[2] = vaddq_s32_dual(t[2], t[6]); + x[3] = vaddq_s32_dual(t[3], t[7]); + x[4] = vsubq_s32_dual(t[0], t[4]); + x[5] = vsubq_s32_dual(t[1], t[5]); + x[6] = vsubq_s32_dual(t[2], t[6]); + x[7] = vsubq_s32_dual(t[3], t[7]); + x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12); + x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13); + x[10] = highbd_add_dct_const_round_shift_low_8(s10, s14); + x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15); + x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13); + x[14] = 
highbd_sub_dct_const_round_shift_low_8(s10, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4, + s5); + highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7, + s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + highbd_iadst_butterfly(x[12], x[13], vget_high_s32(c_16_n16_8_24), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[2]); + x[1] = vaddq_s32_dual(t[1], t[3]); + x[2] = vsubq_s32_dual(t[0], t[2]); + x[3] = vsubq_s32_dual(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7); + x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6); + x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s32_dual(t[8], t[10]); + x[9] = vaddq_s32_dual(t[9], t[11]); + x[10] = vsubq_s32_dual(t[8], t[10]); + x[11] = vsubq_s32_dual(t[9], t[11]); + x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14); + x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15); + x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + { + const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]); + const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]); + const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]); + const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]); + const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]); + } + + out[0] = x[0]; + out[1] = vnegq_s32_dual(x[8]); + out[2] = x[12]; + out[3] = vnegq_s32_dual(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s32_dual(x[13]); + out[14] = x[9]; + out[15] = vnegq_s32_dual(x[1]); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, const int bd); + +typedef struct { + highbd_iht_1d cols, rows; // vertical and horizontal +} highbd_iht_2d; + +void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + if (bd == 8) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d 
}, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 1); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 1); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 1); // right 8 columns + } else { + static const highbd_iht_2d IHT_16[] = { + { vpx_highbd_idct16x16_256_add_half1d, + vpx_highbd_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { highbd_iadst16_neon, + vpx_highbd_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_highbd_idct16x16_256_add_half1d, + highbd_iadst16_neon }, // DCT_ADST = 2 + { highbd_iadst16_neon, highbd_iadst16_neon } // ADST_ADST = 3 + }; + const highbd_iht_2d ht = IHT_16[tx_type]; + int32_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, bd); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, bd); // left 8 columns + ht.cols(row_output + 8 * 16, NULL, dest + 8, stride, + bd); // right 8 columns + } +} diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c new file mode 100644 index 0000000000..52c4f1937d --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void highbd_iadst4(int32x4_t *const io) {
+  const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
+  const int32x4_t sinpi = vld1q_s32(sinpis);
+  int64x2x2_t s[7], t[4];
+  int32x4_t s7;
+
+  s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+  s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+  s7 = vsubq_s32(io[0], io[2]);
+  s7 = vaddq_s32(s7, io[3]);
+
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
+  s[3] = s[2];
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
+
+  t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+  t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+  t[2] = s[2];
+  t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+  t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+  t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+  t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+  io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+  io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+  io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+  io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
+}
+
+void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int tx_type, int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int16x8_t a[2];
+  int32x4_t c[4];
+
+  c[0] = vld1q_s32(input);
+  c[1] = vld1q_s32(input + 4);
+  c[2] = vld1q_s32(input + 8);
+  c[3] = vld1q_s32(input + 12);
+
+  if (bd == 8) {
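+    // Note (illustrative sketch): when bd == 8 every coefficient already
+    // fits in 16 bits, so the 32-bit lanes are narrowed with vmovn_s32 and
+    // the regular 8-bit-depth kernels are reused. Per lane the narrowing is
+    // just a truncating cast, roughly (a0/c0/c1 standing in for a[0], c[0]
+    // and c[1]):
+    //   for (i = 0; i < 4; i++) a0[i] = (int16_t)c0[i];      // vmovn_s32
+    //   for (i = 0; i < 4; i++) a0[i + 4] = (int16_t)c1[i];  // vcombine_s16
+    a[0] =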
vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max); +} diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c new file mode 100644 index 0000000000..2232c6841c --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x,
+                                                    const int32x2_t c) {
+  const int32x4_t sum = vaddq_s32(x[0], x[1]);
+  const int32x4_t sub = vsubq_s32(x[0], x[1]);
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
+  const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS);
+  const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS);
+  const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS);
+  const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS);
+
+  x[0] = vcombine_s32(out0_lo, out0_hi);
+  x[1] = vcombine_s32(out1_lo, out1_hi);
+}
+
+static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0,
+                                                        const int32x4_t in1,
+                                                        const int32x2_t c,
+                                                        int64x2_t *const s0,
+                                                        int64x2_t *const s1) {
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
+
+  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1);
+  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0);
+  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1);
+  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0);
+}
+
+static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0,
+                                                        const int32x4_t in1,
+                                                        const int32x2_t c,
+                                                        int64x2_t *const s0,
+                                                        int64x2_t *const s1) {
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
+
+  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0);
+  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1);
+  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0);
+  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1);
+}
+
+static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8(
+    const int64x2_t *const in0, const int64x2_t *const in1) {
+  const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]);
+  const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]);
+  const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS);
+  const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS);
+  return vcombine_s32(out_lo, out_hi);
+}
+
+static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8(
+    const int64x2_t *const in0, const int64x2_t *const in1) {
+  const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]);
+  const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]);
+  const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS);
+  const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS);
+  return vcombine_s32(out_lo, out_hi);
+}
+
+static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1,
+                                 int32x4_t *const io2, int32x4_t *const io3,
+                                 int32x4_t *const io4, int32x4_t *const io5,
+                                 int32x4_t *const io6, int32x4_t *const io7) {
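+  // Illustrative note: at 10/12-bit depths the coeff * cospi products can
+  // exceed 32 bits, so the butterflies above keep them in int64 and narrow
+  // with vrshrn_n_s64(v, DCT_CONST_BITS). In scalar form, with
+  // DCT_CONST_BITS == 14, that rounding shift is:
+  //   out = (int32_t)((v + (1 << 13)) >> 14);  // dct_const_round_shift()
+  const int32x4_t c0 =
+      create_s32x4_neon(cospi_2_64, cospi_30_64,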
cospi_10_64, cospi_22_64); + const int32x4_t c1 = + create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int32x4_t c2 = + create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int32x4_t x[8], t[4]; + int64x2_t s[8][2]; + + x[0] = *io7; + x[1] = *io0; + x[2] = *io5; + x[3] = *io2; + x[4] = *io3; + x[5] = *io4; + x[6] = *io1; + x[7] = *io6; + + // stage 1 + highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0], + s[1]); + highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2], + s[3]); + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4], + s[5]); + highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6], + s[7]); + + x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]); + x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]); + x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]); + x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]); + x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]); + x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4], + s[5]); + highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7], + s[6]); + + x[0] = vaddq_s32(t[0], t[2]); + x[1] = vaddq_s32(t[1], t[3]); + x[2] = vsubq_s32(t[0], t[2]); + x[3] = vsubq_s32(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]); + x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]); + + // stage 3 + highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2)); + highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2)); + + *io0 = x[0]; + *io1 = vnegq_s32(x[4]); + *io2 = x[6]; + *io3 = vnegq_s32(x[2]); + *io4 = x[3]; + *io5 = vnegq_s32(x[7]); + *io6 = x[5]; + *io7 = vnegq_s32(x[1]); +} + +void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); + + if (bd == 8) { + c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + switch (tx_type) { + case DCT_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, 
cospis1, c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + case ADST_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + + case DCT_ADST: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + default: { + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + } + + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + + case DCT_ADST: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + 
&a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + } + + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c new file mode 100644 index 0000000000..db72ff1161 --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output,
+                                   void *const dest, const int stride,
+                                   const int highbd_flag) {
+  int16x8_t in[16], out[16];
+  const int16x4_t c_1_31_5_27 =
+      create_s16x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64);
+  const int16x4_t c_9_23_13_19 =
+      create_s16x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64);
+  const int16x4_t c_17_15_21_11 =
+      create_s16x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64);
+  const int16x4_t c_25_7_29_3 =
+      create_s16x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64);
+  const int16x4_t c_4_28_20_12 =
+      create_s16x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64);
+  const int16x4_t c_16_n16_8_24 =
+      create_s16x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64);
+  int16x8_t x[16], t[12];
+  int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+  int32x4_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+
+  // Load input (16x8)
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[8] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[9] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[10] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[11] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[12] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[13] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[14] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[7] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[15] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 8;
+    in[8] = vld1q_s16(inputT);
+    inputT += 8;
+    in[1] = vld1q_s16(inputT);
+    inputT += 8;
+    in[9] = vld1q_s16(inputT);
+    inputT += 8;
+    in[2] = vld1q_s16(inputT);
+    inputT += 8;
+    in[10] = vld1q_s16(inputT);
+    inputT += 8;
+    in[3] = vld1q_s16(inputT);
+    inputT += 8;
+    in[11] = vld1q_s16(inputT);
+    inputT += 8;
+    in[4] = vld1q_s16(inputT);
+    inputT += 8;
+    in[12] = vld1q_s16(inputT);
+    inputT += 8;
+    in[5] = vld1q_s16(inputT);
+    inputT += 8;
+    in[13] = vld1q_s16(inputT);
+    inputT += 8;
+    in[6] = vld1q_s16(inputT);
+    inputT += 8;
+    in[14] = vld1q_s16(inputT);
+    inputT += 8;
+    in[7] = vld1q_s16(inputT);
+    inputT += 8;
+    in[15] = vld1q_s16(inputT);
+  }
+
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // stage 1
+  iadst_butterfly_lane_0_1_neon(x[0], x[1], c_1_31_5_27, s0, s1);
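+  // Illustrative note: each iadst_butterfly_lane_*_neon() call above and
+  // below is one iadst16 rotation, kept in 32 bits until the rounding shift.
+  // For the lane 0/1 pair just issued, the scalar equivalent is:
+  //   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+  //   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+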
iadst_butterfly_lane_2_3_neon(x[2], x[3], c_1_31_5_27, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c_9_23_13_19, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c_9_23_13_19, s6, s7); + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_17_15_21_11, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_17_15_21_11, s10, s11); + iadst_butterfly_lane_0_1_neon(x[12], x[13], c_25_7_29_3, s12, s13); + iadst_butterfly_lane_2_3_neon(x[14], x[15], c_25_7_29_3, s14, s15); + + x[0] = add_dct_const_round_shift_low_8(s0, s8); + x[1] = add_dct_const_round_shift_low_8(s1, s9); + x[2] = add_dct_const_round_shift_low_8(s2, s10); + x[3] = add_dct_const_round_shift_low_8(s3, s11); + x[4] = add_dct_const_round_shift_low_8(s4, s12); + x[5] = add_dct_const_round_shift_low_8(s5, s13); + x[6] = add_dct_const_round_shift_low_8(s6, s14); + x[7] = add_dct_const_round_shift_low_8(s7, s15); + x[8] = sub_dct_const_round_shift_low_8(s0, s8); + x[9] = sub_dct_const_round_shift_low_8(s1, s9); + x[10] = sub_dct_const_round_shift_low_8(s2, s10); + x[11] = sub_dct_const_round_shift_low_8(s3, s11); + x[12] = sub_dct_const_round_shift_low_8(s4, s12); + x[13] = sub_dct_const_round_shift_low_8(s5, s13); + x[14] = sub_dct_const_round_shift_low_8(s6, s14); + x[15] = sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_4_28_20_12, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_4_28_20_12, s10, s11); + iadst_butterfly_lane_1_0_neon(x[13], x[12], c_4_28_20_12, s13, s12); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_4_28_20_12, s15, s14); + + x[0] = vaddq_s16(t[0], t[4]); + x[1] = vaddq_s16(t[1], t[5]); + x[2] = vaddq_s16(t[2], t[6]); + x[3] = vaddq_s16(t[3], t[7]); + x[4] = vsubq_s16(t[0], t[4]); + x[5] = vsubq_s16(t[1], t[5]); + x[6] = vsubq_s16(t[2], t[6]); + x[7] = vsubq_s16(t[3], t[7]); + x[8] = add_dct_const_round_shift_low_8(s8, s12); + x[9] = add_dct_const_round_shift_low_8(s9, s13); + x[10] = add_dct_const_round_shift_low_8(s10, s14); + x[11] = add_dct_const_round_shift_low_8(s11, s15); + x[12] = sub_dct_const_round_shift_low_8(s8, s12); + x[13] = sub_dct_const_round_shift_low_8(s9, s13); + x[14] = sub_dct_const_round_shift_low_8(s10, s14); + x[15] = sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c_16_n16_8_24, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c_16_n16_8_24, s7, s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + iadst_butterfly_lane_2_3_neon(x[12], x[13], c_16_n16_8_24, s12, s13); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_16_n16_8_24, s15, s14); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s16(t[8], t[10]); + x[9] = vaddq_s16(t[9], t[11]); + x[10] = vsubq_s16(t[8], t[10]); + x[11] = vsubq_s16(t[9], t[11]); + x[12] = add_dct_const_round_shift_low_8(s12, s14); + x[13] = add_dct_const_round_shift_low_8(s13, s15); + x[14] = sub_dct_const_round_shift_low_8(s12, s14); + x[15] = sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + iadst_half_butterfly_neg_neon(&x[3], &x[2], 
c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24); + iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24); + + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 0); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 0); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 0); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 0); // right 8 columns +} diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index 025254c3f3..4f0a90f215 100644 --- a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,206 +14,63 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16(cospi_8_64); - *d1s16 = vdup_n_s16(cospi_16_64); - *d2s16 = vdup_n_s16(cospi_24_64); -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16(sinpi_1_9); - *d4s16 = vdup_n_s16(sinpi_2_9); - *q3s16 = vdupq_n_s16(sinpi_3_9); - *d5s16 = vdup_n_s16(sinpi_4_9); -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = 
vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vrshrn_n_s32(q13s32, 14); - d27s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d28s16 = vrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32(sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vrshrn_n_s32(q13s32, 14); - d17s16 = vrshrn_n_s32(q14s32, 14); - d18s16 = vrshrn_n_s32(q15s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); -} - void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; - d26u32 = d27u32 = vdup_n_u32(0); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - TRANSPOSE4X4(&q8s16, &q9s16); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); switch (tx_type) { - case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - case 2: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; - case 3: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += stride; - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); } diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 1c739861c3..46ee632e01 100644 --- 
a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,527 +14,55 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" -static int16_t cospi_2_64 = 16305; -static int16_t cospi_4_64 = 16069; -static int16_t cospi_6_64 = 15679; -static int16_t cospi_8_64 = 15137; -static int16_t cospi_10_64 = 14449; -static int16_t cospi_12_64 = 13623; -static int16_t cospi_14_64 = 12665; -static int16_t cospi_16_64 = 11585; -static int16_t cospi_18_64 = 10394; -static int16_t cospi_20_64 = 9102; -static int16_t cospi_22_64 = 7723; -static int16_t cospi_24_64 = 6270; -static int16_t cospi_26_64 = 4756; -static int16_t cospi_28_64 = 3196; -static int16_t cospi_30_64 = 1606; - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vrshrn_n_s32(q2s32, 14); - d15s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, 
d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vrshrn_n_s32(q2s32, 14); - d19s16 = vrshrn_n_s32(q3s32, 14); - d22s16 = vrshrn_n_s32(q13s32, 14); - d23s16 = vrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vrshrn_n_s32(q2s32, 14); - d27s16 = vrshrn_n_s32(q3s32, 14); - d30s16 = vrshrn_n_s32(q8s32, 14); - d31s16 = vrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vrshrn_n_s32(q9s32, 14); - d11s16 = vrshrn_n_s32(q10s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); -} - -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = 
vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16(cospi_2_64); - d15s16 = vdup_n_s16(cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16(cospi_18_64); - d31s16 = vdup_n_s16(cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vrshrn_n_s32(q11s32, 14); - d23s16 = vrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vrshrn_n_s32(q1s32, 14); - d3s16 = vrshrn_n_s32(q2s32, 14); - d24s16 = vrshrn_n_s32(q12s32, 14); - d25s16 = vrshrn_n_s32(q15s32, 14); - d6s16 = vrshrn_n_s32(q3s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16(cospi_10_64); - d1s16 = vdup_n_s16(cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16(cospi_26_64); - d31s16 = vdup_n_s16(cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - d18s16 = vrshrn_n_s32(q9s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - d8s16 = vrshrn_n_s32(q4s32, 14); - d9s16 = vrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = 
vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vrshrn_n_s32(q14s32, 14); - d19s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vrshrn_n_s32(q1s32, 14); - d29s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q7s32, 14); - d15s16 = vrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16(cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); - - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q3s32, 14); - d24s16 = vrshrn_n_s32(q13s32, 14); - d25s16 = vrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); - - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, d30s16); - - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - - d20s16 = vrshrn_n_s32(q13s32, 14); - d21s16 = vrshrn_n_s32(q1s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q5s16 = vdupq_n_s16(0); - - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); -} - void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input 
+ 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); switch (tx_type) { - case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - case 2: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; - case 3: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += stride; - } + idct8x8_add8x8_neon(a, dest, stride); } diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h b/libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 0000000000..c64822e27c --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. 
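+ // The vmull_lane multiplies below widen each product to 32 bits, so the + // sum and difference are taken on int32x4_t lanes and only narrowed back + // to 16 bits by the rounding shift inside dct_const_round_shift_low_8().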
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} + +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} + +static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); +} + +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + 
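// s0 now holds in0 * c[3] + in1 * c[2]; the multiply-subtracts below form + // the other half of the rotation, s1 = in0 * c[2] - in1 * c[3]. +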
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} + +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = vnegq_s16(x[1]); +} + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef void (*iht_1d)(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef struct { + iht_1d cols, rows; // vertical and horizontal +} iht_2d; + +#endif // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c index 3e3530116d..c031322806 100644 --- a/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c +++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c 
index 786fbdb794..aaccd5ca7b 100644 --- a/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c +++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c index e4166775da..76d15ff8c0 100644 --- a/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c +++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c b/libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c new file mode 100644 index 0000000000..e861596ad4 --- /dev/null +++ b/libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +#include "vp9/common/vp9_enums.h" + +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + case ADST_DCT: + vpx_idct4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + } + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[8], out[8]; + + // load input data + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + in[2] = load_tran_low(2 * 8 * sizeof(*input), input); + in[3] = load_tran_low(3 * 8 * sizeof(*input), input); + in[4] = load_tran_low(4 * 8 * sizeof(*input), input); + in[5] = load_tran_low(5 * 8 * sizeof(*input), input); + in[6] = load_tran_low(6 * 8 * sizeof(*input), input); + in[7] = load_tran_low(7 * 8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + case ADST_DCT: + vpx_idct8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + } + + vpx_round_store8x8_vsx(in, dest, stride); +} + +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + int16x8_t in0[16], in1[16]; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), in1); + + switch (tx_type) { + case DCT_DCT: + vpx_idct16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + case ADST_DCT: + 
vpx_idct16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + case DCT_ADST: + vpx_iadst16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + vpx_iadst16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + } + + vpx_round_store16x16_vsx(in0, in1, dest, stride); +} diff --git a/libs/libvpx/vp9/common/vp9_alloccommon.h b/libs/libvpx/vp9/common/vp9_alloccommon.h index a3a1638572..8900038ead 100644 --- a/libs/libvpx/vp9/common/vp9_alloccommon.h +++ b/libs/libvpx/vp9/common/vp9_alloccommon.h @@ -8,10 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define VP9_COMMON_VP9_ALLOCCOMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define INVALID_IDX -1 // Invalid buffer index. +#define INVALID_IDX (-1) // Invalid buffer index. #ifdef __cplusplus extern "C" { @@ -41,4 +41,4 @@ void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ +#endif // VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_blockd.h b/libs/libvpx/vp9/common/vp9_blockd.h index 780b29208b..2ddc0f121c 100644 --- a/libs/libvpx/vp9/common/vp9_blockd.h +++ b/libs/libvpx/vp9/common/vp9_blockd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_BLOCKD_H_ -#define VP9_COMMON_VP9_BLOCKD_H_ +#ifndef VPX_VP9_COMMON_VP9_BLOCKD_H_ +#define VPX_VP9_COMMON_VP9_BLOCKD_H_ #include "./vpx_config.h" @@ -54,12 +54,13 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -#define NONE -1 +#define NONE (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 #define ALTREF_FRAME 3 #define MAX_REF_FRAMES 4 + typedef int8_t MV_REFERENCE_FRAME; // This structure now relates to 8x8 block regions. 
@@ -130,6 +131,8 @@ struct macroblockd_plane { // encoder const int16_t *dequant; + + int *eob; }; #define BLOCK_OFFSET(x, i) ((x) + (i)*16) @@ -173,7 +176,7 @@ typedef struct macroblockd { FRAME_CONTEXT *fc; /* pointers to reference frames */ - RefBuffer *block_refs[2]; + const RefBuffer *block_refs[2]; /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; @@ -193,6 +196,8 @@ typedef struct macroblockd { int corrupted; struct vpx_internal_error_info *error_info; + + PARTITION_TYPE *partition; } MACROBLOCKD; static INLINE PLANE_TYPE get_plane_type(int plane) { @@ -281,8 +286,30 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff); +#if CONFIG_MISMATCH_DEBUG +#define TX_UNIT_SIZE_LOG2 2 +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col << MI_SIZE_LOG2) >> subsampling_x) + + (tx_blk_col << TX_UNIT_SIZE_LOG2); + *pixel_r = ((mi_row << MI_SIZE_LOG2) >> subsampling_y) + + (tx_blk_row << TX_UNIT_SIZE_LOG2); +} + +static INLINE int get_block_width(BLOCK_SIZE bsize) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + return 4 * num_4x4_w; +} + +static INLINE int get_block_height(BLOCK_SIZE bsize) { + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + return 4 * num_4x4_h; +} +#endif + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_BLOCKD_H_ +#endif // VPX_VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libs/libvpx/vp9/common/vp9_common.h b/libs/libvpx/vp9/common/vp9_common.h index 666c3beaf0..e3c5535ddb 100644 --- a/libs/libvpx/vp9/common/vp9_common.h +++ b/libs/libvpx/vp9/common/vp9_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_COMMON_H_ -#define VP9_COMMON_VP9_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_H_ +#define VPX_VP9_COMMON_VP9_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ @@ -33,14 +33,14 @@ extern "C" { } // Use this for variably-sized arrays. -#define vp9_copy_array(dest, src, n) \ - { \ - assert(sizeof(*dest) == sizeof(*src)); \ - memcpy(dest, src, n * sizeof(*src)); \ +#define vp9_copy_array(dest, src, n) \ + { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, (n) * sizeof(*(src))); \ } #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest)) -#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest)) +#define vp9_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest))) static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; @@ -49,8 +49,8 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ @@ -58,8 +58,8 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #else #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) @@ -75,4 +75,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_common_data.c b/libs/libvpx/vp9/common/vp9_common_data.c index 4a10833229..809d7317ce 100644 --- a/libs/libvpx/vp9/common/vp9_common_data.c +++ b/libs/libvpx/vp9/common/vp9_common_data.c @@ -28,7 +28,7 @@ const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 2, 2, const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8 }; -// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize))) +// VPXMIN(3, VPXMIN(b_width_log2_lookup(bsize), b_height_log2_lookup(bsize))) const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; diff --git a/libs/libvpx/vp9/common/vp9_common_data.h b/libs/libvpx/vp9/common/vp9_common_data.h index 5c6a7e8ff3..a533c5f058 100644 --- a/libs/libvpx/vp9/common/vp9_common_data.h +++ b/libs/libvpx/vp9/common/vp9_common_data.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ -#define VP9_COMMON_VP9_COMMON_DATA_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_DATA_H_ +#define VPX_VP9_COMMON_VP9_COMMON_DATA_H_ #include "vp9/common/vp9_enums.h" #include "vpx/vpx_integer.h" @@ -42,4 +42,4 @@ extern const uint8_t need_top_left[INTRA_MODES]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_DATA_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/libs/libvpx/vp9/common/vp9_entropy.c b/libs/libvpx/vp9/common/vp9_entropy.c index a575bda729..430b917b8f 100644 --- a/libs/libvpx/vp9/common/vp9_entropy.c +++ b/libs/libvpx/vp9/common/vp9_entropy.c @@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, 177, 153, 140, 133, 130, 129 }; #endif +/* clang-format off */ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +/* clang-format on */ const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, diff --git a/libs/libvpx/vp9/common/vp9_entropy.h b/libs/libvpx/vp9/common/vp9_entropy.h index 1da4911668..d026651df7 100644 --- a/libs/libvpx/vp9/common/vp9_entropy.h +++ b/libs/libvpx/vp9/common/vp9_entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENTROPY_H_ -#define VP9_COMMON_VP9_ENTROPY_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPY_H_ +#define VPX_VP9_COMMON_VP9_ENTROPY_H_ #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" @@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly - #define COEFF_PROB_MODELS 255 #define UNCONSTRAINED_NODES 3 @@ -195,4 +194,4 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPY_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libs/libvpx/vp9/common/vp9_entropymode.c b/libs/libvpx/vp9/common/vp9_entropymode.c index 47cd63e94f..bda824de3c 100644 --- a/libs/libvpx/vp9/common/vp9_entropymode.c +++ b/libs/libvpx/vp9/common/vp9_entropymode.c @@ -179,29 +179,32 @@ static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = - { - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - }; +const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split + }; static const vpx_prob default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { @@ -260,13 +263,13 @@ const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { - 9, 102, 187, 225 -}; +static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, + 187, + 225 }; -static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { - 239, 183, 119, 96, 41 -}; +static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { 239, 183, + 119, 96, + 41 }; static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = { 50, 126, 123, 221, 226 }; @@ -331,8 +334,8 @@ static void init_mode_probs(FRAME_CONTEXT *fc) { vp9_copy(fc->inter_mode_probs, 
default_inter_mode_probs); } -const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] = - { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; +const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE( + SWITCHABLE_FILTERS)] = { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; diff --git a/libs/libvpx/vp9/common/vp9_entropymode.h b/libs/libvpx/vp9/common/vp9_entropymode.h index 0ee663fe88..a756c8d0b8 100644 --- a/libs/libvpx/vp9/common/vp9_entropymode.h +++ b/libs/libvpx/vp9/common/vp9_entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ -#define VP9_COMMON_VP9_ENTROPYMODE_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" @@ -104,4 +104,4 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMODE_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libs/libvpx/vp9/common/vp9_entropymv.c b/libs/libvpx/vp9/common/vp9_entropymv.c index a18a290cfd..b6f052d088 100644 --- a/libs/libvpx/vp9/common/vp9_entropymv.c +++ b/libs/libvpx/vp9/common/vp9_entropymv.c @@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; diff --git a/libs/libvpx/vp9/common/vp9_entropymv.h b/libs/libvpx/vp9/common/vp9_entropymv.h index e2fe37a327..ee9d37973f 100644 --- a/libs/libvpx/vp9/common/vp9_entropymv.h +++ b/libs/libvpx/vp9/common/vp9_entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPYMV_H_ -#define VP9_COMMON_VP9_ENTROPYMV_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMV_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMV_H_ #include "./vpx_config.h" @@ -25,7 +25,7 @@ struct VP9Common; void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp); static INLINE int use_mv_hp(const MV *ref) { const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv @@ -127,10 +127,10 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMV_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libs/libvpx/vp9/common/vp9_enums.h b/libs/libvpx/vp9/common/vp9_enums.h index 056b298b3d..b33a3a2978 100644 --- a/libs/libvpx/vp9/common/vp9_enums.h +++ b/libs/libvpx/vp9/common/vp9_enums.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENUMS_H_ -#define VP9_COMMON_VP9_ENUMS_H_ +#ifndef VPX_VP9_COMMON_VP9_ENUMS_H_ +#define VPX_VP9_COMMON_VP9_ENUMS_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE { MAX_PROFILES } BITSTREAM_PROFILE; +typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG; + #define BLOCK_4X4 0 #define BLOCK_4X8 1 #define BLOCK_8X4 2 @@ -140,4 +142,4 @@ typedef uint8_t PREDICTION_MODE; } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENUMS_H_ +#endif // VPX_VP9_COMMON_VP9_ENUMS_H_ diff --git a/libs/libvpx/vp9/common/vp9_filter.c b/libs/libvpx/vp9/common/vp9_filter.c index 6c43af8ce8..adbda6c825 100644 --- a/libs/libvpx/vp9/common/vp9_filter.c +++ b/libs/libvpx/vp9/common/vp9_filter.c @@ -63,6 +63,20 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 } }; -const InterpKernel *vp9_filter_kernels[4] = { - sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters +// 4-tap filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 114, 28, -6, 0, 0 }, + { 0, 0, -10, 108, 36, -6, 0, 0 }, { 0, 0, -12, 102, 46, -8, 0, 0 }, + { 0, 0, -12, 94, 56, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 56, 94, -12, 0, 0 }, { 0, 0, -8, 46, 102, -12, 0, 0 }, + { 0, 0, -6, 36, 108, -10, 0, 0 }, { 0, 0, -6, 28, 114, -8, 0, 0 }, + { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; + +const InterpKernel *vp9_filter_kernels[5] = { + sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters, + sub_pel_filters_4 }; diff --git a/libs/libvpx/vp9/common/vp9_filter.h b/libs/libvpx/vp9/common/vp9_filter.h index 9d2b8e1dbf..0382c88e7c 100644 --- a/libs/libvpx/vp9/common/vp9_filter.h +++ b/libs/libvpx/vp9/common/vp9_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_FILTER_H_ -#define VP9_COMMON_VP9_FILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_FILTER_H_ +#define VPX_VP9_COMMON_VP9_FILTER_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,6 +25,7 @@ extern "C" { #define EIGHTTAP_SHARP 2 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ #define BILINEAR 3 +#define FOURTAP 4 // The codec can operate in four possible inter prediction filter mode: // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) @@ -32,10 +33,10 @@ extern "C" { typedef uint8_t INTERP_FILTER; -extern const InterpKernel *vp9_filter_kernels[4]; +extern const InterpKernel *vp9_filter_kernels[5]; #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_FILTER_H_ +#endif // VPX_VP9_COMMON_VP9_FILTER_H_ diff --git a/libs/libvpx/vp9/common/vp9_frame_buffers.h b/libs/libvpx/vp9/common/vp9_frame_buffers.h index e2cfe61b66..11be838c02 100644 --- a/libs/libvpx/vp9/common/vp9_frame_buffers.h +++ b/libs/libvpx/vp9/common/vp9_frame_buffers.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ -#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#ifndef VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ #include "vpx/vpx_frame_buffer.h" #include "vpx/vpx_integer.h" @@ -50,4 +50,4 @@ int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); } // extern "C" #endif -#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#endif // VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/libs/libvpx/vp9/common/vp9_idct.h b/libs/libvpx/vp9/common/vp9_idct.h index 3e83b8402d..94eeaf599e 100644 --- a/libs/libvpx/vp9/common/vp9_idct.h +++ b/libs/libvpx/vp9/common/vp9_idct.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_IDCT_H_ -#define VP9_COMMON_VP9_IDCT_H_ +#ifndef VPX_VP9_COMMON_VP9_IDCT_H_ +#define VPX_VP9_COMMON_VP9_IDCT_H_ #include <assert.h> @@ -78,4 +78,4 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, } // extern "C" #endif -#endif // VP9_COMMON_VP9_IDCT_H_ +#endif // VPX_VP9_COMMON_VP9_IDCT_H_ diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.c b/libs/libvpx/vp9/common/vp9_loopfilter.c index c7c343aed5..95d6029f3b 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.c +++ b/libs/libvpx/vp9/common/vp9_loopfilter.c @@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi, const int mode_info_stride, + MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi; - MODE_INFO **mip2 = mi; + MODE_INFO **mip = mi8x8; + MODE_INFO **mip2 = mi8x8; // These are offsets to the next mi in the 64x64 block. It is what gets // added to the mi ptr as we go through each loop. It helps us to avoid @@ -1087,13 +1087,19 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; - unsigned int mask_16x16[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_8x8[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4_int[MI_BLOCK_SIZE] = { 0 }; + unsigned int mask_16x16[MI_BLOCK_SIZE]; + unsigned int mask_8x8[MI_BLOCK_SIZE]; + unsigned int mask_4x4[MI_BLOCK_SIZE]; + unsigned int mask_4x4_int[MI_BLOCK_SIZE]; uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; int r, c; + vp9_zero(mask_16x16); + vp9_zero(mask_8x8); + vp9_zero(mask_4x4); + vp9_zero(mask_4x4_int); + vp9_zero(lfl); + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; unsigned int mask_8x8_c = 0; @@ -1174,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0); + border_mask = ~(mi_col == 0 ? 
1 : 0); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( @@ -1330,6 +1336,8 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; uint16_t mask_4x4_int = lfm->int_4x4_uv; + vp9_zero(lfl_uv); + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); // Vertical pass: do 2 rows at one time diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.h b/libs/libvpx/vp9/common/vp9_loopfilter.h index 481a6cdc63..39648a72c3 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.h +++ b/libs/libvpx/vp9/common/vp9_loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_LOOPFILTER_H_ -#define VP9_COMMON_VP9_LOOPFILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_LOOPFILTER_H_ +#define VPX_VP9_COMMON_VP9_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "./vpx_config.h" @@ -97,7 +97,7 @@ struct VP9LfSyncData; // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(struct VP9Common *const cm, const int mi_row, - const int mi_col, MODE_INFO **mi_8x8, + const int mi_col, MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm); void vp9_filter_block_plane_ss00(struct VP9Common *const cm, @@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm); void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - struct macroblockd *mbd, int filter_level, + struct macroblockd *xd, int frame_filter_level, int y_only, int partial_frame); // Get the superblock lfm for a given mi_row, mi_col. @@ -157,4 +157,4 @@ int vp9_loop_filter_worker(void *arg1, void *unused); } // extern "C" #endif -#endif // VP9_COMMON_VP9_LOOPFILTER_H_ +#endif // VPX_VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libs/libvpx/vp9/common/vp9_mfqe.h b/libs/libvpx/vp9/common/vp9_mfqe.h index dfff8c23d6..f53e1c2f9d 100644 --- a/libs/libvpx/vp9/common/vp9_mfqe.h +++ b/libs/libvpx/vp9/common/vp9_mfqe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MFQE_H_ -#define VP9_COMMON_VP9_MFQE_H_ +#ifndef VPX_VP9_COMMON_VP9_MFQE_H_ +#define VPX_VP9_COMMON_VP9_MFQE_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ void vp9_mfqe(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_MFQE_H_ +#endif // VPX_VP9_COMMON_VP9_MFQE_H_ diff --git a/libs/libvpx/vp9/common/vp9_mv.h b/libs/libvpx/vp9/common/vp9_mv.h index 4c8eac7213..14dde7dd05 100644 --- a/libs/libvpx/vp9/common/vp9_mv.h +++ b/libs/libvpx/vp9/common/vp9_mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MV_H_ -#define VP9_COMMON_VP9_MV_H_ +#ifndef VPX_VP9_COMMON_VP9_MV_H_ +#define VPX_VP9_COMMON_VP9_MV_H_ #include "vpx/vpx_integer.h" @@ -52,4 +52,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MV_H_ +#endif // VPX_VP9_COMMON_VP9_MV_H_ diff --git a/libs/libvpx/vp9/common/vp9_mvref_common.h b/libs/libvpx/vp9/common/vp9_mvref_common.h index 2b2c1ba9ee..5db6772dca 100644 --- a/libs/libvpx/vp9/common/vp9_mvref_common.h +++ b/libs/libvpx/vp9/common/vp9_mvref_common.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" @@ -263,10 +263,10 @@ static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref, mv_ref_list, Done) \ do { \ if (is_inter_block(mbmi)) { \ - if ((mbmi)->ref_frame[0] != ref_frame) \ + if ((mbmi)->ref_frame[0] != (ref_frame)) \ ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ refmv_count, mv_ref_list, Done); \ - if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame && \ + if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != (ref_frame) && \ (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ refmv_count, mv_ref_list, Done); \ @@ -320,4 +320,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MVREF_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_onyxc_int.h b/libs/libvpx/vp9/common/vp9_onyxc_int.h index 1d96d92c24..662b8ef5e1 100644 --- a/libs/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libs/libvpx/vp9/common/vp9_onyxc_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ONYXC_INT_H_ -#define VP9_COMMON_VP9_ONYXC_INT_H_ +#ifndef VPX_VP9_COMMON_VP9_ONYXC_INT_H_ +#define VPX_VP9_COMMON_VP9_ONYXC_INT_H_ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -37,10 +37,9 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 1 scratch frame for the new frame, 3 for scaled references on the encoder. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 4) +// 1 scratch frame for the new frame, REFS_PER_FRAME for scaled references on +// the encoder. 
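+// With REF_FRAMES = 8 and REFS_PER_FRAME = 3 this works out to 12 buffers: +// 8 reference slots + 1 scratch for the new frame + 3 scaled references.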
+#define FRAME_BUFFERS (REF_FRAMES + 1 + REFS_PER_FRAME) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -70,6 +69,7 @@ typedef struct { int mi_rows; int mi_cols; uint8_t released; + int frame_index; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; @@ -128,6 +128,8 @@ typedef struct VP9Common { int new_fb_idx; + int cur_show_frame_fb_idx; + #if CONFIG_VP9_POSTPROC YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG post_proc_buffer_int; @@ -256,8 +258,16 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; +static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= FRAME_BUFFERS) return NULL; + if (cm->error.error_code != VPX_CODEC_OK) return NULL; + return &cm->buffer_pool->frame_bufs[index].buf; +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -405,4 +415,4 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ONYXC_INT_H_ +#endif // VPX_VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/libs/libvpx/vp9/common/vp9_postproc.c b/libs/libvpx/vp9/common/vp9_postproc.c index dfc315eeac..5373b02181 100644 --- a/libs/libvpx/vp9/common/vp9_postproc.c +++ b/libs/libvpx/vp9/common/vp9_postproc.c @@ -293,7 +293,7 @@ static void swap_mi_and_prev_mi(VP9_COMMON *cm) { } int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { + vp9_ppflags_t *ppflags, int unscaled_width) { const int q = VPXMIN(105, cm->lf.filter_level * 2); const int flags = ppflags->post_proc_flag; YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; @@ -359,7 +359,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) { if (!cm->postproc_state.limits) { cm->postproc_state.limits = - vpx_calloc(cm->width, sizeof(*cm->postproc_state.limits)); + vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits)); } } diff --git a/libs/libvpx/vp9/common/vp9_postproc.h b/libs/libvpx/vp9/common/vp9_postproc.h index 6059094114..67efc1b4e4 100644 --- a/libs/libvpx/vp9/common/vp9_postproc.h +++ b/libs/libvpx/vp9/common/vp9_postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_POSTPROC_H_ -#define VP9_COMMON_VP9_POSTPROC_H_ +#ifndef VPX_VP9_COMMON_VP9_POSTPROC_H_ +#define VPX_VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" @@ -38,7 +38,7 @@ struct VP9Common; #define MFQE_PRECISION 4 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); + vp9_ppflags_t *ppflags, int unscaled_width); void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); @@ -50,4 +50,4 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, } // extern "C" #endif -#endif // VP9_COMMON_VP9_POSTPROC_H_ +#endif // VPX_VP9_COMMON_VP9_POSTPROC_H_ diff --git a/libs/libvpx/vp9/common/vp9_ppflags.h b/libs/libvpx/vp9/common/vp9_ppflags.h index b8b647bf18..a0e3017626 100644 --- a/libs/libvpx/vp9/common/vp9_ppflags.h +++ b/libs/libvpx/vp9/common/vp9_ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_PPFLAGS_H_ -#define VP9_COMMON_VP9_PPFLAGS_H_ +#ifndef VPX_VP9_COMMON_VP9_PPFLAGS_H_ +#define VPX_VP9_COMMON_VP9_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ typedef struct { } // extern "C" #endif -#endif // VP9_COMMON_VP9_PPFLAGS_H_ +#endif // VPX_VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/libs/libvpx/vp9/common/vp9_pred_common.c b/libs/libvpx/vp9/common/vp9_pred_common.c index a7ddc0b951..375cb4d76c 100644 --- a/libs/libvpx/vp9/common/vp9_pred_common.c +++ b/libs/libvpx/vp9/common/vp9_pred_common.c @@ -13,6 +13,32 @@ #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" +int vp9_compound_reference_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; + + return 0; +} + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int ctx; @@ -229,9 +255,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); } else { - pred_context = 1 + - 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || - edge_mi->ref_frame[1] == GOLDEN_FRAME); + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter const int above_has_second = has_second_ref(above_mi); diff --git a/libs/libvpx/vp9/common/vp9_pred_common.h b/libs/libvpx/vp9/common/vp9_pred_common.h index 8400bd70f1..ee59669359 100644 --- a/libs/libvpx/vp9/common/vp9_pred_common.h +++ b/libs/libvpx/vp9/common/vp9_pred_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_PRED_COMMON_H_ -#define VP9_COMMON_VP9_PRED_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_PRED_COMMON_H_ +#define VPX_VP9_COMMON_VP9_PRED_COMMON_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" @@ -145,6 +145,10 @@ static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } +int vp9_compound_reference_allowed(const VP9_COMMON *cm); + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm); + // Returns a context number for the given MB prediction signal // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. 
@@ -176,12 +180,6 @@ static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size, - const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { - return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs); -} - static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, struct tx_counts *tx_counts) { switch (max_tx_size) { @@ -196,4 +194,4 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_PRED_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_quant_common.h b/libs/libvpx/vp9/common/vp9_quant_common.h index 4bae4a8967..ec8b9f4c6a 100644 --- a/libs/libvpx/vp9/common/vp9_quant_common.h +++ b/libs/libvpx/vp9/common/vp9_quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ -#define VP9_COMMON_VP9_QUANT_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ +#define VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ #include "vpx/vpx_codec.h" #include "vp9/common/vp9_seg_common.h" @@ -33,4 +33,4 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // VP9_COMMON_VP9_QUANT_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_reconinter.c b/libs/libvpx/vp9/common/vp9_reconinter.c index a108a65153..04f41e6a33 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.c +++ b/libs/libvpx/vp9/common/vp9_reconinter.c @@ -63,14 +63,14 @@ static INLINE int round_mv_comp_q4(int value) { } static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { - MV res = { - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row + - mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row), - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col + - mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col) - }; + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; return res; } @@ -136,7 +136,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + uint8_t *const dst = dst_buf->buf + (int64_t)dst_buf->stride * y + x; const MV mv = mi->sb_type < BLOCK_8X8 ? average_split_mvs(pd, mi, ref, block) : mi->mv[ref].as_mv; @@ -178,7 +178,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, xs = sf->x_step_q4; ys = sf->y_step_q4; } else { - pre = pre_buf->buf + (y * pre_buf->stride + x); + pre = pre_buf->buf + ((int64_t)y * pre_buf->stride + x); scaled_mv.row = mv_q4.row; scaled_mv.col = mv_q4.col; xs = ys = 16; diff --git a/libs/libvpx/vp9/common/vp9_reconinter.h b/libs/libvpx/vp9/common/vp9_reconinter.h index bb9291a264..12b545831a 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.h +++ b/libs/libvpx/vp9/common/vp9_reconinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_RECONINTER_H_ -#define VP9_COMMON_VP9_RECONINTER_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTER_H_ +#define VPX_VP9_COMMON_VP9_RECONINTER_H_ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" @@ -61,24 +61,25 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *mv_q3, + int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, - int do_avg, const InterpKernel *kernel, + int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y); #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); #endif -static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; - return y * stride + x; + return (int64_t)y * stride + x; } static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src, @@ -103,4 +104,4 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTER_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTER_H_ diff --git a/libs/libvpx/vp9/common/vp9_reconintra.h b/libs/libvpx/vp9/common/vp9_reconintra.h index 78e41c8811..426a35ebfa 100644 --- a/libs/libvpx/vp9/common/vp9_reconintra.h +++ b/libs/libvpx/vp9/common/vp9_reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_RECONINTRA_H_ -#define VP9_COMMON_VP9_RECONINTRA_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTRA_H_ +#define VPX_VP9_COMMON_VP9_RECONINTRA_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -28,4 +28,4 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTRA_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl index 22b67ecace..6980b9b7fb 100644 --- a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -62,18 +62,18 @@ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, i add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; -add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. 
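# specialize() lists the SIMD variants built for a prototype; the generated # vp9_rtcd() dispatcher picks the best one the running CPU supports and # falls back to the plain C implementation otherwise.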
- specialize qw/vp9_iht4x4_16_add sse2/; - specialize qw/vp9_iht8x8_64_add sse2/; - specialize qw/vp9_iht16x16_256_add sse2/; + specialize qw/vp9_iht4x4_16_add neon sse2 vsx/; + specialize qw/vp9_iht8x8_64_add neon sse2 vsx/; + specialize qw/vp9_iht16x16_256_add neon sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. - specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; - specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht4x4_16_add dspr2 msa/; + specialize qw/vp9_iht8x8_64_add dspr2 msa/; specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -100,7 +100,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; + + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/; + specialize qw/vp9_highbd_iht16x16_256_add neon sse4_1/; + } } # @@ -123,28 +129,22 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_ add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; - -add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; specialize qw/vp9_block_error_fp avx2 sse2/; - specialize qw/vp9_fdct8x8_quant neon ssse3/; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; } else { specialize qw/vp9_block_error avx2 msa sse2/; specialize qw/vp9_block_error_fp neon avx2 sse2/; - - specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # fdct functions @@ -177,11 +177,20 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct 
vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; +# +# Apply temporal filter +# if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { -add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse4_1/; +add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; +specialize qw/vp9_apply_temporal_filter sse4_1/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; + specialize qw/vp9_highbd_apply_temporal_filter sse4_1/; + } } + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE @@ -199,7 +208,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; + add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count"; } # End vp9_high encoder functions diff --git a/libs/libvpx/vp9/common/vp9_scale.h b/libs/libvpx/vp9/common/vp9_scale.h index ada8dbaad5..2f3b609483 100644 --- a/libs/libvpx/vp9/common/vp9_scale.h +++ b/libs/libvpx/vp9/common/vp9_scale.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
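A minimal scalar sketch of the accumulate/count scheme behind the vp9_apply_temporal_filter prototype above (illustrative only; the real filter derives the per-pixel weight from strength, blk_fw and local pixel differences):

static void accumulate_pixel(uint8_t pred, int weight, uint32_t *accum,
                             uint16_t *count) {
  *accum += (uint32_t)weight * pred; /* weighted sum of predicted pixels */
  *count += (uint16_t)weight;        /* running weight, for normalization */
}
/* The filtered pixel is later accum / count, rounded. */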
*/ -#ifndef VP9_COMMON_VP9_SCALE_H_ -#define VP9_COMMON_VP9_SCALE_H_ +#ifndef VPX_VP9_COMMON_VP9_SCALE_H_ +#define VPX_VP9_COMMON_VP9_SCALE_H_ #include "vp9/common/vp9_mv.h" #include "vpx_dsp/vpx_convolve.h" @@ -20,7 +20,7 @@ extern "C" { #define REF_SCALE_SHIFT 14 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) -#define REF_INVALID_SCALE -1 +#define REF_INVALID_SCALE (-1) struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor @@ -42,7 +42,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); #if CONFIG_VP9_HIGHBITDEPTH void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h, - int use_high); + int use_highbd); #else void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); @@ -68,4 +68,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCALE_H_ +#endif // VPX_VP9_COMMON_VP9_SCALE_H_ diff --git a/libs/libvpx/vp9/common/vp9_scan.h b/libs/libvpx/vp9/common/vp9_scan.h index b3520e7dcc..72a9a5ec47 100644 --- a/libs/libvpx/vp9/common/vp9_scan.h +++ b/libs/libvpx/vp9/common/vp9_scan.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_SCAN_H_ -#define VP9_COMMON_VP9_SCAN_H_ +#ifndef VPX_VP9_COMMON_VP9_SCAN_H_ +#define VPX_VP9_COMMON_VP9_SCAN_H_ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -55,4 +55,4 @@ static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCAN_H_ +#endif // VPX_VP9_COMMON_VP9_SCAN_H_ diff --git a/libs/libvpx/vp9/common/vp9_seg_common.h b/libs/libvpx/vp9/common/vp9_seg_common.h index b9bf75d580..b63e4f4999 100644 --- a/libs/libvpx/vp9/common/vp9_seg_common.h +++ b/libs/libvpx/vp9/common/vp9_seg_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_SEG_COMMON_H_ -#define VP9_COMMON_VP9_SEG_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_SEG_COMMON_H_ +#define VPX_VP9_COMMON_VP9_SEG_COMMON_H_ #include "vpx_dsp/prob.h" @@ -78,4 +78,4 @@ extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_SEG_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_thread_common.c b/libs/libvpx/vp9/common/vp9_thread_common.c index 8d44e91f2e..c79d9b7f08 100644 --- a/libs/libvpx/vp9/common/vp9_thread_common.c +++ b/libs/libvpx/vp9/common/vp9_thread_common.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> +#include <limits.h> #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -38,11 +40,11 @@ static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; + pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1]; mutex_lock(mutex); while (c > lf_sync->cur_sb_col[r - 1] - nsync) { - pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + pthread_cond_wait(&lf_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -69,12 +71,12 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } if (sig) { - mutex_lock(&lf_sync->mutex_[r]); + mutex_lock(&lf_sync->mutex[r]); lf_sync->cur_sb_col[r] = cur; - pthread_cond_signal(&lf_sync->cond_[r]); - pthread_mutex_unlock(&lf_sync->mutex_[r]); + pthread_cond_signal(&lf_sync->cond[r]); + pthread_mutex_unlock(&lf_sync->mutex[r]); } #else (void)lf_sync; @@ -91,6 +93,7 @@ static INLINE void thread_loop_filter_rows( int y_only, VP9LfSync *const lf_sync) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + const int num_active_workers = lf_sync->num_active_workers; int mi_row, mi_col; enum lf_path path; if (y_only) @@ -102,8 +105,10 @@ static INLINE void thread_loop_filter_rows( else path = LF_PATH_SLOW; + assert(num_active_workers > 0); + for (mi_row = start; mi_row < stop; - mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { + mi_row += num_active_workers * MI_BLOCK_SIZE) { MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); @@ -157,10 +162,12 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; - // Decoder may allocate more threads than number of tiles based on user's - // input. - const int tile_cols = 1 << cm->log2_tile_cols; - const int num_workers = VPXMIN(nworkers, tile_cols); + const int num_tile_cols = 1 << cm->log2_tile_cols; + // Limit the number of workers to prevent changes in frame dimensions from + // causing incorrect sync calculations when sb_rows < threads/tile_cols. + // Further restrict them by the number of tile columns should the user + // request more as this implementation doesn't scale well beyond that. + const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows)); int i; if (!lf_sync->sync_range || sb_rows != lf_sync->rows || @@ -168,6 +175,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } + lf_sync->num_active_workers = num_workers; // Initialize cur_sb_col to -1 for all SB rows. 
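// (cur_sb_col[r] holds the last superblock column filtered in row r; the
// -1 sentinel makes sync_read() block until the row above has progressed
// at least nsync columns past the requested column.)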
memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -231,6 +239,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k @@ -253,19 +283,38 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex_, - vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); - if (lf_sync->mutex_) { + CHECK_MEM_ERROR(cm, lf_sync->mutex, + vpx_malloc(sizeof(*lf_sync->mutex) * rows)); + if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); + pthread_mutex_init(&lf_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, lf_sync->cond_, - vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - if (lf_sync->cond_) { + CHECK_MEM_ERROR(cm, lf_sync->cond, + vpx_malloc(sizeof(*lf_sync->cond) * rows)); + if (lf_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); + pthread_cond_init(&lf_sync->cond[i], NULL); + } + } + pthread_mutex_init(&lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + int i; + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + int i; + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } } } @@ -274,10 +323,16 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; + lf_sync->num_active_workers = lf_sync->num_workers; CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. 
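// (sync_range is the minimum column lead a row must hold over the row
// below it before the latter may proceed; get_sync_range() picks it from
// the frame width using the empirically tuned values noted above.)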
lf_sync->sync_range = get_sync_range(width); } @@ -288,27 +343,149 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { #if CONFIG_MULTITHREAD int i; - if (lf_sync->mutex_ != NULL) { + if (lf_sync->mutex != NULL) { for (i = 0; i < lf_sync->rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); + pthread_mutex_destroy(&lf_sync->mutex[i]); } - vpx_free(lf_sync->mutex_); + vpx_free(lf_sync->mutex); } - if (lf_sync->cond_ != NULL) { + if (lf_sync->cond != NULL) { for (i = 0; i < lf_sync->rows; ++i) { - pthread_cond_destroy(&lf_sync->cond_[i]); + pthread_cond_destroy(&lf_sync->cond[i]); } - vpx_free(lf_sync->cond_); + vpx_free(lf_sync->cond); + } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + pthread_mutex_destroy(&lf_sync->lf_mutex); + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. vp9_zero(*lf_sync); } } +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + int cur_row; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(&lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(&lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(&lf_sync->lf_mutex); + if (lf_sync->corrupted) { + int row = return_val >> MI_BLOCK_SIZE_LOG2; + pthread_mutex_lock(&lf_sync->mutex[row]); + lf_sync->cur_sb_col[row] = INT_MAX; + pthread_cond_signal(&lf_sync->cond[row]); + pthread_mutex_unlock(&lf_sync->mutex[row]); + return_val = -1; + } + pthread_mutex_unlock(&lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. 
+ * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); +} + // Accumulate frame counts. void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/libs/libvpx/vp9/common/vp9_thread_common.h b/libs/libvpx/vp9/common/vp9_thread_common.h index 0f7c3ff748..94c9de6593 100644 --- a/libs/libvpx/vp9/common/vp9_thread_common.h +++ b/libs/libvpx/vp9/common/vp9_thread_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_ -#define VP9_COMMON_VP9_THREAD_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ +#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" #include "vpx_util/vpx_thread.h" @@ -24,8 +24,8 @@ struct FRAME_COUNTS; // Loopfilter row synchronization typedef struct VP9LfSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the loop-filtered superblock index in each row. int *cur_sb_col; @@ -36,7 +36,16 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; - int num_workers; + int num_workers; // number of allocated workers. + int num_active_workers; // number of scheduled workers. + +#if CONFIG_MULTITHREAD + pthread_mutex_t lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; } VP9LfSync; // Allocate memory for loopfilter row synchronization. 
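To make the new recon_done fields concrete: a simplified sketch of the consumer-side handshake performed by get_next_row() above (the in-tree code uses a single if rather than a loop, since each row is signalled only once its final tile finishes):

/* Wait until every tile column has reconstructed this superblock row. */
pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
while (lf_sync->num_tiles_done[row] < tile_cols) {
  pthread_cond_wait(&lf_sync->recon_done_cond[row],
                    &lf_sync->recon_done_mutex[row]);
}
pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);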
@@ -53,6 +62,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); @@ -60,4 +80,4 @@ void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, } // extern "C" #endif -#endif // VP9_COMMON_VP9_THREAD_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_tile_common.h b/libs/libvpx/vp9/common/vp9_tile_common.h index 1b11c2680d..4ccf0a3d5f 100644 --- a/libs/libvpx/vp9/common/vp9_tile_common.h +++ b/libs/libvpx/vp9/common/vp9_tile_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ -#define VP9_COMMON_VP9_TILE_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_TILE_COMMON_H_ +#define VPX_VP9_COMMON_VP9_TILE_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, } // extern "C" #endif -#endif // VP9_COMMON_VP9_TILE_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 0000000000..57b79a732d --- /dev/null +++ b/libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + 
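+  /* Scalar reference for the butterflies above (a sketch, not upstream
+   * code): up to the fixed-point scaling applied through
+   * pair_set_epi32(4 * c, 0), each call computes
+   *   s0 = in0 * c0 + in1 * c1
+   *   s1 = in0 * c1 - in1 * c0
+   * in 64-bit lanes, later narrowed by dct_const_round_shift_64bit() and
+   * pack_4(). */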
x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = 
_mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); 
+ x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); 
+ } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c new file mode 100644 index 0000000000..af158536f9 --- /dev/null +++ b/libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst4_sse4_1(__m128i *const io) { + const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0); + const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0); + const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0); + const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0); + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2]; + __m128i temp[2]; + + transpose_32bit_4x4(io, io); + + extend_64bit(io[0], temp); + s0[0] = _mm_mul_epi32(pair_c1, temp[0]); + s0[1] = _mm_mul_epi32(pair_c1, temp[1]); + s1[0] = _mm_mul_epi32(pair_c2, temp[0]); + s1[1] = _mm_mul_epi32(pair_c2, temp[1]); + + extend_64bit(io[1], temp); + s2[0] = _mm_mul_epi32(pair_c3, temp[0]); + s2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + extend_64bit(io[2], temp); + s3[0] = _mm_mul_epi32(pair_c4, temp[0]); + s3[1] = _mm_mul_epi32(pair_c4, temp[1]); + s4[0] = _mm_mul_epi32(pair_c1, temp[0]); + s4[1] = _mm_mul_epi32(pair_c1, temp[1]); + + extend_64bit(io[3], temp); + s5[0] = _mm_mul_epi32(pair_c2, temp[0]); + s5[1] = _mm_mul_epi32(pair_c2, temp[1]); + s6[0] = _mm_mul_epi32(pair_c4, temp[0]); + s6[1] = _mm_mul_epi32(pair_c4, temp[1]); + + t0[0] = _mm_add_epi64(s0[0], s3[0]); + t0[1] = _mm_add_epi64(s0[1], s3[1]); + t0[0] = _mm_add_epi64(t0[0], s5[0]); + t0[1] = _mm_add_epi64(t0[1], s5[1]); + t1[0] = _mm_sub_epi64(s1[0], s4[0]); + t1[1] = _mm_sub_epi64(s1[1], s4[1]); + t1[0] = _mm_sub_epi64(t1[0], s6[0]); + t1[1] = _mm_sub_epi64(t1[1], s6[1]); + temp[0] = _mm_sub_epi32(io[0], io[2]); + temp[0] = _mm_add_epi32(temp[0], io[3]); + extend_64bit(temp[0], temp); + t2[0] = _mm_mul_epi32(pair_c3, temp[0]); + t2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + s0[0] = _mm_add_epi64(t0[0], s2[0]); + s0[1] = _mm_add_epi64(t0[1], s2[1]); + s1[0] = _mm_add_epi64(t1[0], s2[0]); + s1[1] = _mm_add_epi64(t1[1], s2[1]); + s3[0] = _mm_add_epi64(t0[0], t1[0]); + s3[1] = _mm_add_epi64(t0[1], t1[1]); + s3[0] = _mm_sub_epi64(s3[0], s2[0]); + s3[1] = _mm_sub_epi64(s3[1], s2[1]); + + s0[0] = dct_const_round_shift_64bit(s0[0]); + s0[1] = dct_const_round_shift_64bit(s0[1]); + s1[0] = dct_const_round_shift_64bit(s1[0]); + 
s1[1] = dct_const_round_shift_64bit(s1[1]); + s2[0] = dct_const_round_shift_64bit(t2[0]); + s2[1] = dct_const_round_shift_64bit(t2[1]); + s3[0] = dct_const_round_shift_64bit(s3[0]); + s3[1] = dct_const_round_shift_64bit(s3[1]); + io[0] = pack_4(s0[0], s0[1]); + io[1] = pack_4(s1[0], s1[1]); + io[2] = pack_4(s2[0], s2[1]); + io[3] = pack_4(s3[0], s3[1]); +} + +void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 0000000000..7d949b6dbc --- /dev/null +++ b/libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + 
x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = 
_mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 6996260e26..ad693718c0 100644 --- a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -10,8 +10,6 @@ #include "./vp9_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { @@ -22,23 +20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[1] = load_input_data8(input + 8); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct4_sse2(in); idct4_sse2(in); break; - case 1: // ADST_DCT + case ADST_DCT: idct4_sse2(in); iadst4_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst4_sse2(in); idct4_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst4_sse2(in); iadst4_sse2(in); break; - default: assert(0); break; } // Final round and shift @@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { - case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 1: // ADST_DCT - idct8_sse2(in); + case ADST_DCT: + vpx_idct8_sse2(in); iadst8_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; - default: assert(0); break; } // Final rounding and shift @@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, load_buffer_8x16(input, in1); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 
1: // ADST_DCT + case ADST_DCT: idct16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - case 2: // DCT_ADST + case DCT_ADST: iadst16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - default: assert(0); break; } write_buffer_8x16(dest, in0, stride); diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.c b/libs/libvpx/vp9/decoder/vp9_decodeframe.c index d0e896c13f..7d66cb2b27 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.c @@ -23,6 +23,9 @@ #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" #include "vpx_util/vpx_thread.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" @@ -42,34 +45,15 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_job_queue.h" #define MAX_VP9_HEADER_SIZE 80 -static int is_compound_reference_allowed(const VP9_COMMON *cm) { - int i; - for (i = 1; i < REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; +typedef int (*predict_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); - return 0; -} - -static void setup_compound_reference_mode(VP9_COMMON *cm) { - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } -} +typedef void (*intra_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return len != 0 && len <= (size_t)(end - start); @@ -118,7 +102,7 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) { static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, vpx_reader *r) { - if (is_compound_reference_allowed(cm)) { + if (vp9_compound_reference_allowed(cm)) { return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE) : SINGLE_REFERENCE; @@ -351,20 +335,121 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, } } +static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + if (!mi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const scan_order *sc = (plane || xd->lossless) + ? 
&vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; + *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static void predict_and_reconstruct_intra_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, + int plane, int row, + int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride, + dst, pd->dst.stride, col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + if (*pd->eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, + pd->dst.stride, *pd->eob); + } + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, - int plane, int row, int col, - TX_SIZE tx_size) { + int plane, int row, int col, TX_SIZE tx_size, + int mi_row, int mi_col) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (eob > 0) { + inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob); + } +#if CONFIG_MISMATCH_DEBUG + { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, blk_w, + blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#else + (void)mi_row; + (void)mi_col; +#endif + return eob; +} + +static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; const scan_order *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); + *pd->eob = eob; + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + +static int reconstruct_inter_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, int plane, + int row, int col, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = *pd->eob; + + (void)mi; if (eob > 0) { inverse_transform_block_inter( xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col], pd->dst.stride, eob); } + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + return eob; } @@ -715,6 +800,25 @@ static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl, } } +static MODE_INFO *set_offsets_recon(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, int bw, int bh, + int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + const TileInfo *const 
tile = &xd->tile; + xd->mi = cm->mi_grid_visible + offset; + + set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, int bh, int x_mis, int y_mis, int bwl, int bhl) { @@ -744,6 +848,66 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, return xd->mi[0]; } +static INLINE int predict_recon_inter(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + predict_recon_func func) { + int eobtotal = 0; + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += func(twd, mi, plane, row, col, tx_size); + } + return eobtotal; +} + +static INLINE void predict_recon_intra(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + intra_recon_func func) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 
0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + func(twd, mi, plane, row, col, tx_size); + } +} + static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { VP9_COMMON *const cm = &pbi->common; @@ -801,6 +965,24 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } else { // Prediction dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); +#if CONFIG_MISMATCH_DEBUG + { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif // Reconstruction if (!mi->skip) { @@ -829,8 +1011,8 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, for (row = 0; row < max_blocks_high; row += step) for (col = 0; col < max_blocks_wide; col += step) - eobtotal += - reconstruct_inter_block(twd, mi, plane, row, col, tx_size); + eobtotal += reconstruct_inter_block(twd, mi, plane, row, col, + tx_size, mi_row, mi_col); } if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter @@ -844,6 +1026,98 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } } +static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets_recon(cm, xd, mi_row, mi_col, bw, bh, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, + predict_and_reconstruct_intra_block_row_mt); + } else { + // Prediction + dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); + + // Reconstruction + if (!mi->skip) { + predict_recon_inter(xd, mi, twd, reconstruct_inter_block_row_mt); + } + } + + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); +} + +static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, + y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid 
block size."); + } + + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt); + } else { + if (!mi->skip) { + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int *eob[MAX_MB_PLANE]; + int plane; + int eobtotal; + // Based on eobtotal and bsize, this may be mi->skip may be set to true + // In that case dqcoeff and eob need to be backed up and restored as + // recon_block will not increment these pointers for skip cases + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + dqcoeff[plane] = pd->dqcoeff; + eob[plane] = pd->eob; + } + eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt); + + if (bsize >= BLOCK_8X8 && eobtotal == 0) { + mi->skip = 1; // skip loopfilter + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + pd->dqcoeff = dqcoeff[plane]; + pd->eob = eob[plane]; + } + } + } + } + + xd->corrupted |= vpx_reader_has_error(r); +} + static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row, int mi_col, int bsl) { const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col; @@ -950,6 +1224,75 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); } +static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2, int parse_recon_flag, + process_block_fn_t process_block) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (parse_recon_flag & PARSE) { + *xd->partition = + read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); + } + + partition = *xd->partition; + xd->partition++; + + subsize = get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, + n8x8_l2, parse_recon_flag, process_block); + break; + default: assert(0 && "Invalid partition 
type"); + } + } + + if (parse_recon_flag & PARSE) { + // update partition context + if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) && + bsize >= BLOCK_8X8) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); + } +} + static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, size_t read_size, struct vpx_internal_error_info *error_info, @@ -1148,9 +1491,15 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { // Allocations in vp9_alloc_context_buffers() depend on individual // dimensions as well as the overall size. if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { - if (vp9_alloc_context_buffers(cm, width, height)) + if (vp9_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. + cm->width = 0; + cm->height = 0; vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate context buffers"); + } } else { vp9_set_mb_mi(cm, width, height); } @@ -1348,6 +1697,318 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data, } } +static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); + row_mt_worker_data->recon_map[map_idx] = 1; + pthread_cond_signal(&row_mt_worker_data->recon_sync_cond[sync_idx]); + pthread_mutex_unlock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + volatile int8_t *map = row_mt_worker_data->recon_map + map_idx; + pthread_mutex_t *const mutex = + &row_mt_worker_data->recon_sync_mutex[sync_idx]; + pthread_mutex_lock(mutex); + while (!(*map)) { + pthread_cond_wait(&row_mt_worker_data->recon_sync_cond[sync_idx], mutex); + } + pthread_mutex_unlock(mutex); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) { + int return_val = 0; +#if CONFIG_MULTITHREAD + int corrupted; + pthread_mutex_lock(&lf_sync->lf_mutex); + corrupted = lf_sync->corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + if (!corrupted) { + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tile_cols == lf_sync->num_tiles_done[row]) return_val = 1; + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); + } +#else + (void)lf_sync; + (void)row; + (void)num_tile_cols; +#endif + return return_val; +} + +static void vp9_tile_done(VP9Decoder *pbi) { +#if CONFIG_MULTITHREAD + int terminate; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int all_parse_done = 1 << pbi->common.log2_tile_cols; + pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex); + row_mt_worker_data->num_tiles_done++; + terminate = all_parse_done == row_mt_worker_data->num_tiles_done; + pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex); + if (terminate) { + vp9_jobq_terminate(&row_mt_worker_data->jobq); + } +#else + (void)pbi; +#endif +} + +static void vp9_jobq_alloc(VP9Decoder *pbi) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = 
pbi->row_mt_worker_data; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job); + + if (jobq_size > row_mt_worker_data->jobq_size) { + vpx_free(row_mt_worker_data->jobq_buf); + CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, + jobq_size); + row_mt_worker_data->jobq_size = jobq_size; + } +} + +static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int is_last_row, VP9LfSync *lf_sync, + int cur_tile_col) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int tile_cols = 1 << cm->log2_tile_cols; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int mi_col_start = tile_data->xd.tile.mi_col_start; + int mi_col_end = tile_data->xd.tile.mi_col_end; + int mi_col; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + + // Top Dependency + if (cur_sb_row) { + map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c, + ((cur_sb_row - 1) * tile_cols) + cur_tile_col); + } + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON, + recon_block); + if (cm->lf.filter_level && !cm->skip_loop_filter) { + // Queue LPF_JOB + int is_lpf_job_ready = 0; + + if (mi_col + MI_BLOCK_SIZE >= mi_col_end) { + // Checks if this row has been decoded in all tiles + is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols); + + if (is_lpf_job_ready) { + Job lpf_job; + lpf_job.job_type = LPF_JOB; + if (cur_sb_row > 0) { + lpf_job.row_num = mi_row - MI_BLOCK_SIZE; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + if (is_last_row) { + lpf_job.row_num = mi_row; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + } + } + } + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + cur_tile_col); + } +} + +static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int cur_tile_col, uint8_t **data_end) { + int mi_col; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + TileInfo *tile = &tile_data->xd.tile; + TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col]; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, cm, 0, cur_tile_col); + + /* Update reader only at the beginning of each row in a tile */ + if (mi_row == 0) { + setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info, + &tile_data->bit_reader, pbi->decrypt_cb, + 
pbi->decrypt_state); + } + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + tile_data->xd.error_info = &tile_data->error_info; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE, + parse_block); + } +} + +static int row_decode_worker_hook(void *arg1, void *arg2) { + ThreadData *const thread_data = (ThreadData *)arg1; + uint8_t **data_end = (uint8_t **)arg2; + VP9Decoder *const pbi = thread_data->pbi; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + Job job; + LFWorkerData *lf_data = thread_data->lf_data; + VP9LfSync *lf_sync = thread_data->lf_sync; + volatile int corrupted = 0; + + while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) { + int mi_col; + const int mi_row = job.row_num; + + if (job.job_type == LPF_JOB) { + lf_data->start = mi_row; + lf_data->stop = lf_data->start + MI_BLOCK_SIZE; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + mi_row < cm->mi_rows) { + vp9_loopfilter_job(lf_data, lf_sync); + } + } else if (job.job_type == RECON_JOB) { + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int is_last_row = sb_rows - 1 == cur_sb_row; + TileWorkerData twd_recon; + TileWorkerData *const tile_data_recon = &twd_recon; + int mi_col_start, mi_col_end; + + tile_data_recon->xd = pbi->mb; + vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col); + vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff); + mi_col_start = tile_data_recon->xd.tile.mi_col_start; + mi_col_end = tile_data_recon->xd.tile.mi_col_end; + + if (setjmp(tile_data_recon->error_info.jmp)) { + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + tile_data_recon->error_info.setjmp = 0; + corrupted = 1; + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + job.tile_col); + } + if (is_last_row) { + vp9_tile_done(pbi); + } + continue; + } + + tile_data_recon->error_info.setjmp = 1; + tile_data_recon->xd.error_info = &tile_data_recon->error_info; + + recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync, + job.tile_col); + + if (corrupted) + vpx_internal_error(&tile_data_recon->error_info, + VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (is_last_row) { + vp9_tile_done(pbi); + } + } else if (job.job_type == PARSE_JOB) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col]; + + if (setjmp(tile_data->error_info.jmp)) { + 
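+        // Parse failed: a vpx_internal_error() raised anywhere below
+        // longjmps back here with a nonzero return. Mark the stream corrupt
+        // but still report this tile via vp9_tile_done(); otherwise the
+        // queue would never terminate and workers blocked in
+        // vp9_jobq_dequeue() would wait forever.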
tile_data->error_info.setjmp = 0; + corrupted = 1; + vp9_tile_done(pbi); + continue; + } + + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts; + + tile_data->error_info.setjmp = 1; + + parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end); + + corrupted |= tile_data->xd.corrupted; + if (corrupted) + vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + /* Queue in the recon_job for this row */ + { + Job recon_job; + recon_job.row_num = mi_row; + recon_job.tile_col = job.tile_col; + recon_job.job_type = RECON_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job, + sizeof(recon_job)); + } + + /* Queue next parse job */ + if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) { + Job parse_job; + parse_job.row_num = mi_row + MI_BLOCK_SIZE; + parse_job.tile_col = job.tile_col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, + sizeof(parse_job)); + } + } + } + + return !corrupted; +} + static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; @@ -1426,7 +2087,29 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + if (pbi->row_mt == 1) { + int plane; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + PARSE, parse_block); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + RECON, recon_block); + } else { + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } } pbi->mb.corrupted |= tile_data->xd.corrupted; if (pbi->mb.corrupted) @@ -1471,6 +2154,25 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, return vpx_reader_find_end(&tile_data->bit_reader); } +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. 
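The PARSE_JOB branch above is what keeps the row-multithreaded pipeline fed: finishing the parse of one superblock row immediately queues that row's RECON_JOB plus, while rows remain, the PARSE_JOB for the next row, so parsing stays one step ahead of reconstruction. Below is a minimal sketch of that hand-off, assuming only the Job struct and vp9_jobq_queue() added by this patch; queue_follow_up_jobs is an illustrative name, not part of the patch, and the tile/error bookkeeping is elided.

    /* Sketch: feed the job queue after parsing superblock row `mi_row` of
     * tile `tile_col`. Mirrors the tail of the PARSE_JOB branch above. */
    static void queue_follow_up_jobs(JobQueueRowMt *jobq, int mi_row,
                                     int tile_col, int mi_rows) {
      Job job;
      job.tile_col = tile_col;

      /* Reconstruction of the row just parsed may now run on any worker. */
      job.row_num = mi_row;
      job.job_type = RECON_JOB;
      vp9_jobq_queue(jobq, &job, sizeof(job));

      /* Keep parsing ahead of reconstruction. */
      if (mi_row + MI_BLOCK_SIZE < mi_rows) {
        job.row_num = mi_row + MI_BLOCK_SIZE;
        job.job_type = PARSE_JOB;
        vp9_jobq_queue(jobq, &job, sizeof(job));
      }
    }

Because the hook dequeues in blocking mode, whichever idle worker reaches the queue first picks a job up, regardless of its type.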
@@ -1481,6 +2183,12 @@ static int tile_worker_hook(void *arg1, void *arg2) { TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; + VP9_COMMON *cm = &pbi->common; + + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; volatile int n = tile_data->buf_start; tile_data->error_info.setjmp = 1; @@ -1488,14 +2196,26 @@ static int tile_worker_hook(void *arg1, void *arg2) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } return 0; } tile_data->xd.corrupted = 0; do { - int mi_row, mi_col; + int mi_col; const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. (So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; vp9_zero(tile_data->dqcoeff); vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, tile_data->data_end, buf->size, @@ -1513,6 +2233,14 @@ static int tile_worker_hook(void *arg1, void *arg2) { mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } } if (buf->col == final_col) { @@ -1520,31 +2248,38 @@ static int tile_worker_hook(void *arg1, void *arg2) { } } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + tile_data->data_end = bit_reader_end; return !tile_data->xd.corrupted; } // sorts in descending order static int compare_tile_buffers(const void *a, const void *b) { - const TileBuffer *const buf1 = (const TileBuffer *)a; - const TileBuffer *const buf2 = (const TileBuffer *)b; - return (int)(buf2->size - buf1->size); + const TileBuffer *const buf_a = (const TileBuffer *)a; + const TileBuffer *const buf_b = (const TileBuffer *)b; + return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size); } -static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, - const uint8_t *data_end) { - VP9_COMMON *const cm = &pbi->common; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - const uint8_t *bit_reader_end = NULL; - const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - const int num_workers = VPXMIN(pbi->max_threads, 
tile_cols); +static INLINE void init_mt(VP9Decoder *pbi) { int n; - - assert(tile_cols <= (1 << 6)); - assert(tile_rows == 1); - (void)tile_rows; + VP9_COMMON *const cm = &pbi->common; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; @@ -1562,12 +2297,173 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, } } + // Initialize LPF + if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + vp9_reset_lfm(cm); +} + +static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = pbi->max_threads; + int i, n; + int col; + int corrupted = 0; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + memset(row_mt_worker_data->recon_map, 0, + sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map)); + + init_mt(pbi); + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n]; + winterface->sync(worker); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + thread_data->lf_sync = lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm, + pbi->mb.plane); + } + + thread_data->pbi = pbi; + + worker->hook = row_decode_worker_hook; + worker->data1 = thread_data; + worker->data2 = (void *)&row_mt_worker_data->data_end; + } + + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + } + + /* Reset the jobq to start of the jobq buffer */ + vp9_jobq_reset(&row_mt_worker_data->jobq); + row_mt_worker_data->num_tiles_done = 0; + row_mt_worker_data->data_end = NULL; + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Initialize thread frame counts. 
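+  // Each tile zeroes its own counts here; they are accumulated back into
+  // cm->counts at the end of this function, after the workers have synced.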
+ if (!cm->frame_parallel_decoding_mode) { + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + vp9_zero(tile_data->counts); + } + } + + // queue parse jobs for 0th row of every tile + for (col = 0; col < tile_cols; ++col) { + Job parse_job; + parse_job.row_num = 0; + parse_job.tile_col = col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job)); + } + + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + worker->had_error = 0; + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + corrupted |= !winterface->sync(worker); + } + + pbi->mb.corrupted = corrupted; + + { + /* Set data end */ + TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1]; + row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader); + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (i = 0; i < tile_cols; ++i) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[i]; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + return row_mt_worker_data->data_end; +} + +static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = VPXMIN(pbi->max_threads, tile_cols); + int n; + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + init_mt(pbi); + // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; TileWorkerData *const tile_data = &pbi->tile_worker_data[n + pbi->total_tiles]; winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; @@ -1576,15 +2472,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, worker->data2 = pbi; } - // Note: this memset assumes above_context[0], [1] and [2] - // are allocated as part of the same buffer. 
- memset(cm->above_context, 0, - sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); - memset(cm->above_seg_context, 0, - sizeof(*cm->above_seg_context) * aligned_mi_cols); - - vp9_reset_lfm(cm); - // Load tile data into tile_buffers get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, &pbi->tile_buffers); @@ -1724,6 +2611,22 @@ static void read_bitdepth_colorspace_sampling(VP9_COMMON *cm, } } +static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) { + if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (i == cm->new_fb_idx) continue; + frame_bufs[i].ref_count = 0; + if (!frame_bufs[i].released) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer); + frame_bufs[i].released = 1; + } + } + } +} + static size_t read_uncompressed_header(VP9Decoder *pbi, struct vpx_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; @@ -1788,6 +2691,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_frame_size(cm, rb); if (pbi->need_resync) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + flush_all_fb_on_key(cm); pbi->need_resync = 0; } } else { @@ -1911,6 +2815,35 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_segmentation_dequant(cm); setup_tile_info(cm, rb); + if (pbi->row_mt == 1) { + int num_sbs = 1; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int num_jobs = sb_rows << cm->log2_tile_cols; + + if (pbi->row_mt_worker_data == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); +#endif + } + + if (pbi->max_threads > 1) { + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + + num_sbs = sb_cols * sb_rows; + } + + if (num_sbs > pbi->row_mt_worker_data->num_sbs || + num_jobs > pbi->row_mt_worker_data->num_jobs) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs, + pbi->max_threads, num_jobs); + } + vp9_jobq_alloc(pbi); + } sz = vpx_rb_read_literal(rb, 16); if (sz == 0) @@ -1953,7 +2886,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, cm->reference_mode = read_frame_reference_mode(cm, &r); if (cm->reference_mode != SINGLE_REFERENCE) - setup_compound_reference_mode(cm); + vp9_setup_compound_reference_mode(cm); read_frame_reference_mode_probs(cm, &r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) @@ -2021,6 +2954,12 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame); +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif xd->cur_buf = new_fb; if (!first_partition_size) { @@ -2069,20 +3008,28 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, pbi->total_tiles = tile_rows * tile_cols; } - if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { - // Multi-threaded tile decoder - *p_data_end = decode_tiles_mt(pbi, data + 
first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); - } + if (pbi->max_threads > 1 && tile_rows == 1 && + (tile_cols > 1 || pbi->row_mt == 1)) { + if (pbi->row_mt == 1) { + *p_data_end = + decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end); } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); + // Multi-threaded tile decoder + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt( + new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0, + pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + } } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.h b/libs/libvpx/vp9/decoder/vp9_decodeframe.h index 44717f546a..ba95e72344 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodeframe.h +++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODEFRAME_H_ -#define VP9_DECODER_VP9_DECODEFRAME_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEFRAME_H_ +#define VPX_VP9_DECODER_VP9_DECODEFRAME_H_ #ifdef __cplusplus extern "C" { @@ -32,4 +32,4 @@ void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEFRAME_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.c b/libs/libvpx/vp9/decoder/vp9_decodemv.c index 0a781413b1..943fe478a6 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libs/libvpx/vp9/decoder/vp9_decodemv.c @@ -696,7 +696,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, VP9_COMMON *const cm = &pbi->common; const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - int_mv best_ref_mvs[2]; + int_mv best_ref_mvs[2] = { { 0 }, { 0 } }; int ref, is_compound; uint8_t inter_mode_ctx; const POSITION *const mv_ref_search = mv_ref_blocks[bsize]; diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.h b/libs/libvpx/vp9/decoder/vp9_decodemv.h index b460cb8fb1..11b45ace06 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libs/libvpx/vp9/decoder/vp9_decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DECODEMV_H_ -#define VP9_DECODER_VP9_DECODEMV_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEMV_H_ +#define VPX_VP9_DECODER_VP9_DECODEMV_H_ #include "vpx_dsp/bitreader.h" @@ -26,4 +26,4 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEMV_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEMV_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.c b/libs/libvpx/vp9/decoder/vp9_decoder.c index a913fa560c..0aed3d717c 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.c +++ b/libs/libvpx/vp9/decoder/vp9_decoder.c @@ -55,6 +55,94 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) { cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); } +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs) { + int plane; + const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) * + sizeof(*row_mt_worker_data->dqcoeff[0]); + row_mt_worker_data->num_jobs = num_jobs; +#if CONFIG_MULTITHREAD + { + int i; + CHECK_MEM_ERROR( + cm, row_mt_worker_data->recon_sync_mutex, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); + if (row_mt_worker_data->recon_sync_mutex) { + for (i = 0; i < num_jobs; ++i) { + pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR( + cm, row_mt_worker_data->recon_sync_cond, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); + if (row_mt_worker_data->recon_sync_cond) { + for (i = 0; i < num_jobs; ++i) { + pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL); + } + } + } +#endif + row_mt_worker_data->num_sbs = num_sbs; + for (plane = 0; plane < 3; ++plane) { + CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(16, dqcoeff_size)); + memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); + CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, + sizeof(*row_mt_worker_data->eob[plane]))); + } + CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + vpx_calloc(num_sbs * PARTITIONS_PER_SB, + sizeof(*row_mt_worker_data->partition))); + CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); + + // allocate memory for thread_data + if (row_mt_worker_data->thread_data == NULL) { + const size_t thread_size = + max_threads * sizeof(*row_mt_worker_data->thread_data); + CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + vpx_memalign(32, thread_size)); + } +} + +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { + if (row_mt_worker_data != NULL) { + int plane; +#if CONFIG_MULTITHREAD + int i; + if (row_mt_worker_data->recon_sync_mutex != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]); + } + vpx_free(row_mt_worker_data->recon_sync_mutex); + row_mt_worker_data->recon_sync_mutex = NULL; + } + if (row_mt_worker_data->recon_sync_cond != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]); + } + vpx_free(row_mt_worker_data->recon_sync_cond); + row_mt_worker_data->recon_sync_cond = NULL; + } +#endif + for (plane = 0; plane < 3; ++plane) { + vpx_free(row_mt_worker_data->eob[plane]); + row_mt_worker_data->eob[plane] = NULL; + vpx_free(row_mt_worker_data->dqcoeff[plane]); + row_mt_worker_data->dqcoeff[plane] = NULL; + } + 
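+    // Release the remaining shared buffers. The RowMTWorkerData struct
+    // itself is freed by the caller (see vp9_decoder_remove()).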
vpx_free(row_mt_worker_data->partition); + row_mt_worker_data->partition = NULL; + vpx_free(row_mt_worker_data->recon_map); + row_mt_worker_data->recon_map = NULL; + vpx_free(row_mt_worker_data->thread_data); + row_mt_worker_data->thread_data = NULL; + } +} + static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) { cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip)); if (!cm->mip) return 1; @@ -69,6 +157,7 @@ static void vp9_dec_free_mi(VP9_COMMON *cm) { cm->mip = NULL; vpx_free(cm->mi_grid_base); cm->mi_grid_base = NULL; + cm->mi_alloc_size = 0; } VP9Decoder *vp9_decoder_create(BufferPool *const pool) { @@ -139,6 +228,18 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + if (pbi->row_mt == 1) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + if (pbi->row_mt_worker_data != NULL) { + vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq); + vpx_free(pbi->row_mt_worker_data->jobq_buf); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex); +#endif + } + vpx_free(pbi->row_mt_worker_data); + } + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -260,6 +361,44 @@ static void swap_frame_buffers(VP9Decoder *pbi) { cm->frame_refs[ref_index].idx = -1; } +static void release_fb_on_decoder_exit(VP9Decoder *pbi) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. + winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_tile_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + // Release all the reference buffers if worker thread is holding them. + if (pbi->hold_ref_buf == 1) { + int ref_index = 0, mask; + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + } + pbi->hold_ref_buf = 0; + } +} + int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, const uint8_t **psource) { VP9_COMMON *volatile const cm = &pbi->common; @@ -297,6 +436,9 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, // Find a free frame buffer. Return error if can not find any. 
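+  // If no buffer is free, the error path must first sync the worker threads
+  // and drop any reference buffers they hold (release_fb_on_decoder_exit())
+  // so that a subsequent decode call can recover.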
cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { + pbi->ready_for_new_data = 1; + release_fb_on_decoder_exit(pbi); + vpx_clear_system_state(); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Unable to find free frame buffer"); return cm->error.error_code; @@ -309,44 +451,11 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - int i; - cm->error.setjmp = 0; pbi->ready_for_new_data = 1; - - // Synchronize all threads immediately as a subsequent decode call may - // cause a resize invalidating some allocations. - winterface->sync(&pbi->lf_worker); - for (i = 0; i < pbi->num_tile_workers; ++i) { - winterface->sync(&pbi->tile_workers[i]); - } - - // Release all the reference buffers if worker thread is holding them. - if (pbi->hold_ref_buf == 1) { - int ref_index = 0, mask; - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - // Current thread releases the holding of reference frame. - decrease_ref_count(old_idx, frame_bufs, pool); - - // Release the reference frame in reference map. - if (mask & 1) { - decrease_ref_count(old_idx, frame_bufs, pool); - } - ++ref_index; - } - - // Current thread releases the holding of reference frame. - for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); - } - pbi->hold_ref_buf = 0; - } + release_fb_on_decoder_exit(pbi); // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - vpx_clear_system_state(); return -1; } @@ -364,6 +473,8 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + // Update progress in frame parallel decode. cm->last_width = cm->width; cm->last_height = cm->height; @@ -394,7 +505,7 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, #if CONFIG_VP9_POSTPROC if (!cm->show_existing_frame) { - ret = vp9_post_proc_frame(cm, sd, flags); + ret = vp9_post_proc_frame(cm, sd, flags, cm->width); } else { *sd = *cm->frame_to_show; ret = 0; diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.h b/libs/libvpx/vp9/decoder/vp9_decoder.h index 4b26c314d3..4a22aa6b5b 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.h +++ b/libs/libvpx/vp9/decoder/vp9_decoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DECODER_H_ -#define VP9_DECODER_VP9_DECODER_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODER_H_ +#define VPX_VP9_DECODER_VP9_DECODER_H_ #include "./vpx_config.h" @@ -21,11 +21,24 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" +#include "./vp9_job_queue.h" #ifdef __cplusplus extern "C" { #endif +#define EOBS_PER_SB_LOG2 8 +#define DQCOEFFS_PER_SB_LOG2 12 +#define PARTITIONS_PER_SB 85 + +typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType; + +typedef struct ThreadData { + struct VP9Decoder *pbi; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; +} ThreadData; + typedef struct TileBuffer { const uint8_t *data; size_t size; @@ -37,12 +50,46 @@ typedef struct TileWorkerData { int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive vpx_reader bit_reader; FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); struct vpx_internal_error_info error_info; } TileWorkerData; +typedef void (*process_block_fn_t)(TileWorkerData *twd, + struct VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, + int bhl); + +typedef struct RowMTWorkerData { + int num_sbs; + int *eob[MAX_MB_PLANE]; + PARTITION_TYPE *partition; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int8_t *recon_map; + const uint8_t *data_end; + uint8_t *jobq_buf; + JobQueueRowMt jobq; + size_t jobq_size; + int num_tiles_done; + int num_jobs; +#if CONFIG_MULTITHREAD + pthread_mutex_t recon_done_mutex; + pthread_mutex_t *recon_sync_mutex; + pthread_cond_t *recon_sync_cond; +#endif + ThreadData *thread_data; +} RowMTWorkerData; + +/* Structure to queue and dequeue row decode jobs */ +typedef struct Job { + int row_num; + int tile_col; + JobType job_type; +} Job; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -72,10 +119,14 @@ typedef struct VP9Decoder { int inv_tile_order; int need_resync; // wait for key/intra-only frame. int hold_ref_buf; // hold the reference buffer. 
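+  // Row-based multithreading state: row_mt and lpf_mt_opt select the decode
+  // path in vp9_decode_frame(); row_mt_worker_data owns the shared per-SB
+  // dqcoeff/eob/partition buffers and the parse/recon job queue.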
+ + int row_mt; + int lpf_mt_opt; + RowMTWorkerData *row_mt_worker_data; } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, - const uint8_t **dest); + const uint8_t **psource); int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, vp9_ppflags_t *flags); @@ -109,6 +160,11 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); void vp9_decoder_remove(struct VP9Decoder *pbi); +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs); +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data); + static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, BufferPool *const pool) { if (idx >= 0 && frame_bufs[idx].ref_count > 0) { @@ -129,4 +185,4 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODER_H_ +#endif // VPX_VP9_DECODER_VP9_DECODER_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.c b/libs/libvpx/vp9/decoder/vp9_detokenize.c index 4bd016dc7d..e250a5a354 100644 --- a/libs/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libs/libvpx/vp9/decoder/vp9_detokenize.c @@ -33,6 +33,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, int *count, unsigned int *range) { const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT; const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); +#if CONFIG_BITSTREAM_DEBUG + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } +#endif if (*count < 0) { r->value = *value; @@ -51,6 +65,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, *value <<= shift; *count -= shift; } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 1; + if (bit != ref_result) { + fprintf( + stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif return 1; } *range = split; @@ -60,6 +88,19 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, *value <<= shift; *count -= shift; } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 0; + if (bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif return 0; } diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.h b/libs/libvpx/vp9/decoder/vp9_detokenize.h index 7b0d876016..a32052ffff 100644 --- a/libs/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libs/libvpx/vp9/decoder/vp9_detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DETOKENIZE_H_ -#define VP9_DECODER_VP9_DETOKENIZE_H_ +#ifndef VPX_VP9_DECODER_VP9_DETOKENIZE_H_ +#define VPX_VP9_DECODER_VP9_DETOKENIZE_H_ #include "vpx_dsp/bitreader.h" #include "vp9/decoder/vp9_decoder.h" @@ -27,4 +27,4 @@ int vp9_decode_block_tokens(TileWorkerData *twd, int plane, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DETOKENIZE_H_ +#endif // VPX_VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_dsubexp.h b/libs/libvpx/vp9/decoder/vp9_dsubexp.h index 5a8ec8300c..b0c7750736 100644 --- a/libs/libvpx/vp9/decoder/vp9_dsubexp.h +++ b/libs/libvpx/vp9/decoder/vp9_dsubexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DSUBEXP_H_ -#define VP9_DECODER_VP9_DSUBEXP_H_ +#ifndef VPX_VP9_DECODER_VP9_DSUBEXP_H_ +#define VPX_VP9_DECODER_VP9_DSUBEXP_H_ #include "vpx_dsp/bitreader.h" @@ -23,4 +23,4 @@ void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p); } // extern "C" #endif -#endif // VP9_DECODER_VP9_DSUBEXP_H_ +#endif // VPX_VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_job_queue.c b/libs/libvpx/vp9/decoder/vp9_job_queue.c new file mode 100644 index 0000000000..9a31f5a6d0 --- /dev/null +++ b/libs/libvpx/vp9/decoder/vp9_job_queue.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx/vpx_integer.h" + +#include "vp9/decoder/vp9_job_queue.h" + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) { +#if CONFIG_MULTITHREAD + pthread_mutex_init(&jobq->mutex, NULL); + pthread_cond_init(&jobq->cond, NULL); +#endif + jobq->buf_base = buf; + jobq->buf_wr = buf; + jobq->buf_rd = buf; + jobq->buf_end = buf + buf_size; + jobq->terminate = 0; +} + +void vp9_jobq_reset(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->buf_wr = jobq->buf_base; + jobq->buf_rd = jobq->buf_base; + jobq->terminate = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +void vp9_jobq_deinit(JobQueueRowMt *jobq) { + vp9_jobq_reset(jobq); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&jobq->mutex); + pthread_cond_destroy(&jobq->cond); +#endif +} + +void vp9_jobq_terminate(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->terminate = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(&jobq->cond); + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_wr + job_size) { + memcpy(jobq->buf_wr, job, job_size); + jobq->buf_wr = jobq->buf_wr + job_size; +#if CONFIG_MULTITHREAD + pthread_cond_signal(&jobq->cond); +#endif + ret = 0; + } else { + /* Wrap around case is not supported */ + assert(0); + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + return ret; +} + +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if 
+    while (1) {
+      if (jobq->buf_wr >= jobq->buf_rd + job_size) {
+        memcpy(job, jobq->buf_rd, job_size);
+        jobq->buf_rd = jobq->buf_rd + job_size;
+        ret = 0;
+        break;
+      } else {
+        /* If all the entries have been dequeued, then break and return */
+        if (jobq->terminate == 1) {
+          ret = 1;
+          break;
+        }
+        if (blocking == 1) {
+#if CONFIG_MULTITHREAD
+          pthread_cond_wait(&jobq->cond, &jobq->mutex);
+#endif
+        } else {
+          /* If there is no job available and this is a non-blocking call,
+           * return failure */
+          ret = 1;
+          break;
+        }
+      }
+    }
+  } else {
+    /* Wrap around case is not supported */
+    ret = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+
+  return ret;
+}
diff --git a/libs/libvpx/vp9/decoder/vp9_job_queue.h b/libs/libvpx/vp9/decoder/vp9_job_queue.h
new file mode 100644
index 0000000000..bc23bf9c2c
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_job_queue.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+
+#include "vpx_util/vpx_thread.h"
+
+typedef struct {
+  // Pointer to buffer base which contains the jobs
+  uint8_t *buf_base;
+
+  // Pointer to current address where new job can be added
+  uint8_t *volatile buf_wr;
+
+  // Pointer to current address from where next job can be obtained
+  uint8_t *volatile buf_rd;
+
+  // Pointer to end of job buffer
+  uint8_t *buf_end;
+
+  int terminate;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+#endif
+} JobQueueRowMt;
+
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size);
+void vp9_jobq_reset(JobQueueRowMt *jobq);
+void vp9_jobq_deinit(JobQueueRowMt *jobq);
+void vp9_jobq_terminate(JobQueueRowMt *jobq);
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size);
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+                     int blocking);
+
+#endif  // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
deleted file mode 100644
index 513718e7cb..0000000000
--- a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-
-#include "vp9/common/vp9_blockd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
-                            tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
-  tran_low_t temp_buffer[64];
-  (void)coeff_ptr;
-
-  vpx_fdct8x8_neon(input, temp_buffer, stride);
-  vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr,
-                       qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr,
-                       iscan_ptr);
-}
diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 97a09bdff6..8b62b450ce 100644
--- a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
     store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
   }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
   {
    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
                                             vget_high_s16(v_eobmax_76543210));
@@ -111,6 +114,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,

     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
   }
+#endif  // __aarch64__
 }

 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -122,7 +126,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
                                 const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                                 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                                 uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan_ptr) {
+                                const int16_t *scan, const int16_t *iscan) {
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
@@ -134,8 +138,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
   const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);

   // Process dc and the first seven ac coeffs.
-  const uint16x8_t iscan =
-      vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+  const uint16x8_t v_iscan =
+      vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
   const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
   const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
   const int16x8_t coeff_abs = vabsq_s16(coeff);
@@ -169,12 +173,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
   dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));

-  eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+  eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);

   store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
   store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);

-  iscan_ptr += 8;
+  iscan += 8;
   coeff_ptr += 8;
   qcoeff_ptr += 8;
   dqcoeff_ptr += 8;
@@ -188,8 +192,8 @@

   // Process the rest of the ac coeffs.
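   // (32 * 32 = 1024 coefficients in total; the first 8 were handled above,
   // so this loop runs (1024 - 8) / 8 = 127 times.)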
   for (i = 8; i < 32 * 32; i += 8) {
-    const uint16x8_t iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+    const uint16x8_t v_iscan =
+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
     const int16x8_t coeff_abs = vabsq_s16(coeff);
@@ -215,17 +219,20 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
         vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));

     eob_max =
-        vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+        vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));

     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);

-    iscan_ptr += 8;
+    iscan += 8;
     coeff_ptr += 8;
     qcoeff_ptr += 8;
     dqcoeff_ptr += 8;
   }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -233,5 +240,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }
 }
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
index 188d04d8f6..61786d8f66 100644
--- a/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -8,6 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

+#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
@@ -79,6 +80,7 @@
     return err;                                                      \
   }

+#if !CONFIG_VP9_HIGHBITDEPTH
 BLOCK_ERROR_BLOCKSIZE_MSA(16);
 BLOCK_ERROR_BLOCKSIZE_MSA(64);
 BLOCK_ERROR_BLOCKSIZE_MSA(256);
@@ -103,3 +105,4 @@ int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,

   return err;
 }
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
index 0831e59148..efbbe830db 100644
--- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -10,6 +10,7 @@

 #include <assert.h>

+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
index fa36f09ab8..9c5cc12ef0 100644
--- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -10,6 +10,7 @@

 #include <assert.h>

+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
index 604db853c4..26d81aa9ef 100644
--- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -10,6 +10,7 @@

 #include <assert.h>

+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
index 794bec70b6..fa1af2fc57 100644
--- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
-#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_

 #include "vpx_dsp/mips/fwd_txfm_msa.h"
 #include "vpx_dsp/mips/txfm_macros_msa.h"
@@ -113,4 +113,4 @@
     PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \
                 out0, out1, out2, out3);                                   \
   }
-#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
+#endif  // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
diff --git a/libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
new file mode 100644
index 0000000000..4f88b8fff6
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -0,0 +1,292 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and return the high 16 bits of the intermediate integers.
+// (a * b) >> 16
+// Note: Because this is done in 2 operations, a and b cannot both be INT16_MIN
+// (the saturating vec_madds step would clip the intermediate result).
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra
+  // right shift.
+  return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+  return vec_xor(vec_add(a, mask), mask);
+}
+
+// Compare packed 16-bit integers across a, and return the maximum value in
+// every element. Returns a vector containing the biggest value across vector
+// a.
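+// (A cross-lane reduction: each permute + max step below halves the number of
+// distinct candidates, so log2(8) = 3 steps leave the overall maximum in
+// every lane.)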
+static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + + (void)scan; + (void)skip_block; + assert(!skip_block); + + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from round and quant + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + + // Remove DC value from dequant + dequant = vec_splat(dequant, 1); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per + // loop iteration. 
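+  // In general that is (n_coeffs - 16) / 24 iterations of the do loop below: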
+  // for 8x8: 16 + 2 x 24 = 64
+  // for 16x16: 16 + 10 x 24 = 256
+  if (n_coeffs > 16) {
+    int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2;
+    bool16x8_t zero_coeff2;
+
+    int index = 16;
+    int off0 = 32;
+    int off1 = 48;
+    int off2 = 64;
+
+    do {
+      coeff0 = vec_vsx_ld(off0, coeff_ptr);
+      coeff1 = vec_vsx_ld(off1, coeff_ptr);
+      coeff2 = vec_vsx_ld(off2, coeff_ptr);
+      scan0 = vec_vsx_ld(off0, iscan);
+      scan1 = vec_vsx_ld(off1, iscan);
+      scan2 = vec_vsx_ld(off2, iscan);
+
+      qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
+      zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+      qcoeff0 = vec_sign(qcoeff0, coeff0);
+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+
+      qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant);
+      zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+      qcoeff1 = vec_sign(qcoeff1, coeff1);
+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+
+      qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant);
+      zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16);
+      qcoeff2 = vec_sign(qcoeff2, coeff2);
+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+      eob = vec_max(eob, vec_or(scan0, zero_coeff0));
+      eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2));
+      eob = vec_max(eob, eob2);
+
+      index += 24;
+      off0 += 48;
+      off1 += 48;
+      off2 += 48;
+    } while (index < n_coeffs);
+  }
+
+  eob = vec_max_across(eob);
+  *eob_ptr = eob[0] + 1;
+}
+
+// Sets the value of each 32-bit integer to 1 when the corresponding value in
+// a is negative.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+  return vec_sr(a, vec_shift_sign_s32);
+}
+
+// Dequantization function used for 32x32 blocks. Quantized coeff of 32x32
+// blocks are twice as big as for other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+                                            int16x8_t dequant) {
+  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+  // Add 1 if negative to round towards zero because the C code uses division.
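+  // (An arithmetic shift alone would round toward minus infinity, e.g.
+  // -5 >> 1 == -3, whereas (-5 + 1) >> 1 == -2, matching -5 / 2 in C.)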
+ dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2)); + int16x8_t abs_coeff0 = vec_abs(coeff0); + int16x8_t abs_coeff1 = vec_abs(coeff1); + + (void)scan; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + mask0 = vec_cmpge(abs_coeff0, thres); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff0 = vec_and(qcoeff0, mask0); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from thres, round, quant and dequant + thres = vec_splat(thres, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + + mask1 = vec_cmpge(abs_coeff1, thres); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = + vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16); + qcoeff1 = vec_and(qcoeff1, mask1); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + do { + int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2, mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + abs_coeff0 = vec_abs(coeff0); + abs_coeff1 = vec_abs(coeff1); + abs_coeff2 = vec_abs(coeff2); + + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16); + qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16); + + mask0 = vec_cmpge(abs_coeff0, thres); + mask1 = vec_cmpge(abs_coeff1, thres); + mask2 = vec_cmpge(abs_coeff2, thres); + + qcoeff0 = vec_and(qcoeff0, mask0); + qcoeff1 = vec_and(qcoeff1, 
 mask1);
+    qcoeff2 = vec_and(qcoeff2, mask2);
+
+    zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+    zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+    zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16);
+
+    qcoeff0 = vec_sign(qcoeff0, coeff0);
+    qcoeff1 = vec_sign(qcoeff1, coeff1);
+    qcoeff2 = vec_sign(qcoeff2, coeff2);
+
+    vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+    vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+    vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+    dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant);
+    dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant);
+    dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant);
+
+    vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+    vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+    vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+    eob = vec_max(eob, vec_or(scan0, zero_coeff0));
+    eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2));
+    eob = vec_max(eob, eob2);
+
+    off0 += 48;
+    off1 += 48;
+    off2 += 48;
+    num_itr--;
+  } while (num_itr != 0);
+
+  eob = vec_max_across(eob);
+  *eob_ptr = eob[0] + 1;
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h
index e508cb44ac..22a657e035 100644
--- a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h
+++ b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h
@@ -15,8 +15,8 @@
  * for altref frames. Go to alt_ref_aq_private.h for implementation details.
  */

-#ifndef VP9_ENCODER_VP9_ALT_REF_AQ_H_
-#define VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_

 #include "vpx/vpx_integer.h"
@@ -124,4 +124,4 @@ void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self);
 }  // extern "C"
 #endif

-#endif  // VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#endif  // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_360.h b/libs/libvpx/vp9/encoder/vp9_aq_360.h
index b1b56561d8..749d3c198a 100644
--- a/libs/libvpx/vp9/encoder/vp9_aq_360.h
+++ b/libs/libvpx/vp9/encoder/vp9_aq_360.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VP9_ENCODER_VP9_AQ_360_H_
-#define VP9_ENCODER_VP9_AQ_360_H_
+#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_
+#define VPX_VP9_ENCODER_VP9_AQ_360_H_

 #include "vp9/encoder/vp9_encoder.h"
@@ -24,4 +24,4 @@ void vp9_360aq_frame_setup(VP9_COMP *cpi);
 }  // extern "C"
 #endif

-#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_360_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_complexity.h b/libs/libvpx/vp9/encoder/vp9_aq_complexity.h
index a00d34e702..d3cb34c013 100644
--- a/libs/libvpx/vp9/encoder/vp9_aq_complexity.h
+++ b/libs/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ -#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 2f2f0055a7..adb12c10c6 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -21,6 +21,14 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { size_t last_coded_q_map_size; CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); @@ -39,13 +47,16 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { } assert(MAXQ <= 255); memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->counter_encode_maxq_scene_change = 0; return cr; } void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { - vpx_free(cr->map); - vpx_free(cr->last_coded_q_map); - vpx_free(cr); + if (cr != NULL) { + vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); + vpx_free(cr); + } } // Check if this coding block, of size bsize, should be considered for refresh @@ -318,6 +329,28 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { rc->baseline_gf_interval = 10; } +static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index, + int sb_col_index) { + unsigned int source_variance; + const uint8_t *src_y = cpi->Source->y_buffer; + const int ystride = cpi->Source->y_stride; + unsigned int sse; + const BLOCK_SIZE bsize = BLOCK_64X64; + src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + source_variance = + cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse); + if (source_variance == 0) { + uint64_t block_sad; + const uint8_t *last_src_y = cpi->Last_Source->y_buffer; + const int last_ystride = cpi->Last_Source->y_stride; + last_src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + block_sad = + cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride); + if (block_sad == 0) return 1; + } + return 0; +} + // Update the segmentation map, and related quantities: cyclic refresh map, // refresh sb_index, and target number of blocks to be refreshed. 
 // The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
@@ -368,8 +401,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * MI_BLOCK_SIZE;
     int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int flat_static_blocks = 0;
+    int compute_content = 1;
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) compute_content = 0;
+#endif
+    if (cpi->Last_Source == NULL ||
+        cpi->Last_Source->y_width != cpi->Source->y_width ||
+        cpi->Last_Source->y_height != cpi->Source->y_height)
+      compute_content = 0;
     bl_index = mi_row * cm->mi_cols + mi_col;
     // Loop through all 8x8 blocks in superblock and update map.
     xmis =
@@ -400,11 +442,21 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     // Enforce constant segment over superblock.
     // If segment is at least half of superblock, set to 1.
     if (sum_map >= xmis * ymis / 2) {
-      for (y = 0; y < ymis; y++)
-        for (x = 0; x < xmis; x++) {
-          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
-        }
-      cr->target_num_seg_blocks += xmis * ymis;
+      // This superblock is a candidate for refresh:
+      // compute spatial variance and exclude blocks that are spatially flat
+      // and stationary. Note: this is currently only done for screen content
+      // mode.
+      if (compute_content && cr->skip_flat_static_blocks)
+        flat_static_blocks =
+            is_superblock_flat_static(cpi, sb_row_index, sb_col_index);
+      if (!flat_static_blocks) {
+        // Label this superblock as segment 1.
+        for (y = 0; y < ymis; y++)
+          for (x = 0; x < xmis; x++) {
+            seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+          }
+        cr->target_num_seg_blocks += xmis * ymis;
+      }
     }
     i++;
     if (i == sbs_in_frame) {
@@ -413,7 +465,8 @@
   } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
   cr->sb_index = i;
   cr->reduce_refresh = 0;
-  if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1;
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+    if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1;
 }

 // Set cyclic refresh parameters.
@@ -425,11 +478,20 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
   int target_refresh = 0;
   double weight_segment_target = 0;
   double weight_segment = 0;
-  int thresh_low_motion = (cm->width < 720) ? 55 : 20;
+  int thresh_low_motion = 20;
+  int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20,
+                         rc->best_quality << 1);
+  int qp_max_thresh = 117 * MAXQ >> 7;
   cr->apply_cyclic_refresh = 1;
-  if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
+  if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 ||
+      is_lossless_requested(&cpi->oxcf) ||
+      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (cpi->use_svc &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
       (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
-       rc->frames_since_key > 40)) {
+       rc->frames_since_key > 40) ||
+      (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh &&
+       rc->frames_since_key > 20)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
@@ -454,20 +516,32 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
       cr->rate_boost_fac = 13;
     }
   }
+  // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and
+  // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2
+  // (rate_boost_fac = 10 disables segment#2).
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+    // Only enable feature of skipping flat_static blocks for top layer
+    // under screen content mode.
+    if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+      cr->skip_flat_static_blocks = 1;
+    cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10;
+    // Increase the amount of refresh on scene change that is encoded at max Q,
+    // increase for a few cycles of the refresh period (~100 / percent_refresh).
+    if (cr->counter_encode_maxq_scene_change < 30)
+      cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15;
+    cr->rate_ratio_qdelta = 2.0;
+    cr->rate_boost_fac = 10;
+  }
   // Adjust some parameters for low resolutions.
-  if (cm->width <= 352 && cm->height <= 288) {
+  if (cm->width * cm->height <= 352 * 288) {
     if (rc->avg_frame_bandwidth < 3000) {
-      cr->motion_thresh = 16;
+      cr->motion_thresh = 64;
       cr->rate_boost_fac = 13;
     } else {
       cr->max_qdelta_perc = 70;
       cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
     }
   }
-  if (cpi->svc.spatial_layer_id > 0) {
-    cr->motion_thresh = 4;
-    cr->rate_boost_fac = 12;
-  }
   if (cpi->oxcf.rc_mode == VPX_VBR) {
     // To be adjusted for VBR mode, e.g., based on gf period and boost.
     // For now use smaller qp-delta (than CBR), no second boosted seg, and
@@ -492,6 +566,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
       num8x8bl;
   if (weight_segment_target < 7 * weight_segment / 8)
     weight_segment = weight_segment_target;
+  // For screen-content: don't include target for the weight segment,
+  // since for all flat areas the segment is reset, so it's more accurate
+  // to just use the previous actual number of seg blocks for the weight.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+    weight_segment =
+        (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) /
+        num8x8bl;
   cr->weight_segment = weight_segment;
 }

@@ -501,23 +582,31 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
+  int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->use_svc && cpi->svc.high_source_sad_superframe);
   if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
-  if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) {
+  // Reset if resolution change has occurred.
+  if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi);
+  if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) ||
+      scene_change_detected) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->segmentation_map;
     memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cm->frame_type == KEY_FRAME) {
+    if (cm->frame_type == KEY_FRAME || scene_change_detected) {
       memset(cr->last_coded_q_map, MAXQ,
              cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
       cr->reduce_refresh = 0;
+      cr->counter_encode_maxq_scene_change = 0;
     }
     return;
   } else {
     int qindex_delta = 0;
     int qindex2;
     const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+    cr->counter_encode_maxq_scene_change++;
     vpx_clear_system_state();
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
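(Aside: a minimal standalone C sketch of the threshold arithmetic used in the
hunks above; it assumes MAXQ is 255, as defined in
vp9/common/vp9_quant_common.h, and is illustrative only, not part of the
upstream patch.)

#include <stdio.h>

#define MAXQ 255

int main(void) {
  /* Cyclic refresh is disabled once the average inter-frame qindex
   * exceeds 117 * MAXQ >> 7. */
  const int qp_max_thresh = 117 * MAXQ >> 7; /* = 233 */
  /* A full refresh cycle covers the frame in roughly
   * 100 / percent_refresh frames. */
  const int cycle_normal = (100 + 10 - 1) / 10;  /* percent_refresh = 10 */
  const int cycle_boosted = (100 + 15 - 1) / 15; /* after a scene change */
  printf("qp_max_thresh=%d cycle=%d frames (boosted: %d)\n", qp_max_thresh,
         cycle_normal, cycle_boosted);
  return 0;
}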
@@ -567,9 +656,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { cr->qindex_delta[2] = qindex_delta; vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); - // Reset if resoluton change has occurred. - if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); - // Update the segmentation and refresh map. cyclic_refresh_update_map(cpi); } @@ -583,8 +669,19 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; memset(cr->map, 0, cm->mi_rows * cm->mi_cols); - memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; + cr->counter_encode_maxq_scene_change = 0; +} + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } } diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index 77fa67c9e1..b6d7fdeae7 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ -#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -68,6 +68,8 @@ struct CYCLIC_REFRESH { int reduce_refresh; double weight_segment; int apply_cyclic_refresh; + int counter_encode_maxq_scene_change; + int skip_flat_static_blocks; }; struct VP9_COMP; @@ -102,10 +104,6 @@ void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, int mi_row, int mi_col, BLOCK_SIZE bsize); -// Update the segmentation map, and related quantities: cyclic refresh map, -// refresh sb_index, and target number of blocks to be refreshed. -void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi); - // From the just encoded frame: update the actual number of blocks that were // applied the segment delta q, and the amount of low motion in the frame. 
// Also check conditions for forcing golden update, or preventing golden @@ -139,8 +137,10 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { return CR_SEGMENT_ID_BASE; } +void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_aq_variance.c b/libs/libvpx/vp9/encoder/vp9_aq_variance.c index 477f62ba5a..1f9ce2354c 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_variance.c +++ b/libs/libvpx/vp9/encoder/vp9_aq_variance.c @@ -19,6 +19,7 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_segmentation.h" #define ENERGY_MIN (-4) @@ -108,7 +109,7 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, #if CONFIG_VP9_HIGHBITDEPTH static void aq_highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, uint64_t *sum) { + uint64_t *sse, int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); @@ -127,15 +128,6 @@ static void aq_highbd_variance64(const uint8_t *a8, int a_stride, } } -static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} #endif // CONFIG_VP9_HIGHBITDEPTH static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, @@ -153,11 +145,13 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, int avg; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, + uint64_t sse64 = 0; + int64_t sum64 = 0; + aq_highbd_variance64(x->plane[0].src.buf, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, - &sse, &avg); - sse >>= 2 * (xd->bd - 8); - avg >>= (xd->bd - 8); + &sse64, &sum64); + sse = (unsigned int)(sse64 >> (2 * (xd->bd - 8))); + avg = (int)(sum64 >> (xd->bd - 8)); } else { aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0, bw, bh, &sse, &avg); @@ -192,6 +186,40 @@ double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { return log(var + 1.0); } +// Get the range of sub block energy values; +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e) { + VP9_COMMON *const cm = &cpi->common; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + int x, y; + + if (xmis < bw || ymis < bh) { + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); + *min_e = vp9_block_energy(cpi, mb, bsize); + *max_e = *min_e; + } else { + int energy; + *min_e = ENERGY_MAX; + *max_e = ENERGY_MIN; + + for (y = 0; y < ymis; ++y) { + for (x = 0; x < xmis; ++x) { + vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x); + energy = vp9_block_energy(cpi, mb, BLOCK_8X8); + *min_e = VPXMIN(*min_e, energy); + *max_e = VPXMAX(*max_e, energy); + } + } + } + + // Re-instate source pointers back to what they should have been on entry. 
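+  // (The energy loop above advanced the src planes one 8x8 block at a time,
+  // so they must be repointed at the caller's mi_row/mi_col before return.)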
+ vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); +} + #define DEFAULT_E_MIDPOINT 10.0 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { double energy; diff --git a/libs/libvpx/vp9/encoder/vp9_aq_variance.h b/libs/libvpx/vp9/encoder/vp9_aq_variance.h index 211a69f392..a4f872879d 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_variance.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_variance.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_ -#define VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -20,11 +20,15 @@ extern "C" { unsigned int vp9_vaq_segment_id(int energy); void vp9_vaq_frame_setup(VP9_COMP *cpi); +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e); int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.c b/libs/libvpx/vp9/encoder/vp9_bitstream.c index d346cd57aa..3eff4ce830 100644 --- a/libs/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libs/libvpx/vp9/encoder/vp9_bitstream.c @@ -18,6 +18,9 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_ports/system_state.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" @@ -39,8 +42,10 @@ static const struct vp9_token intra_mode_encodings[INTRA_MODES] = { { 0, 1 }, { 6, 3 }, { 28, 5 }, { 30, 5 }, { 58, 6 }, { 59, 6 }, { 126, 7 }, { 127, 7 }, { 62, 6 }, { 2, 2 } }; -static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] = - { { 0, 1 }, { 2, 2 }, { 3, 2 } }; +static const struct vp9_token + switchable_interp_encodings[SWITCHABLE_FILTERS] = { { 0, 1 }, + { 2, 2 }, + { 3, 2 } }; static const struct vp9_token partition_encodings[PARTITION_TYPES] = { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }; @@ -86,7 +91,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, BLOCK_SIZE bsize = xd->mi[0]->sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const vpx_prob *const tx_probs = - get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); + get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs); vpx_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vpx_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -217,7 +222,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, } if (is_compound) { - vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME, + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1], vp9_get_pred_prob_comp_ref_p(cm, xd)); } else { const int bit0 = mi->ref_frame[0] != LAST_FRAME; @@ -459,7 +465,8 @@ static void write_modes_sb( write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, max_mv_magnitude, interp_filter_selected); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, max_mv_magnitude, interp_filter_selected); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, 
 mi_col + bs,
@@ -469,7 +476,6 @@
       write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
                      subsize, max_mv_magnitude, interp_filter_selected);
       break;
-    default: assert(0);
   }
 }
@@ -618,9 +624,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
       return;
     }

-    case ONE_LOOP_REDUCED: {
+    default: {
       int updates = 0;
       int noupdates_before_first = 0;
+      assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -670,7 +677,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
       }
       return;
     }
-    default: assert(0);
   }
 }
@@ -909,10 +915,24 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
            (cpi->refresh_golden_frame << cpi->alt_fb_idx);
   } else {
     int arf_idx = cpi->alt_fb_idx;
-    if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      arf_idx = gf_group->arf_update_idx[gf_group->index];
+    GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+    if (cpi->multi_layer_arf) {
+      for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) {
+        if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx &&
+            arf_idx != cpi->gld_fb_idx) {
+          int idx;
+          for (idx = 0; idx < gf_group->stack_size; ++idx)
+            if (arf_idx == gf_group->arf_index_stack[idx]) break;
+          if (idx == gf_group->stack_size) break;
+        }
+      }
     }
+    cpi->twopass.gf_group.top_arf_idx = arf_idx;
+
+    if (cpi->use_svc && cpi->svc.use_set_ref_frame_config &&
+        cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS)
+      return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id];
     return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
            (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
            (cpi->refresh_alt_ref_frame << arf_idx);
@@ -1117,11 +1137,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
         ((cpi->svc.number_temporal_layers > 1 &&
           cpi->oxcf.rc_mode == VPX_CBR) ||
          (cpi->svc.number_spatial_layers > 1 &&
-          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
-         (is_two_pass_svc(cpi) &&
-          cpi->svc.encode_empty_frame_state == ENCODING &&
-          cpi->svc.layer_context[0].frames_from_key_frame <
-              cpi->svc.number_temporal_layers + 1))) {
+          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
       found = 0;
     } else if (cfg != NULL) {
       found =
@@ -1153,8 +1169,10 @@ static void write_profile(BITSTREAM_PROFILE profile,
     case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break;
     case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break;
    case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break;
-    case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break;
-    default: assert(0);
+    default:
+      assert(profile == PROFILE_3);
+      vpx_wb_write_literal(wb, 6, 3);
+      break;
   }
 }
@@ -1191,7 +1209,13 @@ static void write_uncompressed_header(VP9_COMP *cpi,

   write_profile(cm->profile, wb);

-  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  // Whether to use show_existing_frame.
+  vpx_wb_write_bit(wb, cm->show_existing_frame);
+  if (cm->show_existing_frame) {
+    vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3);
+    return;
+  }
+
   vpx_wb_write_bit(wb, cm->frame_type);
   vpx_wb_write_bit(wb, cm->show_frame);
   vpx_wb_write_bit(wb, cm->error_resilient_mode);
@@ -1201,14 +1225,6 @@ static void write_uncompressed_header(VP9_COMP *cpi,
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
   } else {
-    // In spatial svc if it's not error_resilient_mode then we need to code all
-    // visible frames as invisible. But we need to keep the show_frame flag so
-    // that the publisher could know whether it is supposed to be visible.
-    // So we will code the show_frame flag as it is. Then code the intra_only
-    // bit here. This will make the bitstream incompatible. In the player we
-    // will change to show_frame flag to 0, then add an one byte frame with
-    // show_existing_frame flag which tells the decoder which frame we want to
-    // show.
     if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only);

     if (!cm->error_resilient_mode)
@@ -1340,7 +1356,20 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   struct vpx_write_bit_buffer wb = { data, 0 };
   struct vpx_write_bit_buffer saved_wb;

+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_reset_write();
+#endif
+
   write_uncompressed_header(cpi, &wb);
+
+  // Skip the rest of the coding process if show_existing_frame is used.
+  if (cpi->common.show_existing_frame) {
+    uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+    data += uncompressed_hdr_size;
+    *size = data - dest;
+    return;
+  }
+
   saved_wb = wb;
   vpx_wb_write_literal(&wb, 0, 16);  // first partition size not known yet
diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.h b/libs/libvpx/vp9/encoder/vp9_bitstream.h
index 339c3fecb1..208651dc22 100644
--- a/libs/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/libs/libvpx/vp9/encoder/vp9_bitstream.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VP9_ENCODER_VP9_BITSTREAM_H_
-#define VP9_ENCODER_VP9_BITSTREAM_H_
+#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_
+#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_

 #ifdef __cplusplus
 extern "C" {
@@ -38,16 +38,12 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);

 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
-  return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
-         cpi->rc.is_src_frame_alt_ref &&
-         (!cpi->use_svc ||  // Add spatial svc base layer case here
-          (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id == 0 &&
-           cpi->svc.layer_context[0].gold_ref_idx >= 0 &&
-           cpi->oxcf.ss_enable_auto_arf[0]));
+  return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref &&
+         !cpi->use_svc;
 }

 #ifdef __cplusplus
 }  // extern "C"
 #endif

-#endif  // VP9_ENCODER_VP9_BITSTREAM_H_
+#endif  // VPX_VP9_ENCODER_VP9_BITSTREAM_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_block.h b/libs/libvpx/vp9/encoder/vp9_block.h
index 724205dd57..37a4605ad8 100644
--- a/libs/libvpx/vp9/encoder/vp9_block.h
+++ b/libs/libvpx/vp9/encoder/vp9_block.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VP9_ENCODER_VP9_BLOCK_H_
-#define VP9_ENCODER_VP9_BLOCK_H_
+#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
+#define VPX_VP9_ENCODER_VP9_BLOCK_H_

 #include "vpx_util/vpx_thread.h"
@@ -34,8 +34,8 @@ struct macroblock_plane {
   struct buf_2d src;

   // Quantizer settings
+  DECLARE_ALIGNED(16, int16_t, round_fp[8]);
   int16_t *quant_fp;
-  int16_t *round_fp;
   int16_t *quant;
   int16_t *quant_shift;
   int16_t *zbin;
@@ -92,6 +92,8 @@ struct macroblock {
   int sadperbit4;
   int rddiv;
   int rdmult;
+  int cb_rdmult;
+  int segment_id;
   int mb_energy;

   // These are set to their default values at the beginning, and then adjusted
@@ -115,6 +117,12 @@ struct macroblock {
   int *nmvsadcost_hp[2];
   int **mvsadcost;

+  // sharpness is used to disable skip mode and change rd_mult
+  int sharpness;
+
+  // aq mode is used to adjust rd based on segment.
+  int adjust_rdmult_by_segment;
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   MvLimits mv_limits;
@@ -180,6 +188,8 @@ struct macroblock {

   int sb_pickmode_part;

+  int zero_temp_sad_source;
+
   // For each superblock: saves the content value (e.g., low/high sad/sumdiff)
   // based on source sad, prior to encoding the frame.
   uint8_t content_state_sb;
@@ -199,10 +209,13 @@ struct macroblock {
   void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd);
 #endif
+  DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]);
+
+  struct scale_factors *me_sf;
 };

 #ifdef __cplusplus
 }  // extern "C"
 #endif

-#endif  // VP9_ENCODER_VP9_BLOCK_H_
+#endif  // VPX_VP9_ENCODER_VP9_BLOCK_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_blockiness.c b/libs/libvpx/vp9/encoder/vp9_blockiness.c
index 9ab57b57c7..da68a3c3c3 100644
--- a/libs/libvpx/vp9/encoder/vp9_blockiness.c
+++ b/libs/libvpx/vp9/encoder/vp9_blockiness.c
@@ -11,6 +11,7 @@

 #include "vpx/vpx_integer.h"
 #include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_blockiness.h"

 static int horizontal_filter(const uint8_t *s) {
   return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
diff --git a/libs/libvpx/vp9/encoder/vp9_blockiness.h b/libs/libvpx/vp9/encoder/vp9_blockiness.h
new file mode 100644
index 0000000000..e840cb2518
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_blockiness.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                          const uint8_t *img2, int img2_pitch, int width,
+                          int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.c b/libs/libvpx/vp9/encoder/vp9_context_tree.c
index 2f7e544332..b74b9027ca 100644
--- a/libs/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/libs/libvpx/vp9/encoder/vp9_context_tree.c
@@ -12,7 +12,10 @@
 #include "vp9/encoder/vp9_encoder.h"

 static const BLOCK_SIZE square[] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+  BLOCK_8X8,
+  BLOCK_16X16,
+  BLOCK_32X32,
+  BLOCK_64X64,
 };

 static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
@@ -136,17 +139,22 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
 }

 void vp9_free_pc_tree(ThreadData *td) {
-  const int tree_nodes = 64 + 16 + 4 + 1;
   int i;

-  // Set up all 4x4 mode contexts
-  for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+  if (td == NULL) return;

-  // Sets up all the leaf nodes in the tree.
-  for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+  if (td->leaf_tree != NULL) {
+    // Free all 4x4 mode contexts
+    for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+    vpx_free(td->leaf_tree);
+    td->leaf_tree = NULL;
+  }

-  vpx_free(td->pc_tree);
-  td->pc_tree = NULL;
-  vpx_free(td->leaf_tree);
-  td->leaf_tree = NULL;
+  if (td->pc_tree != NULL) {
+    const int tree_nodes = 64 + 16 + 4 + 1;
+    // Free all the tree contexts.
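+    // (tree_nodes = one context per square block in a 64x64 superblock:
+    // 64 8x8 + 16 16x16 + 4 32x32 + 1 64x64 = 85.)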
+ for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + vpx_free(td->pc_tree); + td->pc_tree = NULL; + } } diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.h b/libs/libvpx/vp9/encoder/vp9_context_tree.h index 73423c0758..4e301cc17d 100644 --- a/libs/libvpx/vp9/encoder/vp9_context_tree.h +++ b/libs/libvpx/vp9/encoder/vp9_context_tree.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_ -#define VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_block.h" @@ -56,6 +56,7 @@ typedef struct { // scope of refactoring. int rate; int64_t dist; + int64_t rdcost; #if CONFIG_VP9_TEMPORAL_DENOISING unsigned int newmv_sse; @@ -75,6 +76,8 @@ typedef struct { // Used for the machine learning-based early termination int32_t sum_y_eobs; + // Skip certain ref frames during RD search of rectangular partitions. + uint8_t skip_ref_frame_mask; } PICK_MODE_CONTEXT; typedef struct PC_TREE { @@ -88,6 +91,9 @@ typedef struct PC_TREE { struct PC_TREE *split[4]; PICK_MODE_CONTEXT *leaf_split[4]; }; + // Obtained from a simple motion search. Used by the ML based partition search + // speed feature. + MV mv; } PC_TREE; void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td); @@ -97,4 +103,4 @@ void vp9_free_pc_tree(struct ThreadData *td); } // extern "C" #endif -#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */ +#endif // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_cost.h b/libs/libvpx/vp9/encoder/vp9_cost.h index 70a1a2e0e9..638d72a916 100644 --- a/libs/libvpx/vp9/encoder/vp9_cost.h +++ b/libs/libvpx/vp9/encoder/vp9_cost.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_COST_H_ -#define VP9_ENCODER_VP9_COST_H_ +#ifndef VPX_VP9_ENCODER_VP9_COST_H_ +#define VPX_VP9_ENCODER_VP9_COST_H_ #include "vpx_dsp/prob.h" #include "vpx/vpx_integer.h" @@ -55,4 +55,4 @@ void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_COST_H_ +#endif // VPX_VP9_ENCODER_VP9_COST_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_dct.c b/libs/libvpx/vp9/encoder/vp9_dct.c index 5c66562a56..2f42c6afc2 100644 --- a/libs/libvpx/vp9/encoder/vp9_dct.c +++ b/libs/libvpx/vp9/encoder/vp9_dct.c @@ -554,109 +554,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, } } -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, - tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - int eob = -1; - - int i, j; - tran_low_t intermediate[64]; - - (void)iscan; - - // Transform columns - { - tran_low_t *output = intermediate; - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 - tran_high_t t0, t1, t2, t3; // needs32 - tran_high_t x0, x1, x2, x3; // canbe16 - - int i; - for (i = 0; i < 8; i++) { - // stage 1 - s0 = (input[0 * stride] + input[7 * stride]) * 4; - s1 = (input[1 * stride] + input[6 * stride]) * 4; - s2 = (input[2 * stride] + input[5 * stride]) * 4; - s3 = (input[3 * stride] + input[4 * stride]) * 4; - s4 = (input[3 * stride] - input[4 * stride]) * 4; - s5 = (input[2 * stride] - input[5 * stride]) * 4; - s6 = (input[1 * stride] - input[6 * stride]) * 4; - s7 = (input[0 * stride] - input[7 * stride]) * 4; - - // fdct4(step, step); - x0 = s0 + s3; - x1 = s1 + s2; - x2 = s1 - s2; - x3 = s0 - s3; - t0 = (x0 + x1) * cospi_16_64; - t1 = (x0 - x1) * cospi_16_64; - t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0 * 8] = (tran_low_t)fdct_round_shift(t0); - output[2 * 8] = (tran_low_t)fdct_round_shift(t2); - output[4 * 8] = (tran_low_t)fdct_round_shift(t1); - output[6 * 8] = (tran_low_t)fdct_round_shift(t3); - - // Stage 2 - t0 = (s6 - s5) * cospi_16_64; - t1 = (s6 + s5) * cospi_16_64; - t2 = fdct_round_shift(t0); - t3 = fdct_round_shift(t1); - - // Stage 3 - x0 = s4 + t2; - x1 = s4 - t2; - x2 = s7 - t3; - x3 = s7 + t3; - - // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1 * 8] = (tran_low_t)fdct_round_shift(t0); - output[3 * 8] = (tran_low_t)fdct_round_shift(t2); - output[5 * 8] = (tran_low_t)fdct_round_shift(t1); - output[7 * 8] = (tran_low_t)fdct_round_shift(t3); - input++; - output++; - } - } - - // Rows - for (i = 0; i < 8; ++i) { - fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]); - for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2; - } - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
- for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; - - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - - if (tmp) eob = i; - } - } - *eob_ptr = eob + 1; -} - void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.c b/libs/libvpx/vp9/encoder/vp9_denoiser.c index b08ccaa66c..2885223b59 100644 --- a/libs/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libs/libvpx/vp9/encoder/vp9_denoiser.c @@ -189,7 +189,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, - int use_svc, int spatial_layer) { + int use_svc, int spatial_layer, int use_gf_temporal_ref) { const int sse_diff = (ctx->newmv_sse == UINT_MAX) ? 0 : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); @@ -201,7 +201,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( int i; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; - RefBuffer *saved_block_refs[2]; + const RefBuffer *saved_block_refs[2]; MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; @@ -219,8 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. - if (frame != INTRA_FRAME && frame != ALTREF_FRAME && - (frame != GOLDEN_FRAME || num_spatial_layers == 1) && + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; @@ -230,7 +229,9 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. - if (num_spatial_layers > 1 || frame == ALTREF_FRAME || + if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || + frame == ALTREF_FRAME || + (frame == GOLDEN_FRAME && use_gf_temporal_ref) || (frame != LAST_FRAME && ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || denoiser->denoising_level >= kDenHigh))) { @@ -261,6 +262,14 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoise_layer_idx = num_spatial_layers - spatial_layer - 1; } + // Force copy (no denoise, copy source in denoised buffer) if + // running_avg_y[frame] is NULL. 
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -326,7 +335,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation( void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, - VP9_DENOISER_DECISION *denoiser_decision) { + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref) { int mv_col, mv_row; int motion_magnitude = 0; int zeromv_filter = 0; @@ -349,6 +359,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, int is_skin = 0; int increase_denoising = 0; int consec_zeromv = 0; + int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; motion_magnitude = mv_row * mv_row + mv_col * mv_col; @@ -379,7 +390,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, // zero/small motion in skin detection is high, i.e, > 4). if (consec_zeromv < 4) { i = ymis; - j = xmis; + break; } } } @@ -392,12 +403,18 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) + // Copy block if LAST_FRAME is not a reference. + // Last doesn't always exist when SVC layers are dynamically changed, e.g. top + // spatial layer doesn't have last reference when it's brought up for the + // first time on the fly. + if (last_is_reference && denoiser->denoising_level >= kDenLow && + !ctx->sb_skip_denoising) decision = perform_motion_compensation( &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, - cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id, + use_gf_temporal_ref); if (decision == FILTER_BLOCK) { decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, @@ -445,16 +462,16 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, } void vp9_denoiser_update_frame_info( - VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, - int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, - int svc_base_is_key, int second_spatial_layer) { + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) { const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or // if the just encoded frame was resized. For SVC, copy source if the base // spatial layer was key frame. 
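 // [Editor's note: illustrative, not part of the patch.] Buffer layout
 // assumed by the indexing in this function: INTRA_FRAME is 0, so slot
 // [0 + shift] of running_avg_y holds the just-denoised current frame, and
 // encoder frame-buffer index fb_idx maps to slot [fb_idx + 1 + shift];
 // shift selects the second spatial layer's bank of num_ref_frames slots.
 // As a hypothetical helper:
 //
 //   static YV12_BUFFER_CONFIG *denoiser_slot(VP9_DENOISER *d, int fb_idx,
 //                                            int second_spatial_layer) {
 //     const int shift = second_spatial_layer ? d->num_ref_frames : 0;
 //     return &d->running_avg_y[fb_idx + 1 + shift];  // +1 skips INTRA slot
 //   }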
if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || - svc_base_is_key) { + svc_refresh_denoiser_buffers) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME for (i = 1; i < denoiser->num_ref_frames; ++i) { @@ -465,32 +482,43 @@ void vp9_denoiser_update_frame_info( return; } - // If more than one refresh occurs, must copy frame buffer. - if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { - if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) + copy_frame(&denoiser->running_avg_y[i + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { - if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); + // If more than one refresh occurs, must copy frame buffer. + if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > + 1) { + if (refresh_alt_ref_frame) { + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } } } } @@ -539,26 +567,38 @@ static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, } int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, - int svc_buf_shift, int refresh_alt, - int refresh_gld, int refresh_lst, int alt_fb_idx, - int gld_fb_idx, int lst_fb_idx) { + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { int fail = 0; - if (refresh_alt) { - // Increase the frame buffer index by 1 to map it to the buffer index in the - // denoiser. 
- fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, - alt_fb_idx + 1 + svc_buf_shift); - if (fail) return 1; - } - if (refresh_gld) { - fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, - gld_fb_idx + 1 + svc_buf_shift); - if (fail) return 1; - } - if (refresh_lst) { - fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, - lst_fb_idx + 1 + svc_buf_shift); - if (fail) return 1; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + i + 1 + svc_buf_shift); + } + } + } else { + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in + // the denoiser. + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } } return 0; } @@ -648,9 +688,10 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, make_grayscale(&denoiser->running_avg_y[i]); #endif denoiser->frame_buffer_initialized = 1; - denoiser->denoising_level = kDenLow; - denoiser->prev_denoising_level = kDenLow; + denoiser->denoising_level = kDenMedium; + denoiser->prev_denoising_level = kDenMedium; denoiser->reset = 0; + denoiser->current_denoiser_frame = 0; return 0; } @@ -675,13 +716,29 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { vpx_free_frame_buffer(&denoiser->last_source); } -void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { +static void force_refresh_longterm_ref(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // If long term reference is used, force refresh of that slot, so + // denoiser buffer for long term reference stays in sync. + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->refresh_alt_ref_frame = 1; + } +} + +void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) { + VP9_DENOISER *const denoiser = &cpi->denoiser; denoiser->denoising_level = noise_level; if (denoiser->denoising_level > kDenLowLow && - denoiser->prev_denoising_level == kDenLowLow) + denoiser->prev_denoising_level == kDenLowLow) { denoiser->reset = 1; - else + force_refresh_longterm_ref(cpi); + } else { denoiser->reset = 0; + } denoiser->prev_denoising_level = denoiser->denoising_level; } @@ -713,6 +770,56 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, return threshold; } +void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) { + if (vp9_denoise_svc_non_key(cpi) && + cpi->denoiser.current_denoiser_frame == 0) { + cpi->denoiser.reset = 1; + force_refresh_longterm_ref(cpi); + } +} + +void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->denoiser.denoising_level > kDenLowLow) { + int svc_refresh_denoiser_buffers = 0; + int denoise_svc_second_layer = 0; + FRAME_TYPE frame_type = cm->intra_only ? 
KEY_FRAME : cm->frame_type; + cpi->denoiser.current_denoiser_frame++; + if (cpi->use_svc) { + const int svc_buf_shift = + svc->number_spatial_layers - svc->spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc_refresh_denoiser_buffers = + lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; + denoise_svc_second_layer = + svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; + // Check if we need to allocate extra buffers in the denoiser + // for refreshed frames. + if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); + } + vp9_denoiser_update_frame_info( + &cpi->denoiser, *cpi->Source, svc, frame_type, + cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers, + denoise_svc_second_layer); + } +} + #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { int r, c; diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.h b/libs/libvpx/vp9/encoder/vp9_denoiser.h index f4da24cbf6..1973e98988 100644 --- a/libs/libvpx/vp9/encoder/vp9_denoiser.h +++ b/libs/libvpx/vp9/encoder/vp9_denoiser.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_DENOISER_H_ -#define VP9_ENCODER_DENOISER_H_ +#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_ +#define VPX_VP9_ENCODER_VP9_DENOISER_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -50,6 +50,7 @@ typedef struct vp9_denoiser { int reset; int num_ref_frames; int num_layers; + unsigned int current_denoiser_frame; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -70,14 +71,15 @@ struct VP9_COMP; struct SVC; void vp9_denoiser_update_frame_info( - VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, - int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, - int svc_base_is_key, int second_spatial_layer); + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, - VP9_DENOISER_DECISION *denoiser_decision); + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref); void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); @@ -86,9 +88,9 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PICK_MODE_CONTEXT *ctx); int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, - int svc_buf_shift, int refresh_alt, - int refresh_gld, int refresh_lst, int alt_fb_idx, - int gld_fb_idx, int lst_fb_idx); + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx); int 
vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, int use_svc, int noise_sen, int width, int height, @@ -110,7 +112,9 @@ static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, void vp9_denoiser_free(VP9_DENOISER *denoiser); -void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level); +void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level); + +void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi); int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int content_state, int temporal_layer_id); @@ -119,8 +123,10 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id); +void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_DENOISER_H_ +#endif // VPX_VP9_ENCODER_VP9_DENOISER_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.c b/libs/libvpx/vp9/encoder/vp9_encodeframe.c index 682477df18..d47b411fa8 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <float.h> #include <limits.h> #include <math.h> #include <stdio.h> @@ -21,6 +22,10 @@ #include "vpx_ports/vpx_timer.h" #include "vpx_ports/system_state.h" +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG + #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" @@ -32,16 +37,21 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" - +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_360.h" #include "vp9/encoder/vp9_aq_complexity.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_partition_models.h" #include "vp9/encoder/vp9_pickmode.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" @@ -52,33 +62,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int output_enabled, int mi_row, int mi_col, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); -// Machine learning-based early termination parameters.
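// [Editor's note: illustrative sketch, not part of the patch.] The tables
// removed below drove a hand-tuned linear classifier: each feature x_i was
// normalized against train_mean/train_stdm, and the resulting score was
// compared against a threshold by the caller to terminate the partition
// search early. In v1.8.1 this scheme is superseded by the linear and
// neural-net models from vp9_partition_models.h (see ml_pruning_partition()
// later in this file). The removed decision rule, in essence:
static double linear_term_score(const double *clf, const double *z, int n) {
  double score = clf[n];  // the bias is stored after the n weights
  int i;
  for (i = 0; i < n; ++i) score += clf[i] * z[i];  // dot(weights, features)
  return score;
}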
-static const double train_mean[24] = { - 303501.697372, 3042630.372158, 24.694696, 1.392182, - 689.413511, 162.027012, 1.478213, 0.0, - 135382.260230, 912738.513263, 28.845217, 1.515230, - 544.158492, 131.807995, 1.436863, 0.0, - 43682.377587, 208131.711766, 28.084737, 1.356677, - 138.254122, 119.522553, 1.252322, 0.0 -}; - -static const double train_stdm[24] = { - 673689.212982, 5996652.516628, 0.024449, 1.989792, - 985.880847, 0.014638, 2.001898, 0.0, - 208798.775332, 1812548.443284, 0.018693, 1.838009, - 396.986910, 0.015657, 1.332541, 0.0, - 55888.847031, 448587.962714, 0.017900, 1.904776, - 98.652832, 0.016598, 1.320992, 0.0 -}; - -// Error tolerance: 0.01%-0.0.05%-0.1% -static const double classifiers[24] = { - 0.111736, 0.289977, 0.042219, 0.204765, 0.120410, -0.143863, - 0.282376, 0.847811, 0.637161, 0.131570, 0.018636, 0.202134, - 0.112797, 0.028162, 0.182450, 1.124367, 0.386133, 0.083700, - 0.050028, 0.150873, 0.061119, 0.109318, 0.127255, 0.625211 -}; - // This is used as a reference when computing the source variance for the // purpose of activity masking. // Eventually this should be replaced by custom no-reference routines, @@ -176,6 +159,7 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, } #endif // CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, const struct buf_2d *ref, int mi_row, int mi_col, @@ -204,6 +188,72 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x, else return BLOCK_8X8; } +#endif // !CONFIG_REALTIME_ONLY + +static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int segment_index) { + VP9_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + + // Initialize the segmentation index as 0. + mi->segment_id = 0; + + // Skip the rest if AQ mode is disabled. + if (!seg->enabled) return; + + switch (aq_mode) { + case CYCLIC_REFRESH_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; +#if !CONFIG_REALTIME_ONLY + case VARIANCE_AQ: + if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + cpi->force_update_segmentation || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int min_energy; + int max_energy; + // Get sub block energy range + if (bsize >= BLOCK_32X32) { + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + } else { + min_energy = bsize <= BLOCK_16X16 ? x->mb_energy + : vp9_block_energy(cpi, x, bsize); + } + mi->segment_id = vp9_vaq_segment_id(min_energy); + } else { + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + break; + case EQUATOR360_AQ: + if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) + mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); + else + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; +#endif + case LOOKAHEAD_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; + case PSNR_AQ: mi->segment_id = segment_index; break; + case PERCEPTUAL_AQ: mi->segment_id = x->segment_id; break; + default: + // NO_AQ or PSNR_AQ + break; + } + + // Set segment index from ROI map if it's enabled. 
+ if (cpi->roi.enabled) + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + + vp9_init_plane_quantizers(cpi, x); +} // Lighter version of set_offsets that only sets the mode info // pointers. @@ -217,23 +267,57 @@ static INLINE void set_mode_info_offsets(VP9_COMMON *const cm, x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); } +static void set_ssim_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const VP9_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_16X16; + const int num_8x8_w = num_8x8_blocks_wide_lookup[bsize_base]; + const int num_8x8_h = num_8x8_blocks_high_lookup[bsize_base]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + const int num_bcols = + (num_8x8_blocks_wide_lookup[bsize] + num_8x8_w - 1) / num_8x8_w; + const int num_brows = + (num_8x8_blocks_high_lookup[bsize] + num_8x8_h - 1) / num_8x8_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + assert(cpi->oxcf.tuning == VP8_TUNE_SSIM); + + for (row = mi_row / num_8x8_w; + row < num_rows && row < mi_row / num_8x8_w + num_brows; ++row) { + for (col = mi_col / num_8x8_h; + col < num_cols && col < mi_col / num_8x8_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->mi_ssim_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale); + *rdmult = VPXMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + vpx_clear_system_state(); +} + static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const struct segmentation *const seg = &cm->seg; MvLimits *const mv_limits = &x->mv_limits; set_skip_context(xd, mi_row, mi_col); set_mode_info_offsets(cm, x, xd, mi_row, mi_col); - mi = xd->mi[0]; - // Set up destination pointers. vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); @@ -255,21 +339,8 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, // R/D setup. x->rddiv = cpi->rd.RDDIV; x->rdmult = cpi->rd.RDMULT; - - // Setup segment ID. - if (seg->enabled) { - if (cpi->oxcf.aq_mode != VARIANCE_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ && - cpi->oxcf.aq_mode != EQUATOR360_AQ) { - const uint8_t *const map = - seg->update_map ? 
cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - vp9_init_plane_quantizers(cpi, x); - - x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id]; - } else { - mi->segment_id = 0; - x->encode_breakout = cpi->encode_breakout; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); } // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs() @@ -385,16 +456,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { node->split[i] = &vt->split[i].part_variances.none; break; } - case BLOCK_4X4: { + default: { v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); node->part_variances = &vt->part_variances; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; break; } - default: { - assert(0); - break; - } } } @@ -408,7 +476,8 @@ static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { static void get_variance(var *v) { v->variance = (int)(256 * (v->sum_square_error - - ((v->sum_error * v->sum_error) >> v->log2_count)) >> + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> v->log2_count); } @@ -450,7 +519,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, // No check for vert/horiz split as too few samples for variance. if (bsize == bsize_min) { // Variance already computed to set the force_split. - if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && vt.part_variances->none.variance < threshold) { @@ -460,9 +529,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } else if (bsize > bsize_min) { // Variance already computed to set the force_split. - if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); // For key frame: take split for bsize above 32X32 or very high variance. - if (cm->frame_type == KEY_FRAME && + if (frame_is_intra_only(cm) && (bsize > BLOCK_32X32 || vt.part_variances->none.variance > (threshold << 4))) { return 0; @@ -534,8 +603,9 @@ static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, int content_state) { VP9_COMMON *const cm = &cpi->common; - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int threshold_multiplier = is_key_frame ? 20 : 1; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = + is_key_frame ? 
20 : cpi->sf.variance_part_thresh_mult; int64_t threshold_base = (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); @@ -586,6 +656,7 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, } else { thresholds[1] = (5 * threshold_base) >> 1; } + if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX; } } @@ -593,7 +664,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, int content_state) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int is_key_frame = frame_is_intra_only(cm); if (sf->partition_search_type != VAR_BASED_PARTITION && sf->partition_search_type != REFERENCE_PARTITION) { return; @@ -620,6 +691,11 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000 ? (cpi->y_dequant[q][1] << 3) : 8000; + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; + } } cpi->vbp_threshold_minmax = 15 + (q >> 3); } @@ -885,13 +961,13 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, set_block_size(cpi, x, xd, mi_row, mi_col, subsize); set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -940,18 +1016,20 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; - const int row_boundary_block_scale_factor[BLOCK_SIZES] = { - 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0 - }; - const int col_boundary_block_scale_factor[BLOCK_SIZES] = { - 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0 - }; + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 1, 0, + 1, 1, 0, 1, 1, + 0, 1, 0 }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 2, 2, + 0, 2, 2, 0, 2, + 2, 0, 0 }; int start_pos; BLOCK_SIZE bsize_low; PARTITION_TYPE partition_high; if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; - if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; + if (mi_row >= svc->mi_rows[svc->spatial_layer_id - 1] || + mi_col >= svc->mi_cols[svc->spatial_layer_id - 1]) + return 0; // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. 
start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; @@ -1004,7 +1082,8 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, subsize_high); break; - case PARTITION_SPLIT: + default: + assert(partition_high == PARTITION_SPLIT); if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, mi_row_high, mi_col_high)) return 1; @@ -1020,7 +1099,6 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, mi_col_high + bs_high)) return 1; break; - default: assert(0); } } @@ -1067,13 +1145,13 @@ static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_partition_svc(cpi, subsize, mi_row, mi_col); update_partition_svc(cpi, subsize, mi_row + bs, mi_col); update_partition_svc(cpi, subsize, mi_row, mi_col + bs); update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1108,13 +1186,13 @@ static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_prev_partition_helper(cpi, subsize, mi_row, mi_col); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1206,6 +1284,7 @@ static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, cpi->content_state_sb_fd[sb_offset] = 0; } } + if (tmp_sad == 0) x->zero_temp_sad_source = 1; return tmp_sad; } @@ -1241,21 +1320,40 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, int pixels_wide = 64, pixels_high = 64; int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] }; + int force_64_split = cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe) || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->compute_source_sad_onepass && + cpi->sf.use_source_sad && !x->zero_temp_sad_source); // For the variance computation under SVC mode, we treat the frame as key if // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). - const int is_key_frame = - (cm->frame_type == KEY_FRAME || + int is_key_frame = + (frame_is_intra_only(cm) || (is_one_pass_cbr_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); // Always use 4x4 partition for key frame. - const int use_4x4_partition = cm->frame_type == KEY_FRAME; + const int use_4x4_partition = frame_is_intra_only(cm); const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3); + // For SVC: check if LAST frame is NULL or if the resolution of LAST is + // different than the current frame resolution, and if so, treat this frame + // as a key frame, for the purpose of the superblock partitioning. 
+ // LAST == NULL can happen in some cases where enhancement spatial layers are + // enabled dynamically in the stream and the only reference is the spatial + // reference (GOLDEN). + if (cpi->use_svc) { + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME); + if (ref == NULL || ref->y_crop_height != cm->height || + ref->y_crop_width != cm->width) + is_key_frame = 1; + } + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); segment_id = xd->mi[0]->segment_id; if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) @@ -1289,6 +1387,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } // If source_sad is low copy the partition without computing the y_sad. if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && + !force_64_split && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { x->sb_use_mv_part = 1; if (cpi->sf.svc_use_lowres_part && @@ -1305,6 +1404,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } else { set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state); } + // Decrease 32x32 split threshold for screen on base layer, for scene + // change/high motion frames. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == 0 && force_64_split) + thresholds[1] = 3 * thresholds[1] >> 2; // For non keyframes, disable 4x4 average for low resolution when speed = 8 threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; @@ -1317,7 +1421,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, // 5-20 for the 16x16 blocks. - force_split[0] = 0; + force_split[0] = force_64_split; if (!is_key_frame) { // In the case of spatial/temporal scalable coding, the assumption here is @@ -1333,7 +1437,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) { + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); @@ -1374,10 +1479,28 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); } else { - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); x->sb_use_mv_part = 1; x->sb_mvcol_part = mi->mv[0].as_mv.col; x->sb_mvrow_part = mi->mv[0].as_mv.row; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source && + cm->width > 640 && cm->height > 480) { + // Disable split below 16x16 block size when scroll motion (horz or + // vert) is detected. + // TODO(marpan/jianj): Improve this condition: issue is that search + // range is hard-coded/limited in vp9_int_pro_motion_estimation() so + // scroll motion may not be detected here.
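+ // [Editor's note: illustrative, not part of the patch.] Motion vectors
+ // here are in 1/8-pel units, so the condition below fires on near-pure
+ // vertical or horizontal motion of at least 6 pixels per frame (48 / 8)
+ // with at most 1 pixel (8 / 8) on the other axis, the typical scrolling
+ // pattern for screen content. As a hypothetical predicate:
+ //
+ //   static int is_scroll_mv(int row8, int col8) {  /* 1/8-pel units */
+ //     return (abs(row8) >= 48 && abs(col8) <= 8) ||
+ //            (abs(col8) >= 48 && abs(row8) <= 8);
+ //   }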
+ if (((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) || + (abs(x->sb_mvcol_part) >= 48 && abs(x->sb_mvrow_part) <= 8)) && + y_sad < 100000) { + compute_minmax_variance = 0; + thresholds[2] = INT64_MAX; + } + } } y_sad_last = y_sad; @@ -1513,9 +1636,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } - if (is_key_frame || (low_res && - vt.split[i].split[j].part_variances.none.variance > - threshold_4x4avg)) { + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -1648,11 +1771,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } - if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) { update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); } - if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); @@ -1666,6 +1789,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, return 0; } +#if !CONFIG_REALTIME_ONLY static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled) { @@ -1794,6 +1918,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, } } } +#endif // !CONFIG_REALTIME_ONLY void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { @@ -1836,20 +1961,41 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, vp9_rd_cost_init(rd_cost); } -static int set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, - int8_t segment_id) { - int segment_qindex; +#if !CONFIG_REALTIME_ONLY +static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const uint8_t *const map = + cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + vp9_init_plane_quantizers(cpi, x); vpx_clear_system_state(); - segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); + + if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) { + if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult; + } else if (aq_mode == PERCEPTUAL_AQ) { + x->rdmult = x->cb_rdmult; + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. 
+ if (cyclic_refresh_segment_id_boosted( + get_segment_id(cm, map, bsize, mi_row, mi_col))) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } else { + x->rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + } + + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } } static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + PICK_MODE_CONTEXT *ctx, int rate_in_best_rd, + int64_t dist_in_best_rd) { VP9_COMMON *const cm = &cpi->common; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; @@ -1858,6 +2004,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.aq_mode; int i, orig_rdmult; + int64_t best_rd = INT64_MAX; vpx_clear_system_state(); @@ -1914,43 +2061,11 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; } - if (aq_mode == VARIANCE_AQ) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - - if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || - cpi->force_update_segmentation || - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { - mi->segment_id = vp9_vaq_segment_id(energy); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == LOOKAHEAD_AQ) { - const uint8_t *const map = cpi->segmentation_map; - - // I do not change rdmult here consciously. - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else if (aq_mode == EQUATOR360_AQ) { - if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) { - mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == COMPLEXITY_AQ) { - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == CYCLIC_REFRESH_AQ) { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - // If segment is boosted, use rdmult for that segment. - if (cyclic_refresh_segment_id_boosted( - get_segment_id(cm, map, bsize, mi_row, mi_col))) - x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode); + if (rate_in_best_rd < INT_MAX && dist_in_best_rd < INT64_MAX) { + best_rd = vp9_calculate_rd_cost(x->rdmult, x->rddiv, rate_in_best_rd, + dist_in_best_rd); } // Find best coding mode & reconstruct the MB so it is available @@ -1979,15 +2094,19 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } - x->rdmult = orig_rdmult; - // TODO(jingning) The rate-distortion optimization flow needs to be // refactored to provide proper exit/return handle. 
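 // [Editor's note: illustrative, not part of the patch.] Both the
 // vp9_calculate_rd_cost() call above and the RDCOST() macro below evaluate
 // the Lagrangian cost J = lambda * rate + dist in fixed point, with
 // x->rdmult standing in for lambda and x->rddiv carrying the distortion
 // scaling; the exact bit-shifts are elided in this sketch:
 //
 //   static int64_t rd_sketch(int rdmult, int rate, int64_t dist) {
 //     return (int64_t)rdmult * rate + dist;  /* scaling omitted */
 //   }
 //
 // The search keeps whichever candidate yields the smaller J.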
- if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + if (rd_cost->rate == INT_MAX || rd_cost->dist == INT64_MAX) + rd_cost->rdcost = INT64_MAX; + else + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + + x->rdmult = orig_rdmult; ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; } +#endif // !CONFIG_REALTIME_ONLY static void update_stats(VP9_COMMON *cm, ThreadData *td) { const MACROBLOCK *x = &td->mb; @@ -2013,8 +2132,10 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { [has_second_ref(mi)]++; if (has_second_ref(mi)) { - counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] - [ref0 == GOLDEN_FRAME]++; + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1]; + counts->comp_ref[ctx][bit]++; } else { counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] [ref0 != LAST_FRAME]++; @@ -2046,6 +2167,7 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { } } +#if !CONFIG_REALTIME_ONLY static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -2110,6 +2232,16 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td, PICK_MODE_CONTEXT *ctx) { MACROBLOCK *const x = &td->mb; set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + + if (cpi->sf.enable_tpl_model && + (cpi->oxcf.aq_mode == NO_AQ || cpi->oxcf.aq_mode == PERCEPTUAL_AQ)) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + x->rdmult = x->cb_rdmult; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } + } + update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled); encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); @@ -2168,7 +2300,8 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->leaf_split[0]); @@ -2183,12 +2316,12 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#endif // !CONFIG_REALTIME_ONLY // Check to see if the given partition size is allowed for a specified number // of 8x8 block rows and columns remaining in the image. @@ -2393,17 +2526,15 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, *(xd->mi[0]) = ctx->mic; *(x->mbmi_ext) = ctx->mbmi_ext; - if (seg->enabled && cpi->oxcf.aq_mode != NO_AQ) { - // For in frame complexity AQ or variance AQ, copy segment_id from - // segmentation_map. - if (cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) { + if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) { + // Setting segmentation map for cyclic_refresh. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip, p); + } else { const uint8_t *const map = seg->update_map ? 
cpi->segmentation_map : cm->last_frame_seg_map; mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else { - // Setting segmentation map for cyclic_refresh. - vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, - ctx->rate, ctx->dist, x->skip, p); } vp9_init_plane_quantizers(cpi, x); } @@ -2441,7 +2572,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } x->skip = ctx->skip; - x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0]; + x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, @@ -2509,7 +2640,8 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->split[0]); @@ -2520,13 +2652,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, subsize, pc_tree->split[3]); break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#if !CONFIG_REALTIME_ONLY static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, @@ -2595,7 +2727,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, mi_col + (mi_step >> 1) < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, ctx, - INT64_MAX); + INT_MAX, INT64_MAX); pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2614,11 +2746,12 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize, - ctx, INT64_MAX); + ctx, INT_MAX, INT64_MAX); break; case PARTITION_HORZ: + pc_tree->horizontal[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, &pc_tree->horizontal[0], INT64_MAX); + subsize, &pc_tree->horizontal[0], INT_MAX, INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; @@ -2626,8 +2759,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, - &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX); + &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, + INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2638,8 +2773,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, } break; case PARTITION_VERT: + pc_tree->vertical[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, &pc_tree->vertical[0], INT64_MAX); + subsize, &pc_tree->vertical[0], INT_MAX, INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; @@ -2647,9 +2783,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, 
vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), - &tmp_rdc, subsize, - &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); + pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; + rd_pick_sb_modes( + cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT_MAX, INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2659,10 +2796,11 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.rdcost += tmp_rdc.rdcost; } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, pc_tree->leaf_split[0], INT64_MAX); + subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX); break; } last_part_rdc.rate = 0; @@ -2689,7 +2827,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.dist += tmp_rdc.dist; } break; - default: assert(0); break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2727,7 +2864,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, split_subsize, &pc_tree->split[i]->none, - INT64_MAX); + INT_MAX, INT64_MAX); restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2961,6 +3098,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, *min_bs = min_size; *max_bs = max_size; } +#endif // !CONFIG_REALTIME_ONLY static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); @@ -2975,15 +3113,15 @@ const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4 }; const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4 }; -const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { - 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120 -}; -const int qindex_split_threshold_lookup[BLOCK_SIZES] = { - 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120 -}; -const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6 -}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0, 10, 10, 30, 40, + 40, 60, 80, 80, 90, + 100, 100, 120 }; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0, 3, 3, 7, 15, + 15, 30, 40, 40, 60, + 80, 80, 120 }; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, + 4, 4, 6 }; typedef enum { MV_ZERO = 0, @@ -3018,14 +3156,60 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif -// Calculate the score used in machine-learning based partition search early -// termination. -static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize) { - const double *clf; - const double *mean; - const double *sd; +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. 
+static void nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + const float *input_nodes = features; + + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + int layer, node, i; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (layer = 0; layer < num_layers; ++layer) { + const float *weights = nn_config->weights[layer]; + const float *bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (node = 0; node < num_output_nodes; ++node) { + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + val += bias[node]; + // ReLU as activation function. + val = VPXMAX(val, 0.0f); + output_nodes[node] = val; + weights += num_input_nodes; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + { + const float *weights = nn_config->weights[num_layers]; + for (node = 0; node < nn_config->num_outputs; ++node) { + const float *bias = nn_config->bias[num_layers]; + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + output[node] = val + bias[node]; + weights += num_input_nodes; + } + } +} + +#if !CONFIG_REALTIME_ONLY +#define FEATURES 7 +// Machine-learning based partition search early termination. +// Return 1 to skip split and rect partitions. +static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { const int mag_mv = abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); const int left_in_image = !!xd->left_mi; @@ -3035,11 +3219,32 @@ static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, int above_par = 0; // above_partitioning int left_par = 0; // left_partitioning int last_par = 0; // last_partitioning - BLOCK_SIZE context_size; - double score; int offset = 0; + int i; + BLOCK_SIZE context_size; + const NN_CONFIG *nn_config = NULL; + const float *mean, *sd, *linear_weights; + float nn_score, linear_score; + float features[FEATURES]; assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + vpx_clear_system_state(); + + switch (bsize) { + case BLOCK_64X64: + offset = 0; + nn_config = &vp9_partition_nnconfig_64x64; + break; + case BLOCK_32X32: + offset = 8; + nn_config = &vp9_partition_nnconfig_32x32; + break; + case BLOCK_16X16: + offset = 16; + nn_config = &vp9_partition_nnconfig_16x16; + break; + default: assert(0 && "Unexpected block size."); return 0; + } if (above_in_image) { context_size = xd->above_mi->sb_type; @@ -3065,36 +3270,550 @@ static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, last_par = 1; } - if (bsize == BLOCK_64X64) - offset = 0; - else if (bsize == BLOCK_32X32) - offset = 8; - else if (bsize == BLOCK_16X16) - offset = 16; + mean = &vp9_partition_feature_mean[offset]; + sd = &vp9_partition_feature_std[offset]; + features[0] = ((float)ctx->rate - mean[0]) / sd[0]; + features[1] = ((float)ctx->dist - mean[1]) / sd[1]; + features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2]; + features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3]; + features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4]; + features[5] = ((float)cm->base_qindex - mean[5]) * sd[5]; + 
features[6] = ((float)last_par - mean[6]) * sd[6]; - // early termination score calculation - clf = &classifiers[offset]; - mean = &train_mean[offset]; - sd = &train_stdm[offset]; - score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + - clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + - clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + - clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) + - clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + - clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) + - clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7]; - return score; + // Predict using linear model. + linear_weights = &vp9_partition_linear_weights[offset]; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + if (linear_score > 0.1f) return 0; + + // Predict using neural net model. + nn_predict(features, nn_config, &nn_score); + + if (linear_score < -0.0f && nn_score < 0.1f) return 1; + if (nn_score < -0.0f && linear_score < 0.1f) return 1; + return 0; +} +#undef FEATURES + +#define FEATURES 4 +// ML-based partition search breakout. +static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_COST *const rd_cost) { + DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + const VP9_COMMON *const cm = &cpi->common; + float features[FEATURES]; + const float *linear_weights = NULL; // Linear model weights. + float linear_score = 0.0f; + const int qindex = cm->base_qindex; + const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2); + const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720; + const int resolution_ctx = is_720p_or_larger ? 1 : 0; + + switch (bsize) { + case BLOCK_64X64: + linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx]; + break; + case BLOCK_32X32: + linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx]; + break; + case BLOCK_16X16: + linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx]; + break; + case BLOCK_8X8: + linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx]; + break; + default: assert(0 && "Unexpected block size."); return 0; + } + if (!linear_weights) return 0; + + { // Generate feature values. 
+#if CONFIG_VP9_HIGHBITDEPTH + const int ac_q = + vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + int feature_index = 0; + unsigned int var, sse; + float rate_f, dist_f; + +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd); + } else { + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); +#endif + var = var >> num_pels_log2; + + vpx_clear_system_state(); + + rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX); + dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2); + rate_f = + ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + + features[feature_index++] = rate_f; + features[feature_index++] = dist_f; + features[feature_index++] = (float)var; + features[feature_index++] = (float)ac_q; + assert(feature_index == FEATURES); + } + + { // Calculate the output score. + int i; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + } + + return linear_score >= cpi->sf.rd_ml_partition.search_breakout_thresh[q_ctx]; +} +#undef FEATURES + +#define FEATURES 8 +#define LABELS 4 +static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, + const PC_TREE *const pc_tree, + int *allow_horz, int *allow_vert, + int64_t ref_rd) { + const NN_CONFIG *nn_config = NULL; + float score[LABELS] = { + 0.0f, + }; + int thresh = -1; + int i; + (void)x; + + if (ref_rd <= 0 || ref_rd > 1000000000) return; + + switch (bsize) { + case BLOCK_8X8: break; + case BLOCK_16X16: + nn_config = &vp9_rect_part_nnconfig_16; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &vp9_rect_part_nnconfig_32; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &vp9_rect_part_nnconfig_64; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[3]; + break; + default: assert(0 && "Unexpected block size."); return; + } + if (!nn_config || thresh < 0) return; + + // Feature extraction and model score calculation. 
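The feature-extraction block for ml_prune_rect_partition continues below. Worth noting about ml_pruning_partition() above: pruning only fires when two independent models broadly concur, a cheap linear score gates the decision, and the neural net must agree before split and rect search are skipped. A compact sketch of just that decision rule, with made-up stand-in weights (the real vp9_partition_linear_weights tables are trained offline) and the NN score passed in as a plain float rather than computed:

#include <stdio.h>

#define FEATURES 7

/* Illustrative stand-in for the linear weights table; entry FEATURES is
 * the bias term, matching the patch's layout. */
static const float toy_linear_weights[FEATURES + 1] = {
  0.2f, -0.1f, 0.05f, 0.3f, -0.25f, 0.15f, 0.1f, /* per-feature weights */
  -0.05f                                         /* bias term */
};

/* nn_score stands in for the nn_predict() output in the patch. */
static int prune_partition(const float *features, float nn_score) {
  float linear_score = toy_linear_weights[FEATURES]; /* bias */
  int i;
  for (i = 0; i < FEATURES; ++i)
    linear_score += toy_linear_weights[i] * features[i];

  /* Mirror of the patched decision rule (the -0.0f comparisons are kept
   * verbatim from the patch; they behave like 0.0f). Prune only when the
   * linear model and the neural net roughly agree. */
  if (linear_score > 0.1f) return 0;
  if (linear_score < -0.0f && nn_score < 0.1f) return 1;
  if (nn_score < -0.0f && linear_score < 0.1f) return 1;
  return 0;
}

int main(void) {
  const float features[FEATURES] = { -0.4f, -0.3f, 0.0f, 0.1f,
                                     -0.2f, 0.0f,  0.5f };
  printf("prune = %d\n", prune_partition(features, -0.2f)); /* prints 1 */
  return 0;
}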
+ { + const VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + int feature_index = 0; + float features[FEATURES]; + + features[feature_index++] = logf((float)dc_q + 1.0f); + features[feature_index++] = + (float)(pc_tree->partitioning == PARTITION_NONE); + features[feature_index++] = logf((float)ref_rd / bs / bs + 1.0f); + + { + const float norm_factor = 1.0f / ((float)ref_rd + 1.0f); + const int64_t none_rdcost = pc_tree->none.rdcost; + float rd_ratio = 2.0f; + if (none_rdcost > 0 && none_rdcost < 1000000000) + rd_ratio = (float)none_rdcost * norm_factor; + features[feature_index++] = VPXMIN(rd_ratio, 2.0f); + + for (i = 0; i < 4; ++i) { + const int64_t this_rd = pc_tree->split[i]->none.rdcost; + const int rd_valid = this_rd > 0 && this_rd < 1000000000; + // Ratio between sub-block RD and whole block RD. + features[feature_index++] = + rd_valid ? (float)this_rd * norm_factor : 1.0f; + } + } + + assert(feature_index == FEATURES); + nn_predict(features, nn_config, score); + } + + // Make decisions based on the model score. + { + int max_score = -1000; + int horz = 0, vert = 0; + int int_score[LABELS]; + for (i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = VPXMAX(int_score[i], max_score); + } + thresh = max_score - thresh; + for (i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) horz = 1; + if ((i >> 1) & 1) vert = 1; + } + } + *allow_horz = *allow_horz && horz; + *allow_vert = *allow_vert && vert; + } +} +#undef FEATURES +#undef LABELS + +// Perform fast and coarse motion search for the given block. This is a +// pre-processing step for the ML based partition search speedup. +static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV ref_mv, MV_REFERENCE_FRAME ref, + uint8_t *const pred_buf) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref); + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; + MV best_mv = { 0, 0 }; + int cost_list[5]; + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ref - 1].sf); + mi->ref_frame[0] = ref; + mi->ref_frame[1] = NONE; + mi->sb_type = bsize; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, + &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); } +// Use a neural net model to prune partition-none and partition-split search. +// Features used: QP; spatial block size contexts; variance of prediction +// residue after simple_motion_search. 
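The core feature in the function that follows is the ratio between each quarter block's residue variance and the whole block's: ratios near 1.0 say the block is homogeneous, strongly uneven ratios say a split is likely to pay off. A toy illustration of that computation on a synthetic 8x8 residue, using plain scalar variance where libvpx uses its optimized vf() kernels:

#include <stdio.h>

#define BS 8 /* toy block size; the patch operates on 8x8..64x64 blocks */

/* Mean-removed variance of a w x h region in an 8-bit residue buffer. */
static double region_var(const unsigned char *buf, int stride, int w, int h) {
  long sum = 0, sse = 0;
  int r, c;
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      const int v = buf[r * stride + c];
      sum += v;
      sse += v * v;
    }
  return (double)sse / (w * h) -
         ((double)sum / (w * h)) * ((double)sum / (w * h));
}

int main(void) {
  unsigned char residue[BS * BS];
  double var;
  int i, r, c;

  /* Synthetic residue: one noisy quadrant, three flat ones. */
  for (r = 0; r < BS; ++r)
    for (c = 0; c < BS; ++c)
      residue[r * BS + c] = (r < BS / 2 && c < BS / 2)
                                ? (unsigned char)((r * 37 + c * 11) & 63)
                                : 4;

  var = region_var(residue, BS, BS, BS);
  for (i = 0; i < 4; ++i) {
    /* Same quadrant indexing as the patch: bit 0 is x, bit 1 is y. */
    const int x0 = (i & 1) * BS / 2, y0 = (i >> 1) * BS / 2;
    const double sub = region_var(residue + y0 * BS + x0, BS, BS / 2, BS / 2);
    const double ratio = (var == 0) ? 1.0 : sub / var; /* patch's guard */
    printf("quadrant %d: var ratio %.3f\n", i, ratio);
  }
  /* Strongly uneven ratios hint at PARTITION_SPLIT; near-1.0 ratios hint
   * at keeping the block whole. */
  return 0;
}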
+#define FEATURES 12 +static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + const VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; +#if CONFIG_VP9_HIGHBITDEPTH + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); + uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? (CONVERT_TO_BYTEPTR(pred_buffer)) + : pred_buffer; +#else + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]); + uint8_t *const pred_buf = pred_buffer; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int speed = cpi->oxcf.speed; + float thresh = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_part_split_nnconfig_64; + thresh = speed > 0 ? 2.8f : 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_part_split_nnconfig_32; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_part_split_nnconfig_16; + thresh = speed > 0 ? 3.8f : 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_part_split_nnconfig_8; + if (cm->width >= 720 && cm->height >= 720) + thresh = speed > 0 ? 2.5f : 2.0f; + else + thresh = speed > 0 ? 3.8f : 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + MV ref_mv; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + // If bsize is 64x64, use zero MV as reference; otherwise, use MV result + // of previous(larger) block as reference. + if (bsize == BLOCK_64X64) + ref_mv.row = ref_mv.col = 0; + else + ref_mv = pc_tree->mv; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf); + pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv; + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)dc_q + 1.0f); + + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + const MACROBLOCKD *const xd = &x->e_mbd; + const int has_above = !!xd->above_mi; + const int has_left = !!xd->left_mi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? 
xd->left_mi->sb_type : bsize; + int i; + + features[feature_idx++] = (float)has_above; + features[feature_idx++] = (float)b_width_log2_lookup[above_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[above_bsize]; + features[feature_idx++] = (float)has_left; + features[feature_idx++] = (float)b_width_log2_lookup[left_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[left_bsize]; + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. + if (score > thresh) *none = 0; + if (score < -thresh) *split = 0; + } +} +#undef FEATURES +#endif // !CONFIG_REALTIME_ONLY + +static double log_wiener_var(int64_t wiener_variance) { + return log(1.0 + wiener_variance) / log(2.0); +} + +static void build_kmeans_segmentation(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + BLOCK_SIZE bsize = BLOCK_64X64; + KMEANS_DATA *kmeans_data; + + vp9_disable_segmentation(&cm->seg); + if (cm->show_frame) { + int mi_row, mi_col; + cpi->kmeans_data_size = 0; + cpi->kmeans_ctr_num = 8; + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = VPXMIN( + (mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = VPXMIN( + (mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col; + int64_t wiener_variance = 0; + + for (row = mb_row_start; row < mb_row_end; ++row) + for (col = mb_col_start; col < mb_col_end; ++col) + wiener_variance += cpi->mb_wiener_variance[row * cm->mb_cols + col]; + + wiener_variance /= + (mb_row_end - mb_row_start) * (mb_col_end - mb_col_start); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + + kmeans_data = &cpi->kmeans_data_arr[cpi->kmeans_data_size++]; + kmeans_data->value = log_wiener_var(wiener_variance); + kmeans_data->pos = mi_row * cpi->kmeans_data_stride + mi_col; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + } + } + + vp9_kmeans(cpi->kmeans_ctr_ls, cpi->kmeans_boundary_ls, + cpi->kmeans_count_ls, cpi->kmeans_ctr_num, cpi->kmeans_data_arr, + cpi->kmeans_data_size); + + vp9_perceptual_aq_mode_setup(cpi, &cm->seg); + } +} + +#if !CONFIG_REALTIME_ONLY +static int wiener_var_segment(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *cm = &cpi->common; + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = + VPXMIN((mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = + 
VPXMIN((mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col, idx; + int64_t wiener_variance = 0; + int segment_id; + int8_t seg_hist[MAX_SEGMENTS] = { 0 }; + int8_t max_count = 0, max_index = -1; + + vpx_clear_system_state(); + + assert(cpi->norm_wiener_variance > 0); + + for (row = mb_row_start; row < mb_row_end; ++row) { + for (col = mb_col_start; col < mb_col_end; ++col) { + wiener_variance = cpi->mb_wiener_variance[row * cm->mb_cols + col]; + segment_id = + vp9_get_group_idx(log_wiener_var(wiener_variance), + cpi->kmeans_boundary_ls, cpi->kmeans_ctr_num); + ++seg_hist[segment_id]; + } + } + + for (idx = 0; idx < cpi->kmeans_ctr_num; ++idx) { + if (seg_hist[idx] > max_count) { + max_count = seg_hist[idx]; + max_index = idx; + } + } + + assert(max_index >= 0); + segment_id = max_index; + + return segment_id; +} + +static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + const int gf_group_index = cpi->twopass.gf_group.index; + TplDepFrame *tpl_frame = &cpi->tpl_stats[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + int mi_wide = num_8x8_blocks_wide_lookup[bsize]; + int mi_high = num_8x8_blocks_high_lookup[bsize]; + int row, col; + + int dr = 0; + int count = 0; + double r0, rk, beta; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + + if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult; + + if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult; + + for (row = mi_row; row < mi_row + mi_high; ++row) { + for (col = mi_col; col < mi_col + mi_wide; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + + if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue; + + intra_cost += this_stats->intra_cost; + mc_dep_cost += this_stats->mc_dep_cost; + + ++count; + } + } + + vpx_clear_system_state(); + + r0 = cpi->rd.r0; + rk = (double)intra_cost / mc_dep_cost; + beta = r0 / rk; + dr = vp9_get_adaptive_rdmult(cpi, beta); + + dr = VPXMIN(dr, orig_rdmult * 3 / 2); + dr = VPXMAX(dr, orig_rdmult * 1 / 2); + + dr = VPXMAX(1, dr); + + return dr; +} +#endif // !CONFIG_REALTIME_ONLY + +#if !CONFIG_REALTIME_ONLY // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. 
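One detail of get_rdmult_delta() above worth spelling out: the TPL statistics yield beta = r0 / rk (the frame-level versus block-level ratio of intra cost to motion-compensated dependency cost), a new multiplier is derived from beta, and the result is clamped to at most plus or minus 50% of the original rdmult. A sketch of that arithmetic, with vp9_get_adaptive_rdmult() stubbed out by an assumed proportional mapping (the real function uses its own derivation):

#include <stdio.h>

#define TOY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define TOY_MAX(a, b) ((a) > (b) ? (a) : (b))

/* Assumed stand-in for vp9_get_adaptive_rdmult(), purely illustrative. */
static int toy_adaptive_rdmult(int base_rdmult, double beta) {
  return (int)(base_rdmult * beta);
}

/* Mirrors the clamping in get_rdmult_delta(): whatever the adaptive
 * mapping returns is limited to [orig/2, 3*orig/2] and floored at 1. */
static int rdmult_delta(int orig_rdmult, double r0, double rk) {
  const double beta = r0 / rk;
  int dr = toy_adaptive_rdmult(orig_rdmult, beta);
  dr = TOY_MIN(dr, orig_rdmult * 3 / 2);
  dr = TOY_MAX(dr, orig_rdmult * 1 / 2);
  return TOY_MAX(1, dr);
}

int main(void) {
  printf("%d\n", rdmult_delta(128, 0.8, 0.4)); /* beta 2.0 -> clamped to 192 */
  printf("%d\n", rdmult_delta(128, 0.4, 0.8)); /* beta 0.5 -> 64 */
  return 0;
}

The reworked rd_pick_partition(), which consumes this multiplier through x->cb_rdmult, follows.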
-static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE bsize, - RD_COST *rd_cost, int64_t best_rd, - PC_TREE *pc_tree) { +static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_COST *rd_cost, RD_COST best_rdc, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -3102,11 +3821,11 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; - PICK_MODE_CONTEXT *ctx = &pc_tree->none; + PICK_MODE_CONTEXT *const ctx = &pc_tree->none; int i; const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); BLOCK_SIZE subsize; - RD_COST this_rdc, sum_rdc, best_rdc; + RD_COST this_rdc, sum_rdc; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; INTERP_FILTER pred_interp_filter; @@ -3133,24 +3852,35 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; + int must_split = 0; + int should_encode_sb = 0; + + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + uint8_t ref_frames_used[4] = { 0, 0, 0, 0 }; + + int partition_mul = x->cb_rdmult; (void)*tp_orig; assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - // Adjust dist breakout threshold according to the partition size. dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; vp9_rd_cost_init(&this_rdc); vp9_rd_cost_init(&sum_rdc); - vp9_rd_cost_reset(&best_rdc); - best_rdc.rdcost = best_rd; set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &partition_mul); + } + vp9_rd_cost_update(partition_mul, x->rddiv, &best_rdc); + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ) x->mb_energy = vp9_block_energy(cpi, x, bsize); @@ -3165,10 +3895,18 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } + // Get sub block energy range + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + must_split = (min_energy < -3) && (max_energy - min_energy > 2); + } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_none_allowed &= (bsize <= max_size); partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); partition_vert_allowed &= @@ -3177,7 +3915,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (cpi->sf.use_square_partition_only && - bsize > cpi->sf.use_square_only_threshold) { + (bsize > cpi->sf.use_square_only_thresh_high || + bsize < cpi->sf.use_square_only_thresh_low)) { if (cpi->use_svc) { if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) partition_horz_allowed &= force_horz_split; @@ -3250,48 +3989,84 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } #endif + pc_tree->partitioning = PARTITION_NONE; + + if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) { + const int do_rd_ml_partition_var_pruning = + partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; + if (do_rd_ml_partition_var_pruning) { + ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + } else { + vp9_zero(pc_tree->mv); + } + if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. + for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv; + } + } + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, - best_rdc.rdcost); + best_rdc.rate, best_rdc.dist); + ctx->rdcost = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = ctx->mic.ref_frame[0]; + const int ref2 = ctx->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } if (bsize >= BLOCK_8X8) { this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; - this_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); } if (this_rdc.rdcost < best_rdc.rdcost) { MODE_INFO *mi = xd->mi[0]; best_rdc = this_rdc; + should_encode_sb = 1; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!cpi->sf.ml_partition_search_early_termination) { - // If all y, u, v transform blocks in this partition are skippable, - // and the dist & rate are within the thresholds, the partition search - // is terminated for current branch of the partition search tree. - if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; - } - } else { + if (cpi->sf.rd_ml_partition.search_early_termination) { // Currently, the machine-learning based partition search early // termination is only used while bsize is 16x16, 32x32 or 64x64, // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. 
if (!x->e_mbd.lossless && !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { - if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) { + if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) { do_split = 0; do_rect = 0; } } } + if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) { + const int use_ml_based_breakout = + cpi->sf.rd_ml_partition.search_breakout && cm->base_qindex >= 100; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) { + do_split = 0; + do_rect = 0; + } + } else { + if (!cpi->sf.rd_ml_partition.search_early_termination) { + if ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_split = 0; + do_rect = 0; + } + } + } + } + #if CONFIG_FP_MB_STATS // Check if every 16x16 first pass block statistics has zero // motion and the corresponding first pass residue is small enough. @@ -3341,10 +4116,13 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; } // store estimated motion vector - if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx); + store_pred_mv(x, ctx); // If the interp_filter is marked as SWITCHABLE_FILTERS, it was for an // intra block and used for context purposes. @@ -3357,113 +4135,184 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_SPLIT // TODO(jingning): use the motion vectors given by the above search as // the starting point of motion search in the following partition type check. - if (do_split) { + pc_tree->split[0]->none.rdcost = 0; + pc_tree->split[1]->none.rdcost = 0; + pc_tree->split[2]->none.rdcost = 0; + pc_tree->split[3]->none.rdcost = 0; + if (do_split || must_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); + load_pred_mv(x, ctx); if (bsize == BLOCK_8X8) { i = 4; if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - pc_tree->leaf_split[0], best_rdc.rdcost); - - if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX; + pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist); + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + } } else { - for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split); + ++i) { const int x_idx = (i & 1) * mi_step; const int y_idx = (i >> 1) * mi_step; + int found_best_rd = 0; + RD_COST best_rdc_split; + vp9_rd_cost_reset(&best_rdc_split); + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { + // A must split test here increases the number of sub + // partitions but hurts metrics results quite a bit, + // so this extra test is commented out pending + // further tests on whether it adds much in terms of + // visual quality. + // (must_split) ? best_rdc.rate + // : best_rdc.rate - sum_rdc.rate, + // (must_split) ? 
best_rdc.dist + // : best_rdc.dist - sum_rdc.dist, + best_rdc_split.rate = best_rdc.rate - sum_rdc.rate; + best_rdc_split.dist = best_rdc.dist - sum_rdc.dist; + } if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); - pc_tree->split[i]->index = i; - rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, - best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->split[i]->none.rate = INT_MAX; + found_best_rd = rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, best_rdc_split, pc_tree->split[i]); - if (this_rdc.rate == INT_MAX) { + if (found_best_rd == 0) { sum_rdc.rdcost = INT64_MAX; break; } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } } - if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) { + if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) { sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); - if (sum_rdc.rdcost < best_rdc.rdcost) { + if ((sum_rdc.rdcost < best_rdc.rdcost) || + (must_split && (sum_rdc.dist < best_rdc.dist))) { best_rdc = sum_rdc; + should_encode_sb = 1; pc_tree->partitioning = PARTITION_SPLIT; // Rate and distortion based partition search termination clause. 
- if (!cpi->sf.ml_partition_search_early_termination && - !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { + if (!cpi->sf.rd_ml_partition.search_early_termination && + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { do_rect = 0; } } } else { // skip rectangular partition test when larger block size // gives better rd cost - if ((cpi->sf.less_rectangular_check) && - ((bsize > cpi->sf.use_square_only_threshold) || - (best_rdc.dist < dist_breakout_thr))) + if (cpi->sf.less_rectangular_check && + (bsize > cpi->sf.use_square_only_thresh_high || + best_rdc.dist < dist_breakout_thr)) do_rect &= !partition_none_allowed; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + uint8_t used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; + } + + { + const int do_ml_rect_partition_pruning = + !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split && + (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8; + if (do_ml_rect_partition_pruning) { + ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed, + &partition_vert_allowed, best_rdc.rdcost); + } + } + // PARTITION_HORZ if (partition_horz_allowed && (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ]; subsize = get_subsize(bsize, PARTITION_HORZ); - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->horizontal[0], best_rdc.rdcost); + &pc_tree->horizontal[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, subsize, &pc_tree->horizontal[1], - best_rdc.rdcost - sum_rdc.rdcost); + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); if (this_rdc.rate == 
INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_HORZ; + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_HORZ; - if ((cpi->sf.less_rectangular_check) && - (bsize > cpi->sf.use_square_only_threshold)) - do_rect = 0; - } + if (cpi->sf.less_rectangular_check && + bsize > cpi->sf.use_square_only_thresh_high) + do_rect = 0; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } @@ -3471,56 +4320,52 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT]; subsize = get_subsize(bsize, PARTITION_VERT); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->vertical[0], best_rdc.rdcost); + &pc_tree->vertical[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) { update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, &pc_tree->vertical[0]); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, subsize, &pc_tree->vertical[1], - best_rdc.rdcost - sum_rdc.rdcost); + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_VERT; - } + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_VERT; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } - // TODO(jbb): This code added so that we avoid static analysis - // warning related to the fact that best_rd isn't used after this - // point. This code should be refactored so that the duplicate - // checks occur in some sub function and thus are used... 
- (void)best_rd; *rd_cost = best_rdc; - if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && - pc_tree->index != 3) { + if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); @@ -3533,6 +4378,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } else { assert(tp_orig == *tp); } + + return should_encode_sb; } static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, @@ -3564,10 +4411,12 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, RD_COST dummy_rdc; int i; int seg_skip = 0; + int orig_rdmult = cpi->rd.RDMULT; const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + vp9_rd_cost_reset(&dummy_rdc); (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); @@ -3582,7 +4431,10 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } } - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } td->pc_root->index = 0; if (seg->enabled) { @@ -3593,6 +4445,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } x->source_variance = UINT_MAX; + + x->cb_rdmult = orig_rdmult; + if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { const BLOCK_SIZE bsize = seg_skip ? BLOCK_64X64 : sf->always_this_block_size; @@ -3613,19 +4468,33 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); } else { + if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) { + int dr = + get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult); + x->cb_rdmult = dr; + } + + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ && cm->show_frame) { + x->segment_id = wiener_var_segment(cpi, BLOCK_64X64, mi_row, mi_col); + x->cb_rdmult = vp9_compute_rd_mult( + cpi, vp9_get_qindex(&cm->seg, x->segment_id, cm->base_qindex)); + } + // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } + td->pc_root->none.rdcost = 0; rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rdc, INT64_MAX, td->pc_root); + &dummy_rdc, dummy_rdc, td->pc_root); } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); } } +#endif // !CONFIG_REALTIME_ONLY static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->td.mb; @@ -3703,6 +4572,36 @@ static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); } +static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF) + vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); + else if (bsize >= BLOCK_8X8) + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + else + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + 
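An easy-to-miss change in encode_rd_sb_row() above: x->pred_mv is now initialized to INT16_MAX per reference frame instead of being zeroed, so later code can tell "no prediction cached for this reference" apart from a legitimate zero motion vector. A hedged sketch of that sentinel pattern with toy types; the real consumers of the sentinel live in the pick-mode code:

#include <stdint.h>
#include <stdio.h>

#define MAX_REF_FRAMES 4
#define MV_UNSET INT16_MAX /* sentinel: no cached prediction for this ref */

typedef struct {
  int16_t row, col;
} TOY_MV;

static void reset_pred_mv(TOY_MV pred_mv[MAX_REF_FRAMES]) {
  int i;
  for (i = 0; i < MAX_REF_FRAMES; ++i) {
    pred_mv[i].row = MV_UNSET;
    pred_mv[i].col = MV_UNSET;
  }
}

/* A zero MV is a perfectly valid cached prediction; only the sentinel
 * means "fall back to a fresh search". row and col are set together. */
static int have_pred_mv(const TOY_MV *mv) { return mv->row != MV_UNSET; }

int main(void) {
  TOY_MV pred_mv[MAX_REF_FRAMES];
  reset_pred_mv(pred_mv);
  printf("before: %d\n", have_pred_mv(&pred_mv[1])); /* 0 */
  pred_mv[1].row = 0; /* cache a genuine zero MV */
  pred_mv[1].col = 0;
  printf("after:  %d\n", have_pred_mv(&pred_mv[1])); /* 1 */
  return 0;
}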
+static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3718,6 +4617,9 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int plane; set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + mi = xd->mi[0]; mi->sb_type = bsize; @@ -3733,14 +4635,23 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, if (cyclic_refresh_segment_id_boosted(mi->segment_id)) x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) + hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); - else if (bsize >= BLOCK_8X8) - vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); - else + else if (bsize >= BLOCK_8X8) { + if (cpi->rc.hybrid_intra_scene_change) + hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); + else + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + } else { vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); @@ -3830,6 +4741,76 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { } } +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + + switch (bsize) { + case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break; + case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break; + case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break; + case BLOCK_8X8: break; + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + vpx_clear_system_state(); + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 8 * (mi_row & 7); + const int sb_offset_col = 8 * (mi_col & 7); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. 
+ const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + + assert(feature_idx == FEATURES); + nn_predict(features, nn_config, score); + if (score[0] > thresh) return PARTITION_SPLIT; + if (score[0] < -thresh) return PARTITION_NONE; + return -1; + } +} +#undef FEATURES +#undef LABELS + static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -3859,6 +4840,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; + const int use_ml_based_partitioning = + sf->partition_search_type == ML_BASED_PARTITION; + (void)*tp_orig; // Avoid checking for rectangular partitions for speed >= 6. @@ -3889,6 +4873,18 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, partition_vert_allowed &= force_vert_split; } + if (use_ml_based_partitioning) { + if (partition_none_allowed || do_split) do_rect = 0; + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } + } + + if (!partition_none_allowed && !do_split) do_rect = 1; + ctx->pred_pixel_ready = !(partition_vert_allowed || partition_horz_allowed || do_split); @@ -3902,26 +4898,25 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, ctx->skip = x->skip; if (this_rdc.rate != INT_MAX) { - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; - int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; - - dist_breakout_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - rate_breakout_thr *= num_pels_log2_lookup[bsize]; - best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && - this_rdc.dist < dist_breakout_thr) { - do_split = 0; - do_rect = 0; + if (!use_ml_based_partitioning) { + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { + do_split = 0; + do_rect = 0; + } 
} } } @@ -3969,7 +4964,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->horizontal[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->horizontal[0]); @@ -4013,7 +5008,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->vertical[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->vertical[0]); @@ -4173,7 +5168,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, @@ -4203,7 +5199,6 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, rd_cost->dist += this_rdc.dist; } break; - default: assert(0 && "Invalid partition type."); break; } } @@ -4292,7 +5287,8 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, output_enabled, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, @@ -4313,13 +5309,110 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, dummy_cost, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock. +static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad = UINT_MAX; + + assert(yv12 != NULL); + + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. 
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NONE; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} + static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { @@ -4350,6 +5443,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; BLOCK_SIZE bsize = BLOCK_64X64; int seg_skip = 0; + int i; (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); @@ -4359,7 +5453,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } x->source_variance = UINT_MAX; - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } vp9_rd_cost_init(&dummy_rdc); x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; @@ -4367,6 +5464,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->skip_low_source_sad = 0; x->lowvar_highsumdiff = 0; x->content_state_sb = 0; + x->zero_temp_sad_source = 0; x->sb_use_mv_part = 0; x->sb_mvcol_part = 0; x->sb_mvrow_part = 0; @@ -4406,6 +5504,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; + case ML_BASED_PARTITION: + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + 
x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + x->sb_pickmode_part = 1; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); + break; case SOURCE_VAR_BASED_PARTITION: set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col); nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, @@ -4417,14 +5524,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; - case REFERENCE_PARTITION: + default: + assert(partition_search_type == REFERENCE_PARTITION); x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) @@ -4440,7 +5548,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, // TODO(marpan): Seems like nonrd_select_partition does not support // 4x4 partition. Since 4x4 is used on key frame, use this switch // for now. - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); else @@ -4449,7 +5557,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } break; - default: assert(0); break; } // Update ref_frame usage for inter frame if this group is ARF group.
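Summing up the ML_BASED_PARTITION path wired in above: ml_predict_var_paritioning() reduces the choice to one scalar NN score against a symmetric threshold, returning PARTITION_SPLIT, PARTITION_NONE, or -1 for "no opinion, evaluate both". The shape of that decision, with the patch's threshold values and a stand-in score in place of the nn_predict() output:

#include <stdio.h>

enum { TOY_PARTITION_NONE, TOY_PARTITION_SPLIT, TOY_NO_DECISION = -1 };

/* Same decision shape as the patch: threshold 1.25 for speed <= 5, and 0
 * above that, so faster speeds accept any nonzero leaning of the score. */
static int partition_from_score(float score, int speed) {
  const float thresh = (speed <= 5) ? 1.25f : 0.0f;
  if (score > thresh) return TOY_PARTITION_SPLIT;
  if (score < -thresh) return TOY_PARTITION_NONE;
  return TOY_NO_DECISION; /* evaluate both candidates normally */
}

int main(void) {
  printf("%d\n", partition_from_score(1.5f, 5));  /* 1: force split */
  printf("%d\n", partition_from_score(-2.0f, 5)); /* 0: force none */
  printf("%d\n", partition_from_score(0.4f, 5));  /* -1: undecided */
  return 0;
}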
@@ -4516,16 +5623,12 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" - " or VPX_BITS_12"); - return -1; } } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, @@ -4620,8 +5723,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); - CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * - sizeof(*cpi->tile_data))); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; for (tile_row = 0; tile_row < tile_rows; ++tile_row) @@ -4632,6 +5736,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; +#if CONFIG_CONSISTENT_RECODE + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; +#endif tile_data->mode_map[i][j] = j; } } @@ -4645,6 +5752,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; TileInfo *tile_info = &this_tile->tile_info; + if (cpi->sf.adaptive_rd_thresh_row_mt && + this_tile->row_base_thresh_freq_fact == NULL) + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; @@ -4675,8 +5785,10 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, if (cpi->sf.use_nonrd_pick_mode) encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#if !CONFIG_REALTIME_ONLY else encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); +#endif cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok; cpi->tplist[tile_row][tile_col][tile_sb_row].count = @@ -4729,16 +5841,117 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, } #endif +static int compare_kmeans_data(const void *a, const void *b) { + if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) { + return 1; + } else if (((const KMEANS_DATA *)a)->value < + ((const KMEANS_DATA *)b)->value) { + return -1; + } else { + return 0; + } +} + +static void compute_boundary_ls(const double *ctr_ls, int k, + double *boundary_ls) { + // boundary_ls[j] is the upper bound of data centered at ctr_ls[j] + int j; + for (j = 0; j < k - 1; ++j) { + boundary_ls[j] = (ctr_ls[j] + ctr_ls[j + 1]) / 2.; + } + boundary_ls[k - 1] = DBL_MAX; +} + +int vp9_get_group_idx(double value, double *boundary_ls, int k) { + int group_idx = 0; + while (value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + return group_idx; +} + +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + KMEANS_DATA *arr, int size) { + int i, j; + int itr; + int group_idx; + double sum[MAX_KMEANS_GROUPS]; + int count[MAX_KMEANS_GROUPS]; + + vpx_clear_system_state(); + + assert(k >= 2 && k <= MAX_KMEANS_GROUPS); + + qsort(arr, size, sizeof(*arr), compare_kmeans_data); + + // initialize the center points + for (j = 0; j < k; ++j) { + ctr_ls[j] 
= arr[(size * (2 * j + 1)) / (2 * k)].value; + } + + for (itr = 0; itr < 10; ++itr) { + compute_boundary_ls(ctr_ls, k, boundary_ls); + for (i = 0; i < MAX_KMEANS_GROUPS; ++i) { + sum[i] = 0; + count[i] = 0; + } + + // Both the data and centers are sorted in ascending order. + // As each data point is processed in order, its corresponding group index + // can only increase. So we only need to reset the group index to zero here. + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + // place samples into clusters + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + sum[group_idx] += arr[i].value; + ++count[group_idx]; + } + + for (group_idx = 0; group_idx < k; ++group_idx) { + if (count[group_idx] > 0) + ctr_ls[group_idx] = sum[group_idx] / count[group_idx]; + + sum[group_idx] = 0; + count[group_idx] = 0; + } + } + + // compute group_idx, boundary_ls and count_ls + for (j = 0; j < k; ++j) { + count_ls[j] = 0; + } + compute_boundary_ls(ctr_ls, k, boundary_ls); + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + arr[i].group_idx = group_idx; + ++count_ls[group_idx]; + } +} + static void encode_frame_internal(VP9_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + const int gf_group_index = cpi->twopass.gf_group.index; xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; - vp9_zero(*td->counts); vp9_zero(cpi->td.rd_counts); @@ -4756,8 +5969,12 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; - +#if CONFIG_CONSISTENT_RECODE + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; +#endif if (xd->lossless) x->optimize = 0; + x->sharpness = cpi->oxcf.sharpness; + x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); cm->tx_mode = select_tx_mode(cpi, xd); @@ -4799,8 +6016,33 @@ static void encode_frame_internal(VP9_COMP *cpi) { if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) source_var_based_partition_search_method(cpi); + } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE && + cpi->sf.enable_tpl_model) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + if (tpl_frame->is_valid) + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + // Frame segmentation + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); + { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); @@ -4881,9 +6123,52 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } +#if CONFIG_CONSISTENT_RECODE +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = + tile_data->thresh_freq_fact_prev[i][j]; + } + } + } + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} +#endif + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_CONSISTENT_RECODE + restore_encode_params(cpi); +#endif + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(MAX_MB_PLANE); +#endif + // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this @@ -4891,16 +6176,11 @@ void vp9_encode_frame(VP9_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
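For context on the change just below: the open-coded sign-bias test is folded into two helpers. A sketch of the semantics they presumably implement (inferred from the comment above and the removed code, not verbatim library source): compound prediction is possible only when the three references do not all share one sign bias, and the reference whose bias differs becomes the fixed compound reference.

static int compound_reference_allowed_sketch(const VP9_COMMON *cm) {
  /* Allowed iff some reference disagrees with LAST's sign bias. */
  MV_REFERENCE_FRAME ref;
  for (ref = GOLDEN_FRAME; ref <= ALTREF_FRAME; ++ref)
    if (cm->ref_frame_sign_bias[ref] != cm->ref_frame_sign_bias[LAST_FRAME])
      return 1;
  return 0;
}

static void setup_compound_reference_mode_sketch(VP9_COMMON *cm) {
  /* The odd one out becomes the fixed reference; the agreeing pair are
     the variable references. */
  if (cm->ref_frame_sign_bias[LAST_FRAME] ==
      cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
    cm->comp_fixed_ref = ALTREF_FRAME;
    cm->comp_var_ref[0] = LAST_FRAME;
    cm->comp_var_ref[1] = GOLDEN_FRAME;
  } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
             cm->ref_frame_sign_bias[ALTREF_FRAME]) {
    cm->comp_fixed_ref = GOLDEN_FRAME;
    cm->comp_var_ref[0] = LAST_FRAME;
    cm->comp_var_ref[1] = ALTREF_FRAME;
  } else {
    cm->comp_fixed_ref = LAST_FRAME;
    cm->comp_var_ref[0] = GOLDEN_FRAME;
    cm->comp_var_ref[1] = ALTREF_FRAME;
  }
}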
if (!frame_is_intra_only(cm)) { - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { - cpi->allow_comp_inter_inter = 0; - } else { + if (vp9_compound_reference_allowed(cm)) { cpi->allow_comp_inter_inter = 1; - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; + vp9_setup_compound_reference_mode(cm); + } else { + cpi->allow_comp_inter_inter = 0; } } @@ -5064,7 +6344,8 @@ static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi, for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { int map_offset = block_index + y * cm->mi_cols + x; - if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { if (abs(mv.row) < 8 && abs(mv.col) < 8) { if (cpi->consec_zero_mv[map_offset] < 255) cpi->consec_zero_mv[map_offset]++; @@ -5131,7 +6412,27 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, VPXMAX(bsize, BLOCK_8X8)); - vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8)); +#if CONFIG_MISMATCH_DEBUG + if (output_enabled) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif + + vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8), mi_row, mi_col, output_enabled); vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip, VPXMAX(bsize, BLOCK_8X8)); } @@ -5159,7 +6460,11 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); - if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0) + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); } } diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.h b/libs/libvpx/vp9/encoder/vp9_encodeframe.h index cf5ae3d8ac..fd0a9c517e 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_ -#define VP9_ENCODER_VP9_ENCODEFRAME_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ #include "vpx/vpx_integer.h" @@ -45,8 +45,13 @@ void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, int content_state); +struct KMEANS_DATA; +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + struct KMEANS_DATA *arr, int size); +int vp9_get_group_idx(double value, double *boundary_ls, int k); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.c b/libs/libvpx/vp9/encoder/vp9_encodemb.c index f3c17f2559..7630a81103 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libs/libvpx/vp9/encoder/vp9_encodemb.c @@ -16,6 +16,10 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" @@ -50,12 +54,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, + { 10, 6 }, + { 8, 5 }, }; // 'num' can be negative, but 'shift' must be non-negative. #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ - ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)) + (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))) int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { @@ -76,13 +81,19 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; + const MODE_INFO *mbmi = xd->mi[0]; + const int sharpness = mb->sharpness; + const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]; const int64_t rdmult = - ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; + (sharpness == 0 ? rdadj >> 1 + : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4); + const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; int64_t rate0, rate1; int16_t t0, t1; int i, final_eob; + int count_high_values_after_eob = 0; #if CONFIG_VP9_HIGHBITDEPTH const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else @@ -200,9 +211,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int band_next = band_translate[i + 1]; const int token_next = (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; - unsigned int( - *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - token_costs + band_next; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; token_cache[rc] = vp9_pt_energy_class[t0]; ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x == 0); @@ -262,6 +273,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, assert(distortion0 <= distortion_for_zero); token_cache[rc] = vp9_pt_energy_class[t0]; } + if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++; assert(accu_error >= 0); x_prev = qcoeff[rc]; // Update based on selected quantized value. 
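The sharpness plumbing added above changes the trellis in two ways: the lambda used to weigh rate against distortion is rescaled, and the count_high_values_after_eob counter incremented just above feeds the EOB decision in the next hunk, where any surviving coefficient with magnitude greater than 1 past the chosen EOB vetoes the truncation. A small self-contained sketch of the lambda rescaling only, mirroring the expression added above with illustrative inputs:

#include <inttypes.h>
#include <stdio.h>

/* sharpness == 0 keeps the historical rdadj / 2; sharpness 1..7 shrinks
   the rate weight so trellis keeps more high-frequency coefficients. */
static int64_t effective_rdmult(int64_t rdadj, int sharpness, int segment_id) {
  return sharpness == 0 ? rdadj >> 1
                        : (rdadj * (8 - sharpness + segment_id)) >> 4;
}

int main(void) {
  int s;
  for (s = 0; s <= 7; ++s)
    printf("sharpness=%d -> lambda=%" PRId64 "\n", s,
           effective_rdmult(1024, s, 0));
  return 0; /* prints 512, 448, 384, 320, 256, 192, 128, 64 */
}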
@@ -272,6 +284,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (best_eob_cost_cur < best_block_rd_cost) { best_block_rd_cost = best_eob_cost_cur; final_eob = i + 1; + count_high_values_after_eob = 0; if (use_x1) { before_best_eob_qc = x1; before_best_eob_dqc = dqc1; @@ -283,19 +296,31 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, } } } - assert(final_eob <= eob); - if (final_eob > 0) { - int rc; - assert(before_best_eob_qc != 0); - i = final_eob - 1; - rc = scan[i]; - qcoeff[rc] = before_best_eob_qc; - dqcoeff[rc] = before_best_eob_dqc; - } - for (i = final_eob; i < eob; i++) { - int rc = scan[i]; - qcoeff[rc] = 0; - dqcoeff[rc] = 0; + if (count_high_values_after_eob > 0) { + final_eob = eob - 1; + for (; final_eob >= 0; final_eob--) { + const int rc = scan[final_eob]; + const int x = qcoeff[rc]; + if (x) { + break; + } + } + final_eob++; + } else { + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } } mb->plane[plane].eobs[block] = final_eob; return final_eob; @@ -357,13 +382,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -383,17 +408,19 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->iscan); break; case TX_8X8: - vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block, - p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_fdct8x8(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); + break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } @@ -433,13 +460,13 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); } return; } @@ -461,12 +488,12 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); break; } } @@ -510,14 +537,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int 
col, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -543,19 +570,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } static void encode_block(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; +#if CONFIG_MISMATCH_DEBUG + int mi_row = args->mi_row; + int mi_col = args->mi_col; + int output_enabled = args->output_enabled; +#endif MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; @@ -572,7 +604,11 @@ static void encode_block(int plane, int block, int row, int col, if (x->zcoeff_blk[tx_size][block] && plane == 0) { p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } if (!x->skip_recode) { @@ -582,7 +618,11 @@ static void encode_block(int plane, int block, int row, int col, // skip forward transform p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } else { vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size); } @@ -599,7 +639,11 @@ static void encode_block(int plane, int block, int row, int col, // skip forward transform p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } } else { vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); @@ -616,7 +660,13 @@ static void encode_block(int plane, int block, int row, int col, if (p->eobs[block]) *(args->skip) = 0; - if (x->skip_encode || p->eobs[block] == 0) return; + if (x->skip_encode || p->eobs[block] == 0) { +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); @@ -633,16 +683,20 @@ static void encode_block(int plane, int block, int row, int col, vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - default: assert(0 && "Invalid transform size"); } +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -656,14 +710,27 @@ static void encode_block(int plane, int block, int row, int col, case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - default: assert(0 && "Invalid transform size"); break; } +#if CONFIG_MISMATCH_DEBUG +encode_block_end: + if (output_enabled) { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, + blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif } static void encode_block_pass1(int plane, int block, int row, int col, @@ -697,12 +764,21 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1, x); } -void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; MODE_INFO *mi = xd->mi[0]; - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; int plane; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args arg = { x, 1, NULL, NULL, + &mi->skip, mi_row, mi_col, output_enabled }; +#else + struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; + (void)mi_row; + (void)mi_col; + (void)output_enabled; +#endif mi->skip = 1; @@ -847,7 +923,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, xd->bd); } break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); @@ -875,7 +952,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } break; - default: assert(0); return; } if (*eob) *(args->skip) = 0; return; @@ -929,7 +1005,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); @@ -954,7 +1031,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; - default: assert(0); break; } if (*eob) *(args->skip) = 0; } @@ -963,8 +1039,16 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; +#if CONFIG_MISMATCH_DEBUG + // TODO(angiebird): make mismatch_debug support intra mode + struct encode_b_args arg = { + x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0, + 0 + }; +#else struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip }; +#endif if (enable_optimize_b && x->optimize && (!x->skip_recode || 
!x->skip_optimize)) { diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.h b/libs/libvpx/vp9/encoder/vp9_encodemb.h index cf943bedfd..1975ee73ac 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libs/libvpx/vp9/encoder/vp9_encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMB_H_ -#define VP9_ENCODER_VP9_ENCODEMB_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -24,10 +24,16 @@ struct encode_b_args { ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; +#if CONFIG_MISMATCH_DEBUG + int mi_row; + int mi_col; + int output_enabled; +#endif }; int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx); -void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled); void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size); @@ -48,4 +54,4 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMB_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.h b/libs/libvpx/vp9/encoder/vp9_encodemv.h index 9fc7ab8dc4..2f1be4b233 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libs/libvpx/vp9/encoder/vp9_encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMV_H_ -#define VP9_ENCODER_VP9_ENCODEMV_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_ #include "vp9/encoder/vp9_encoder.h" @@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, int usehp); + const nmv_context *ctx, int usehp); void vp9_update_mv_count(ThreadData *td); @@ -35,4 +35,4 @@ void vp9_update_mv_count(ThreadData *td); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMV_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.c b/libs/libvpx/vp9/encoder/vp9_encoder.c index 2ae59dd981..7f82a470b3 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.c +++ b/libs/libvpx/vp9/encoder/vp9_encoder.c @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include #include #include -#include +#include #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -25,31 +26,49 @@ #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_timer.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_scan.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" #include "vp9/encoder/vp9_aq_360.h" #include "vp9/encoder/vp9_aq_complexity.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_bitstream.h" +#if CONFIG_INTERNAL_STATS +#include "vp9/encoder/vp9_blockiness.h" +#endif #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" -#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_mcomp.h" +#endif #include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" @@ -65,12 +84,12 @@ #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. +#define HIGH_PRECISION_MV_QTHRESH 200 #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -84,6 +103,9 @@ static FILE *yuv_skinmap_file = NULL; #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #endif +#ifdef OUTPUT_YUV_SVC_SRC +FILE *yuv_svc_src[3] = { NULL, NULL, NULL }; +#endif #if 0 FILE *framepsnr; @@ -102,6 +124,14 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +#if CONFIG_VP9_HIGHBITDEPTH +void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif +void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); + +#if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; @@ -426,6 +456,7 @@ static int compute_context_model_diff(const VP9_COMMON *const cm) { return -diff; } +#endif // !CONFIG_REALTIME_ONLY // Test for whether to calculate metrics for the frame. 
static int is_psnr_calc_enabled(VP9_COMP *cpi) { @@ -483,15 +514,11 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { *hr = 3; *hs = 5; break; - case ONETWO: + default: + assert(mode == ONETWO); *hr = 1; *hs = 2; break; - default: - *hr = 1; - *hs = 1; - assert(0); - break; } } @@ -547,6 +574,74 @@ static void apply_active_map(VP9_COMP *cpi) { } } +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + + // TODO(jianj): Investigate why ROI not working in speed < 5 or in non + // realtime mode. + if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method; + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. + internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + valid_ref = 0; + } + // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are + // same reference. + if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + static void init_level_info(Vp9LevelInfo *level_info) { Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelSpec *const level_spec = &level_info->level_spec; @@ -557,6 +652,13 @@ static void init_level_info(Vp9LevelInfo *level_info) { level_spec->min_altref_distance = INT_MAX; } +static int check_seg_range(int seg_data[8], int range) { + return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || + abs(seg_data[2]) > range || abs(seg_data[3]) > range || + abs(seg_data[4]) > range || abs(seg_data[5]) > range || + abs(seg_data[6]) > range || abs(seg_data[7]) > range); +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { int i; const Vp9LevelSpec *this_level; @@ -583,6 +685,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? 
LEVEL_UNKNOWN : vp9_level_defs[i].level; } +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check number of rows and columns match + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return -1; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return -1; + + // Also disable segmentation if no deltas are specified. + if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return 0; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + + // Copy to ROI sturcture in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return 0; +} + int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { @@ -660,8 +817,17 @@ static void setup_frame(VP9_COMP *cpi) { if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame; } + // TODO(jingning): Overwrite the frame_context_idx index in multi-layer ARF + // case. Need some further investigation on if we could apply this to single + // layer ARF case as well. + if (cpi->multi_layer_arf && !cpi->use_svc) { + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cm->frame_context_idx = clamp(gf_group->layer_depth[gf_group->index] - 1, 0, + FRAME_CONTEXTS - 1); + } + if (cm->frame_type == KEY_FRAME) { - if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1; + cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; vp9_zero(cpi->interp_filter_selected); } else { @@ -713,12 +879,17 @@ static void vp9_enc_free_mi(VP9_COMMON *cm) { cm->mi_grid_base = NULL; vpx_free(cm->prev_mi_grid_base); cm->prev_mi_grid_base = NULL; + cm->mi_alloc_size = 0; } static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { // Current mip will be the prev_mip for the next frame. MODE_INFO **temp_base = cm->prev_mi_grid_base; MODE_INFO *temp = cm->prev_mip; + + // Skip update prev_mi frame in show_existing_frame mode. 
+ if (cm->show_existing_frame) return; + cm->prev_mip = cm->mip; cm->mip = temp; @@ -817,9 +988,18 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + vpx_free(cpi->mi_ssim_rdmult_scaling_factors); + cpi->mi_ssim_rdmult_scaling_factors = NULL; + vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC vp9_free_postproc_buffers(cm); @@ -1121,8 +1301,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a - // target of 1/4x1/4. - if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) { + // target of 1/4x1/4. number_spatial_layers must be greater than 2. + if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, @@ -1213,15 +1394,9 @@ static void set_tile_limits(VP9_COMP *cpi) { int min_log2_tile_cols, max_log2_tile_cols; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - if (is_two_pass_svc(cpi) && (cpi->svc.encode_empty_frame_state == ENCODING || - cpi->svc.number_spatial_layers > 1)) { - cm->log2_tile_cols = 0; - cm->log2_tile_rows = 0; - } else { - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; - } + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; if (cpi->oxcf.target_level == LEVEL_AUTO) { const int level_tile_cols = @@ -1244,24 +1419,17 @@ static void update_frame_size(VP9_COMP *cpi) { cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); set_tile_limits(cpi); - - if (is_two_pass_svc(cpi)) { - if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate alt_ref_buffer"); - } } static void init_buffer_indices(VP9_COMP *cpi) { - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; + int ref_frame; + + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + cpi->ref_fb_idx[ref_frame] = ref_frame; + + cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1]; + cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; } static void init_level_constraint(LevelConstraint *lc) { @@ -1610,7 +1778,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4x4d_bits10) break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, @@ -1689,11 +1858,6 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits12) break; - - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); } } } @@ -1757,6 
+1921,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { int last_w = cpi->oxcf.width; int last_h = cpi->oxcf.height; + vp9_init_quantizer(cpi); if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; cm->color_space = oxcf->color_space; @@ -2017,10 +2182,13 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); - CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); +#if !CONFIG_REALTIME_ONLY CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); +#endif CHECK_MEM_ERROR( cm, cpi->consec_zero_mv, @@ -2062,8 +2230,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif cpi->refresh_alt_ref_frame = 0; - cpi->multi_arf_last_grp_enabled = 0; - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); @@ -2104,9 +2270,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, if (cpi->b_calculate_consistency) { CHECK_MEM_ERROR(cm, cpi->ssim_vars, - vpx_malloc(sizeof(*cpi->ssim_vars) * 4 * - cpi->common.mi_rows * cpi->common.mi_cols)); + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; + } else { + cpi->ssim_vars = NULL; } #endif @@ -2141,6 +2309,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif +#ifdef OUTPUT_YUV_SVC_SRC + yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb"); + yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb"); + yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2216,8 +2389,30 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, } #endif // !CONFIG_REALTIME_ONLY - vp9_set_speed_features_framesize_independent(cpi); - vp9_set_speed_features_framesize_dependent(cpi); + cpi->mb_wiener_var_cols = 0; + cpi->mb_wiener_var_rows = 0; + cpi->mb_wiener_variance = NULL; + + vp9_set_speed_features_framesize_independent(cpi, oxcf->speed); + vp9_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + { + const int bsize = BLOCK_16X16; + const int w = num_8x8_blocks_wide_lookup[bsize]; + const int h = num_8x8_blocks_high_lookup[bsize]; + const int num_cols = (cm->mi_cols + w - 1) / w; + const int num_rows = (cm->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors, + vpx_calloc(num_rows * num_cols, + sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); + } + + cpi->kmeans_data_arr_alloc = 0; +#if CONFIG_NON_GREEDY_MV + cpi->feature_score_loc_alloc = 0; + cpi->tpl_ready = 0; +#endif // CONFIG_NON_GREEDY_MV + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; // Allocate memory to store variances for a frame. CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); @@ -2293,6 +2488,17 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_loop_filter_init(cm); + // Set up the unit scaling factor used during motion search. 
+#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.me_sf = &cpi->me_sf; + cm->error.setjmp = 0; return cpi; @@ -2307,11 +2513,15 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; - unsigned int i; + unsigned int i, frame; int t; if (!cpi) return; +#if CONFIG_INTERNAL_STATS + vpx_free(cpi->ssim_vars); +#endif + cm = &cpi->common; if (cm->current_video_frame > 0) { #if CONFIG_INTERNAL_STATS @@ -2383,7 +2593,6 @@ void vp9_remove_compressor(VP9_COMP *cpi) { fclose(f); } - #endif #if 0 @@ -2402,6 +2611,35 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_denoiser_free(&(cpi->denoiser)); #endif + if (cpi->kmeans_data_arr_alloc) { +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&cpi->kmeans_mutex); +#endif + vpx_free(cpi->kmeans_data_arr); + } + +#if CONFIG_NON_GREEDY_MV + vpx_free(cpi->feature_score_loc_arr); + vpx_free(cpi->feature_score_loc_sort); + vpx_free(cpi->feature_score_loc_heap); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int sqr_bsize; + for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]); + } + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } + for (t = 0; t < cpi->num_workers; ++t) { VPxWorker *const worker = &cpi->workers[t]; EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; @@ -2425,7 +2663,9 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); } +#if !CONFIG_REALTIME_ONLY vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); +#endif dealloc_compressor_data(cpi); @@ -2459,6 +2699,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif +#ifdef OUTPUT_YUV_SVC_SRC + fclose(yuv_svc_src[0]); + fclose(yuv_svc_src[1]); + fclose(yuv_svc_src[2]); +#endif #if 0 @@ -2707,6 +2952,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, } #endif // CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY static int scale_down(VP9_COMP *cpi, int q) { RATE_CONTROL *const rc = &cpi->rc; GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -2754,11 +3000,14 @@ static int big_rate_miss(VP9_COMP *cpi) { // test in two pass for the first static int two_pass_first_group_inter(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - if ((cpi->oxcf.pass == 2) && - (gf_group->index == gf_group->first_inter_index)) { - return 1; + if (cpi->oxcf.pass == 2) { + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int gfg_index = gf_group->index; + + if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE; + return gf_group->update_type[gfg_index - 1] != LF_UPDATE && + gf_group->update_type[gfg_index] == LF_UPDATE; } else { return 0; } @@ -2807,10 +3056,24 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, } return force_recode; } +#endif // !CONFIG_REALTIME_ONLY -void 
vp9_update_reference_frames(VP9_COMP *cpi) { +static void update_ref_frames(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->rc.show_arf_as_gld) { + int tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else if (cm->show_existing_frame) { + // Pop ARF. + cpi->lst_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. @@ -2836,23 +3099,23 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; - - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; - cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; - } } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } + int arf_idx = gf_group->top_arf_idx; + + // Push new ARF into stack. + stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx, + gf_group->stack_size); + ++gf_group->stack_size; + + assert(arf_idx < REF_FRAMES); ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); + + cpi->alt_fb_idx = arf_idx; } if (cpi->refresh_golden_frame) { @@ -2877,69 +3140,39 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } + + if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) { + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + update_ref_frames(cpi); + #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && - cpi->denoiser.denoising_level > kDenLowLow) { - int svc_base_is_key = 0; - int denoise_svc_second_layer = 0; - if (cpi->use_svc) { - int realloc_fail = 0; - const int svc_buf_shift = - cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 - ? cpi->denoiser.num_ref_frames - : 0; - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - svc_base_is_key = lc->is_key_frame; - denoise_svc_second_layer = - cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 - : 0; - // Check if we need to allocate extra buffers in the denoiser - // for - // refreshed frames. 
- realloc_fail = vp9_denoiser_realloc_svc( - cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, - cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, - cpi->gld_fb_idx, cpi->lst_fb_idx); - if (realloc_fail) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to re-allocate denoiser for SVC"); - } - vp9_denoiser_update_frame_info( - &cpi->denoiser, *cpi->Source, cpi->common.frame_type, - cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, - cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, - denoise_svc_second_layer); - } + vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) { - // Keep track of frame index for each reference frame. - SVC *const svc = &cpi->svc; - if (cm->frame_type == KEY_FRAME) { - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } else { - if (cpi->refresh_last_frame) - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - if (cpi->refresh_golden_frame) - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - if (cpi->refresh_alt_ref_frame) - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } - } + + if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; - - const int is_reference_frame = + int is_reference_frame = (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (cpi->use_svc && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + is_reference_frame = !cpi->svc.non_reference_frame; + + // Skip loop filter in show_existing_frame mode. + if (cm->show_existing_frame) { + lf->filter_level = 0; + return; + } if (xd->lossless) { lf->filter_level = 0; @@ -3066,8 +3299,8 @@ void vp9_scale_references(VP9_COMP *cpi) { if (cpi->oxcf.pass == 0 && !cpi->use_svc) { // Check for release of scaled reference. buf_idx = cpi->scaled_ref_idx[ref_frame - 1]; - buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL; - if (buf != NULL) { + if (buf_idx != INVALID_IDX) { + buf = &pool->frame_bufs[buf_idx]; --buf->ref_count; cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; } @@ -3098,22 +3331,21 @@ static void release_scaled_references(VP9_COMP *cpi) { refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height)) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } } } } else { - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < REFS_PER_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; --buf->ref_count; cpi->scaled_ref_idx[i] = INVALID_IDX; } @@ -3172,11 +3404,9 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { case VPX_BITS_10: dc_quant_devisor = 16.0; break; - case VPX_BITS_12: - dc_quant_devisor = 64.0; - break; default: - assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; break; } #else @@ -3292,7 +3522,7 @@ static void set_mv_search_params(VP9_COMP *cpi) { } static void set_size_independent_vars(VP9_COMP *cpi) { - vp9_set_speed_features_framesize_independent(cpi); + vp9_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); vp9_set_rd_speed_thresholds(cpi); vp9_set_rd_speed_thresholds_sub8x8(cpi); cpi->common.interp_filter = cpi->sf.default_interp_filter; @@ -3303,11 +3533,16 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, VP9_COMMON *const cm = &cpi->common; // Setup variables that depend on the dimensions of the frame. - vp9_set_speed_features_framesize_dependent(cpi); + vp9_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); // Decide q and q bounds. *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index); + if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) { + *q = cpi->rc.worst_quality; + cpi->rc.force_max_q = 0; + } + if (!frame_is_intra_only(cm)) { vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } @@ -3415,9 +3650,7 @@ static void set_frame_size(VP9_COMP *cpi) { #endif } - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_set_target_rate(cpi); } @@ -3464,19 +3697,76 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +#if CONFIG_CONSISTENT_RECODE +static void save_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. 
+ const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact_prev[i][j] = + tile_data->thresh_freq_fact[i][j]; + } + } + } + } +} +#endif + +static INLINE void set_raw_source_frame(VP9_COMP *cpi) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif +} + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + int q = 0, bottom_index = 0, top_index = 0; + int no_drop_scene_change = 0; const INTERP_FILTER filter_scaler = (is_one_pass_cbr_svc(cpi)) - ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP; const int phase_scaler = (is_one_pass_cbr_svc(cpi)) - ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] + ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; + if (cm->show_existing_frame) { + cpi->rc.this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return 1; + } + + svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe; + // Flag to check if its valid to compute the source sad (used for // scene detection and for superblock content state in CBR mode). // The flag may get reset below based on SVC or resizing state. @@ -3489,30 +3779,36 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && - cpi->svc.scaled_temp.y_width == cm->width << 1 && - cpi->svc.scaled_temp.y_height == cm->height << 1) { + svc->scaled_temp.y_width == cm->width << 1 && + svc->scaled_temp.y_height == cm->height << 1) { // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. 
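The two-stage path described in the comment above can be pictured with a plain 2x2 box filter (an assumption for illustration; libvpx actually uses its optimized 1:2 scaler with the per-layer filter/phase settings):

#include <stdint.h>

/* One 1:2 stage: average each 2x2 neighborhood. */
static void scale_half(const uint8_t *src, int src_w, int src_h,
                       int src_stride, uint8_t *dst, int dst_stride) {
  int r, c;
  for (r = 0; r < src_h / 2; ++r)
    for (c = 0; c < src_w / 2; ++c)
      dst[r * dst_stride + c] =
          (uint8_t)((src[2 * r * src_stride + 2 * c] +
                     src[2 * r * src_stride + 2 * c + 1] +
                     src[(2 * r + 1) * src_stride + 2 * c] +
                     src[(2 * r + 1) * src_stride + 2 * c + 1] + 2) >> 2);
}

/* Two-stage 1/4 x 1/4 downscale: full -> half -> quarter. The
   intermediate half-size plane plays the role of svc.scaled_temp above:
   it is kept so the 1/2 x 1/2 spatial layer can reuse it without
   rescaling from the full-resolution source. */
static void scale_quarter(const uint8_t *src, int w, int h,
                          uint8_t *half, uint8_t *quarter) {
  scale_half(src, w, h, w, half, w / 2);
  scale_half(half, w / 2, h / 2, w / 2, quarter, w / 4);
}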
- const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; - const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; + const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; + const int phase_scaler2 = svc->downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, + cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); - cpi->svc.scaled_one_half = 1; + svc->scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && - cpi->svc.scaled_one_half) { + svc->scaled_one_half) { // If the spatial layer is 1/2x1/2 and the scaling is already done in the // two-stage scaling, use the result directly. - cpi->Source = &cpi->svc.scaled_temp; - cpi->svc.scaled_one_half = 0; + cpi->Source = &svc->scaled_temp; + svc->scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), filter_scaler, phase_scaler); } +#ifdef OUTPUT_YUV_SVC_SRC + // Write out at most 3 spatial layers. + if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) { + vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); + } +#endif // Unfiltered raw source used in metrics calculation if the source // has been filtered. if (is_psnr_calc_enabled(cpi)) { @@ -3530,9 +3826,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } if ((cpi->use_svc && - (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || - cpi->svc.current_superframe < 1)) || + (svc->spatial_layer_id < svc->number_spatial_layers - 1 || + svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->current_superframe < 1)) || cpi->resize_pending || cpi->resize_state || cpi->external_resize || cpi->resize_state != ORIG) { cpi->compute_source_sad_onepass = 0; @@ -3562,53 +3858,102 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Last_Source->y_height != cpi->Source->y_height) cpi->compute_source_sad_onepass = 0; - if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { + if (frame_is_intra_only(cm) || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); } - vp9_update_noise_estimate(cpi); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + vp9_denoiser_reset_on_first_frame(cpi); +#endif // Scene detection is always used for VBR mode or screen-content case. // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now // (need to check encoding time cost for doing this for speed 8). 
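For orientation before the code below: a minimal sketch of the kind of SAD-based scene-change test vp9_scene_detection_onepass() performs. The 8x8 grid and the 4x running-average threshold here are illustrative choices, not the library's tuned values:

#include <stdint.h>
#include <stdlib.h>

static uint64_t sad_8x8(const uint8_t *a, const uint8_t *b, int stride) {
  uint64_t sad = 0;
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sad += (uint64_t)abs(a[r * stride + c] - b[r * stride + c]);
  return sad;
}

/* Returns 1 when the mean per-block SAD against the previous source
   jumps well above its running average, i.e. a likely scene change. */
static int scene_change_sketch(const uint8_t *cur, const uint8_t *last,
                               int w, int h, int stride, uint64_t *avg_sad) {
  uint64_t total = 0;
  int r, c, n = 0, is_cut;
  for (r = 0; r + 8 <= h; r += 8)
    for (c = 0; c + 8 <= w; c += 8, ++n)
      total += sad_8x8(cur + r * stride + c, last + r * stride + c, stride);
  if (n == 0) return 0;
  total /= n;
  is_cut = *avg_sad > 0 && total > 4 * *avg_sad;
  *avg_sad = (3 * *avg_sad + total) / 4; /* exponential running average */
  return is_cut;
}

A positive result is what rc.high_source_sad models; the CBR paths that follow use it for frame dropping and for the max-Q overshoot strategies.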
cpi->rc.high_source_sad = 0; - if (cpi->compute_source_sad_onepass && cm->show_frame && + cpi->rc.hybrid_intra_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 0; + if (cm->show_frame && cpi->oxcf.mode == REALTIME && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || - (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) vp9_scene_detection_onepass(cpi); + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + svc->high_source_sad_superframe = cpi->rc.high_source_sad; + svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion; + // On scene change reset temporal layer pattern to TL0. + // Note that if the base/lower spatial layers are skipped: instead of + // inserting base layer here, we force max-q for the next superframe + // with lower spatial layers: this is done in vp9_encodedframe_overshoot() + // when max-q is decided for the current layer. + // Only do this reset for bypass/flexible mode. + if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 && + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + // rc->high_source_sad will get reset so copy it to restore it. + int tmp_high_source_sad = cpi->rc.high_source_sad; + vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME); + cpi->rc.high_source_sad = tmp_high_source_sad; + } + } + + vp9_update_noise_estimate(cpi); + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, if base layer is key for svc, + // on scene change, or if superframe has layer sync. + if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) && + !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0])) + no_drop_scene_change = 1; + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + !frame_is_intra_only(cm) && !no_drop_scene_change && + !svc->superframe_has_layer_sync && + (!cpi->use_svc || + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). if (frame_is_intra_only(cm) == 0 && - !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) { + !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref)) { vp9_scale_references(cpi); } set_size_independent_vars(cpi); set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + // search method and step parameter might be changed in speed settings. + init_motion_estimation(cpi); + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); if (cpi->sf.svc_use_lowres_part && - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { - if (cpi->svc.prev_partition_svc == NULL) { + svc->spatial_layer_id == svc->number_spatial_layers - 2) { + if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, cpi->svc.prev_partition_svc, + cm, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, - sizeof(*cpi->svc.prev_partition_svc))); + sizeof(*svc->prev_partition_svc))); } } - if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + // TODO(jianj): Look into issue of skin detection with high bitdepth. 
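/*
 * The drop decision above (vp9_rc_drop_frame) is guarded so frames are
 * never dropped on key frames, SVC key base layers, scene changes, or
 * superframes with layer sync; the decision itself comes from
 * rate-control buffer state. As a rough model -- an assumption about the
 * policy, not the exact libvpx logic -- CBR keeps a virtual buffer that
 * is credited the per-frame bandwidth and debited each frame's actual
 * bits; when fullness sinks too low, skipping a frame (zero bits) lets
 * it recover:
 */
typedef struct {
  long buffer_level;   /* bits currently in the virtual buffer */
  long optimal_level;  /* target fullness */
  long avg_frame_bits; /* per-frame budget credited each frame */
  int drop_frames_allowed;
} SketchRc;

static int sketch_drop_frame(SketchRc *rc) {
  if (!rc->drop_frames_allowed) return 0;
  if (rc->buffer_level < rc->optimal_level / 4) { /* threshold assumed */
    rc->buffer_level += rc->avg_frame_bits; /* dropped frame costs nothing */
    return 1;
  }
  return 0;
}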
+ if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { cpi->use_skin_detection = 1; } + // Enable post encode frame dropping for CBR on non key frame, when + // ext_use_post_encode_drop is specified by user. + cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop && + cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME; + vp9_set_quantizer(cm, q); vp9_set_variance_partition_thresholds(cpi, q, 0); @@ -3616,6 +3961,34 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } + + if (cpi->rc.last_post_encode_dropped_scene_change) { + cpi->rc.high_source_sad = 1; + svc->high_source_sad_superframe = 1; + // For now disable use_source_sad since Last_Source will not be the previous + // encoded but the dropped one. + cpi->sf.use_source_sad = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + } + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { + if (vp9_encodedframe_overshoot(cpi, -1, &q)) { + vp9_set_quantizer(cm, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); + } + } + +#if !CONFIG_REALTIME_ONLY // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -3624,24 +3997,32 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_360aq_frame_setup(cpi); } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vp9_cyclic_refresh_setup(cpi); } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { // it may be pretty bad for rate-control, // and I should handle it somehow vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else { +#endif + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_setup(cpi); + } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + apply_roi_map(cpi); + } +#if !CONFIG_REALTIME_ONLY } +#endif apply_active_map(cpi); vp9_encode_frame(cpi); - // Check if we should drop this frame because of high overshoot. - // Only for frames where high temporal-source SAD is detected. - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && - cpi->resize_state == ORIG && cm->frame_type != KEY_FRAME && - cpi->oxcf.content == VP9E_CONTENT_SCREEN && - cpi->rc.high_source_sad == 1) { + // Check if we should re-encode this frame at high Q because of high + // overshoot based on the encoded frame size. Only for frames where + // high temporal-source SAD is detected. + // For SVC: all spatial layers are checked for re-encoding. + if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { int frame_size = 0; // Get an estimate of the encoded frame size. 
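/*
 * Two overshoot strategies are wired up above: FAST_DETECTION_MAXQ acts
 * before encoding (vp9_encodedframe_overshoot() is called with -1 for
 * the frame size and jumps straight to high/max Q on a detected scene
 * change), while RE_ENCODE_MAXQ measures the frame actually produced and
 * re-encodes at max Q when it overshoots. A hedged sketch of the two
 * triggers; the 4x factor is illustrative, not libvpx's threshold:
 */
static int sketch_overshoot_trigger(int fast_detection_maxq, int scene_change,
                                    long encoded_bits, long target_bits) {
  if (fast_detection_maxq) return scene_change; /* pre-encode trigger */
  return target_bits > 0 && encoded_bits > 4 * target_bits; /* post-encode */
}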
save_coding_context(cpi); @@ -3657,8 +4038,12 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); // Turn-off cyclic refresh for re-encoded frame. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; vp9_disable_segmentation(&cm->seg); } apply_active_map(cpi); @@ -3668,15 +4053,17 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Update some stats from cyclic refresh, and check for golden frame update. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cm->frame_type != KEY_FRAME) + !frame_is_intra_only(cm)) vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. // update_base_skip_probs(cpi); vpx_clear_system_state(); + return 1; } +#if !CONFIG_REALTIME_ONLY #define MAX_QSTEP_ADJ 4 static int get_qstep_adj(int rate_excess, int rate_limit) { int qstep = @@ -3703,11 +4090,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int qrange_adj = 1; #endif + if (cm->show_existing_frame) { + rc->this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return; + } + set_size_independent_vars(cpi); - enable_acl = cpi->sf.allow_acl - ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) - : 0; + enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) || + (cpi->twopass.gf_group.index == 1) + : 0; do { vpx_clear_system_state(); @@ -3796,6 +4189,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_setup_in_frame_q_adj(cpi); } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (oxcf->aq_mode == PSNR_AQ) { + vp9_psnr_aq_mode_setup(&cm->seg); } vp9_encode_frame(cpi); @@ -3900,8 +4295,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Special case if the projected size is > the max allowed. if ((q == q_high) && ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - (rc->projected_frame_size >= - big_rate_miss_high_threshold(cpi)))) { + (!rc->is_src_frame_alt_ref && + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi))))) { int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, big_rate_miss_high_threshold(cpi))); double q_val_high; @@ -4006,7 +4402,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #endif // Have we been forced to adapt Q outside the expected range by an extreme // rate miss. If so adjust the active maxQ for the subsequent frames. 
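/*
 * The recode loop here narrows in on a Q whose projected size hits the
 * frame target, adjusting q between q_low and q_high much like a
 * bisection (the real loop also adapts its bounds with rate correction
 * factors and the qstep adjustment above). A toy version of that control
 * flow, with a made-up size model standing in for a trial encode:
 */
static long sketch_projected_size(int q) { return 2000000L / (q + 1); }

static int sketch_recode_to_target(long target, int q_low, int q_high) {
  int q = (q_low + q_high) / 2;
  while (q_low < q_high) {
    const long projected = sketch_projected_size(q);
    if (projected > target + target / 10) {
      q_low = q + 1; /* overshoot: raise Q */
    } else if (projected < target - target / 10) {
      q_high = q - 1; /* undershoot: lower Q */
    } else {
      break; /* within 10 percent of target: accept */
    }
    q = (q_low + q_high) / 2;
  }
  return q;
}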
- if (q > cpi->twopass.active_worst_quality) { + if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) { cpi->twopass.active_worst_quality = q; } else if (oxcf->vbr_corpus_complexity && q == q_low && rc->projected_frame_size < rc->this_frame_target) { @@ -4028,14 +4424,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_encode_frame(cpi); vpx_clear_system_state(); restore_coding_context(cpi); - vp9_pack_bitstream(cpi, dest, size); - - vp9_encode_frame(cpi); - vpx_clear_system_state(); - - restore_coding_context(cpi); } } +#endif // !CONFIG_REALTIME_ONLY static int get_ref_frame_flags(const VP9_COMP *cpi) { const int *const map = cpi->common.ref_frame_map; @@ -4131,20 +4522,21 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( } } -static void set_arf_sign_bias(VP9_COMP *cpi) { +static void set_ref_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int arf_sign_bias; + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + const int cur_frame_index = ref_buffer->frame_index; + MV_REFERENCE_FRAME ref_frame; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); - } else { - arf_sign_bias = - (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + const RefCntBuffer *const ref_cnt_buf = + get_ref_cnt_buffer(&cpi->common, buf_idx); + if (ref_cnt_buf) { + cm->ref_frame_sign_bias[ref_frame] = + cur_frame_index < ref_cnt_buf->frame_index; + } } - cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; } static int setup_interp_filter_search_mask(VP9_COMP *cpi) { @@ -4328,6 +4720,7 @@ static void spatial_denoise_frame(VP9_COMP *cpi) { } #endif // ENABLE_KF_DENOISE +#if !CONFIG_REALTIME_ONLY static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, uint8_t *dest) { if (cpi->common.seg.enabled) @@ -4351,6 +4744,228 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, vp9_enable_segmentation(&cpi->common.seg); } } +#endif + +static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + + if (ref_buffer) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + ref_buffer->frame_index = + cm->current_video_frame + gf_group->arf_src_offset[gf_group->index]; + } +} + +// Implementation and modifications of C. Yeo, H. L. Tan, and Y. H. Tan, "On +// rate distortion optimization using SSIM," Circuits and Systems for Video +// Technology, IEEE Transactions on, vol. 23, no. 7, pp. 1170-1181, 2013. +// SSIM_VAR_SCALE defines the strength of the bias towards SSIM in RDO. 
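/*
 * set_ref_sign_bias() above replaces the old ARF-only rule with a
 * uniform per-reference test: a reference gets sign bias exactly when
 * its frame_index is ahead of the current frame, i.e. it lies in the
 * display-order future (an alt-ref). MV candidates borrowed from a
 * reference with the opposite bias are then sign-flipped. The core
 * comparison, restated:
 */
static int sketch_ref_sign_bias(int cur_frame_index, int ref_frame_index) {
  return cur_frame_index < ref_frame_index; /* 1 => reference is ahead */
}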
+// Some sample values are: +// (for midres test set) +// SSIM_VAR_SCALE avg_psnr ssim ms_ssim +// 8.0 9.421 -5.537 -6.898 +// 16.0 4.703 -5.378 -6.238 +// 32.0 1.929 -4.308 -4.807 +#define SSIM_VAR_SCALE 16.0 +static void set_mb_ssim_rdmult_scaling(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + uint8_t *y_buffer = cpi->Source->y_buffer; + const int y_stride = cpi->Source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_8x8_w = num_8x8_blocks_wide_lookup[block_size]; + const int num_8x8_h = num_8x8_blocks_high_lookup[block_size]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + double log_sum = 0.0; + int row, col; + +#if CONFIG_VP9_HIGHBITDEPTH + double c2; + if (xd->bd == 10) { + c2 = 941.8761; // (.03*1023)^2 + } else if (xd->bd == 12) { + c2 = 15092.1225; // (.03*4095)^2 + } else { + c2 = 58.5225; // (.03*255)^2 + } +#else + const double c2 = 58.5225; // (.03*255)^2 +#endif + + // Loop through each 64x64 block. + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + int mi_row, mi_col; + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + for (mi_row = row * num_8x8_h; + mi_row < cm->mi_rows && mi_row < (row + 1) * num_8x8_h; ++mi_row) { + for (mi_col = col * num_8x8_w; + mi_col < cm->mi_cols && mi_col < (col + 1) * num_8x8_w; ++mi_col) { + struct buf_2d buf; + const int row_offset_y = mi_row << 3; + const int col_offset_y = mi_col << 3; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + // In order to make SSIM_VAR_SCALE in a same scale for both 8 bit + // and high bit videos, the variance needs to be divided by 2.0 or + // 64.0 separately. +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + var += + vp9_high_get_sby_variance(cpi, &buf, BLOCK_8X8, xd->bd) / 2.0; + else +#endif + var += vp9_get_sby_variance(cpi, &buf, BLOCK_8X8) / 64.0; + + num_of_var += 1.0; + } + } + var = var / num_of_var / SSIM_VAR_SCALE; + var = 2.0 * var + c2; + cpi->mi_ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + log_sum = exp(log_sum / (double)(num_rows * num_cols)); + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->mi_ssim_rdmult_scaling_factors[index] /= log_sum; + } + } + + (void)xd; +} + +// Process the wiener variance in 16x16 block basis. 
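/*
 * The per-block factors computed above are normalized by their geometric
 * mean: log_sum accumulates log(var), exp(log_sum / N) is the geometric
 * mean, and dividing through leaves the frame-average rdmult unchanged
 * while shifting weight between blocks. That normalization, standalone:
 */
#include <math.h>

static void sketch_normalize_geometric(double *factor, int n) {
  double log_sum = 0.0;
  int i;
  for (i = 0; i < n; ++i) log_sum += log(factor[i]);
  log_sum = exp(log_sum / n); /* geometric mean of all factors */
  for (i = 0; i < n; ++i) factor[i] /= log_sum; /* product becomes 1 */
}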
+static int qsort_comp(const void *elem1, const void *elem2) { + int a = *((const int *)elem1); + int b = *((const int *)elem2); + if (a > b) return 1; + if (a < b) return -1; + return 0; +} + +static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + if (cpi->mb_wiener_variance && cpi->mb_wiener_var_rows >= cm->mb_rows && + cpi->mb_wiener_var_cols >= cm->mb_cols) + return; + + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + CHECK_MEM_ERROR( + cm, cpi->mb_wiener_variance, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); + cpi->mb_wiener_var_rows = cm->mb_rows; + cpi->mb_wiener_var_cols = cm->mb_cols; +} + +static void set_mb_wiener_variance(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + uint8_t *buffer = cpi->Source->y_buffer; + int buf_stride = cpi->Source->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]); + uint8_t *zero_pred; +#else + DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]); +#endif + + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + + int mb_row, mb_col, count = 0; + // Hard coded operating block size + const int block_size = 16; + const int coeff_count = block_size * block_size; + const TX_SIZE tx_size = TX_16X16; + +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf = cpi->Source; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + zero_pred = CONVERT_TO_BYTEPTR(zero_pred16); + memset(zero_pred16, 0, sizeof(*zero_pred16) * coeff_count); + } else { + zero_pred = zero_pred8; + memset(zero_pred8, 0, sizeof(*zero_pred8) * coeff_count); + } +#else + memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count); +#endif + + cpi->norm_wiener_variance = 0; + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int idx; + int16_t median_val = 0; + uint8_t *mb_buffer = + buffer + mb_row * block_size * buf_stride + mb_col * block_size; + int64_t wiener_variance = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size, + xd->bd); + highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } else { + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } +#else + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + wht_fwd_txfm(src_diff, block_size, coeff, tx_size); +#endif // CONFIG_VP9_HIGHBITDEPTH + + coeff[0] = 0; + for (idx = 1; idx < coeff_count; ++idx) coeff[idx] = abs(coeff[idx]); + + qsort(coeff, coeff_count - 1, sizeof(*coeff), qsort_comp); + + // Noise level estimation + median_val = coeff[coeff_count / 2]; + + // Wiener filter + for (idx = 1; idx < coeff_count; ++idx) { + int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx]; + int64_t tmp_coeff = (int64_t)coeff[idx]; + if (median_val) { + tmp_coeff = (sqr_coeff * coeff[idx]) / + (sqr_coeff + (int64_t)median_val * median_val); + } + wiener_variance += tmp_coeff * tmp_coeff; + } + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] = + wiener_variance / coeff_count; + cpi->norm_wiener_variance += + cpi->mb_wiener_variance[mb_row 
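/*
 * The loop above implements Wiener shrinkage in the Hadamard domain:
 * with m = median |coeff| serving as the noise estimate, each
 * coefficient c is scaled by the gain c^2 / (c^2 + m^2), and the energy
 * of the filtered coefficients becomes the block's "Wiener variance".
 * The per-coefficient step in isolation:
 */
#include <stdint.h>

static int64_t sketch_wiener_shrink(int64_t c, int64_t m) {
  const int64_t c2 = c * c;
  if (m == 0) return c; /* no measured noise: keep the coefficient */
  return (c2 * c) / (c2 + m * m); /* == c * (c^2 / (c^2 + m^2)) */
}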
* cm->mb_cols + mb_col]; + ++count; + } + } + + if (count) cpi->norm_wiener_variance /= count; + cpi->norm_wiener_variance = VPXMAX(1, cpi->norm_wiener_variance); +} static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -4360,6 +4975,34 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, struct segmentation *const seg = &cm->seg; TX_SIZE t; + // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. + // If in constrained layer drop mode (svc.framedrop_mode != LAYER_DROP) and + // base spatial layer was dropped, no need to set svc.skip_enhancement_layer, + // as whole superframe will be dropped. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + if (cpi->svc.framedrop_mode == LAYER_DROP || + cpi->svc.drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + return; + } + set_ext_overrides(cpi); vpx_clear_system_state(); @@ -4368,8 +5011,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi); #endif - // Set the arf sign bias for this frame. - set_arf_sign_bias(cpi); + if (cm->show_existing_frame == 0) { + // Update frame index + set_frame_index(cpi, cm); + + // Set the arf sign bias for this frame. + set_ref_sign_bias(cpi); + } // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -4404,66 +5052,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->reset_frame_context = 2; } } - if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { - // Use context 0 for intra only empty frame, but the last frame context - // for other empty frames. - if (cpi->svc.encode_empty_frame_state == ENCODING) { - if (cpi->svc.encode_intra_empty_frame != 0) - cm->frame_context_idx = 0; - else - cm->frame_context_idx = FRAME_CONTEXTS - 1; - } else { - cm->frame_context_idx = - cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id; - } - cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; + if (oxcf->tuning == VP8_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi); - // The probs will be updated based on the frame type of its previous - // frame if frame_parallel_decoding_mode is 0. The type may vary for - // the frame after a key frame in base layer since we may drop enhancement - // layers. So set frame_parallel_decoding_mode to 1 in this case. 
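/*
 * The early-return above skips a spatial enhancement layer whose target
 * bandwidth is zero, except when constrained drop mode has already
 * dropped the base layer (then the whole superframe is dropped and no
 * per-layer skip is needed); the frame-in-layer counters are advanced
 * only in the skip case, to keep temporal-layer alignment. The predicate
 * flattened into one function for readability (names illustrative):
 */
static int sketch_skip_enhancement_layer(int spatial_layer_id,
                                         long layer_target_bandwidth,
                                         int constrained_drop_mode,
                                         int base_layer_dropped) {
  if (spatial_layer_id == 0 || layer_target_bandwidth > 0) return 0;
  if (constrained_drop_mode && base_layer_dropped) return 0;
  return 1;
}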
- if (cm->frame_parallel_decoding_mode == 0) { - if (cpi->svc.number_temporal_layers == 1) { - if (cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) - cm->frame_parallel_decoding_mode = 1; - } else if (cpi->svc.spatial_layer_id == 0) { - // Find the 2nd frame in temporal base layer and 1st frame in temporal - // enhancement layers from the key frame. - int i; - for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { - if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { - cm->frame_parallel_decoding_mode = 1; - break; - } - } - } - } - } - - // For 1 pass CBR, check if we are dropping this frame. - // For spatial layers, for now only check for frame-dropping on first spatial - // layer, and if decision is to drop, we drop whole super-frame. - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && - cm->frame_type != KEY_FRAME) { - if (vp9_rc_drop_frame(cpi) || - (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) { - vp9_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - cpi->ext_refresh_frame_flags_pending = 0; - cpi->svc.rc_drop_superframe = 1; - cpi->last_frame_dropped = 1; - // TODO(marpan): Advancing the svc counters on dropped frames can break - // the referencing scheme for the fixed svc patterns defined in - // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but - // for now, don't advance the svc frame counters on dropped frame. - // if (cpi->use_svc) - // vp9_inc_frame_in_layer(cpi); - - return; - } + if (oxcf->aq_mode == PERCEPTUAL_AQ) { + init_mb_wiener_var_buffer(cpi); + set_mb_wiener_variance(cpi); } vpx_clear_system_state(); @@ -4472,18 +5066,33 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif +#if CONFIG_CONSISTENT_RECODE + // Backup to ensure consistency between recodes + save_encode_params(cpi); +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest); + if (!encode_without_recode_loop(cpi, size, dest)) return; } else { +#if !CONFIG_REALTIME_ONLY encode_with_recode_loop(cpi, size, dest); +#endif } - cpi->last_frame_dropped = 0; + // TODO(jingning): When using show existing frame mode, we assume that the + // current ARF will be directly used as the final reconstructed frame. This is + // an encoder control scheme. One could in principle explore other + // possibilities to arrange the reference frame buffer and their coding order. + if (cm->show_existing_frame) { + ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx, + cm->ref_frame_map[cpi->alt_fb_idx]); + } +#if !CONFIG_REALTIME_ONLY // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_try_disable_lookahead_aq(cpi, size, dest); +#endif #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED @@ -4527,9 +5136,33 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, // Pick the loop filter level for the frame. 
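/*
 * For show_existing_frame the encoder emits no new reconstruction: the
 * ref_cnt_fb() call above repoints new_fb_idx at the buffered ARF. Its
 * semantics are a reference-counted reassignment, roughly as below (a
 * sketch of the buffer-pool bookkeeping, not the exact libvpx helper):
 */
typedef struct { int ref_count; } SketchRefBuf;

static void sketch_ref_cnt_swap(SketchRefBuf *bufs, int *idx, int new_idx) {
  const int old_idx = *idx;
  if (old_idx >= 0 && bufs[old_idx].ref_count > 0)
    --bufs[old_idx].ref_count; /* release the previously held buffer */
  *idx = new_idx;
  ++bufs[new_idx].ref_count; /* hold the newly referenced buffer */
}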
loopfilter_frame(cpi, cm); + if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); + // build the bitstream vp9_pack_bitstream(cpi, dest, size); + if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality && + cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) { + restore_coding_context(cpi); + return; + } + + cpi->last_frame_dropped = 0; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.num_encoded_top_layer++; + + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. + if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } + if (cm->seg.update_map) update_reference_segmentation_map(cpi); if (frame_is_intra_only(cm) == 0) { @@ -4537,17 +5170,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } vp9_update_reference_frames(cpi); - for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->td.counts->coef[t], - cpi->td.rd_counts.coef_counts[t]); + if (!cm->show_existing_frame) { + for (t = TX_4X4; t <= TX_32X32; ++t) { + full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); + } - if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) - vp9_adapt_coef_probs(cm); - - if (!frame_is_intra_only(cm)) { if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + vp9_adapt_coef_probs(cm); } } @@ -4567,8 +5201,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_frame_type = cm->frame_type; - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_rc_postencode_update(cpi, *size); + vp9_rc_postencode_update(cpi, *size); + + *size = VPXMAX(1, *size); #if 0 output_frame_level_debug_stats(cpi); @@ -4592,7 +5227,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_height = cm->height; // reset to normal state now that we are done. - if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + } if (cm->show_frame) { vp9_swap_mi_and_prev_mi(cm); @@ -4601,19 +5239,26 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, ++cm->current_video_frame; if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } - cm->prev_frame = cm->cur_frame; - if (cpi->use_svc) + if (cpi->use_svc) { cpi->svc .layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id] .last_frame_type = cm->frame_type; + // Reset layer_sync back to 0 for next frame. 
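/*
 * Post-encode drop (enabled earlier via rc.use_post_encode_drop)
 * checkpoints the coding context before the bitstream is packed; if
 * post_encode_drop_cbr() finds the packed size unacceptable and Q was
 * not already at worst_quality, the context is restored and the frame
 * discarded. The shape of that flow, with an assumed buffer test
 * standing in for post_encode_drop_cbr():
 */
static int sketch_post_encode_keep(int q, int worst_q, long packed_bits,
                                   long frame_budget) {
  /* caller has already done: save_coding_context(); pack bitstream */
  if (q < worst_q && packed_bits > 2 * frame_budget) { /* threshold assumed */
    /* restore_coding_context(); rate control retries at higher Q */
    return 0; /* drop */
  }
  return 1; /* keep */
}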
+ cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0; + } cpi->force_update_segmentation = 0; +#if !CONFIG_REALTIME_ONLY if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi); +#endif + + cpi->svc.previous_frame_is_intra_only = cm->intra_only; + cpi->svc.set_intra_only_frame = 0; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -4636,10 +5281,12 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif encode_frame_to_data_rate(cpi, size, dest, frame_flags); - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_twopass_postencode_update(cpi); + vp9_twopass_postencode_update(cpi); } #endif // !CONFIG_REALTIME_ONLY @@ -4649,6 +5296,8 @@ static void init_ref_frame_bufs(VP9_COMMON *cm) { cm->new_fb_idx = INVALID_IDX; for (i = 0; i < REF_FRAMES; ++i) { cm->ref_frame_map[i] = INVALID_IDX; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { pool->frame_bufs[i].ref_count = 0; } } @@ -4702,6 +5351,12 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, check_initial_width(cpi, subsampling_x, subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_HIGHBITDEPTH + // Disable denoiser for high bitdepth since vp9_denoiser_filter only works for + // 8 bits. + if (cm->bit_depth > 8) cpi->oxcf.noise_sensitivity = 0; +#endif + #if CONFIG_VP9_TEMPORAL_DENOISING setup_denoiser_buffer(cpi); #endif @@ -4822,10 +5477,6 @@ static void check_src_altref(VP9_COMP *cpi, } #if CONFIG_INTERNAL_STATS -extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, - const uint8_t *img2, int img2_pitch, int width, - int height); - static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { s->stat[Y] += y; @@ -5065,6 +5716,1455 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + // TODO(jingning): To be used later for gf frame type parsing. 
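/*
 * init_gop_frames() tracks nested alt-refs with a small index stack:
 * ARF_UPDATE pushes the previous ARF slot and OVERLAY_UPDATE pops it
 * back (see the switch below). One plausible shape for those helpers --
 * the real stack_init/stack_push/stack_pop live elsewhere in the encoder
 * and may differ; the caller owns the size counter:
 */
static void sketch_stack_init(int *stack, int cap) {
  int i;
  for (i = 0; i < cap; ++i) stack[i] = -1;
}
static void sketch_stack_push(int *stack, int value, int size) {
  stack[size] = value; /* caller then increments its size counter */
}
static int sketch_stack_pop(const int *stack, int size) {
  return size > 0 ? stack[size - 1] : -1; /* caller then decrements */
}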
+ (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. 
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction( + VP9_COMP *cpi, ThreadData *td, int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv, int rf_idx) { +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + int frame_idx, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + int mi_row, int mi_col, MV *mv) { +#endif // CONFIG_NON_GREEDY_MV + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; +#if CONFIG_NON_GREEDY_MV + // lambda is used to adjust the importance of motion vector consitency. + // TODO(angiebird): Figure out lambda's proper value. + const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; +#endif + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + +#if CONFIG_NON_GREEDY_MV + (void)search_method; + (void)sadpb; + vp9_prepare_nb_full_mvs(&cpi->tpl_stats[frame_idx], mi_row, mi_col, rf_idx, + bsize, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, &best_ref_mv1_full, step_param, lambda, 1, + &cpi->fn_ptr[bsize], nb_full_mvs, NB_MVS_NUM, mv); +#else + (void)frame_idx; + (void)mi_row; + (void)mi_col; + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); +#endif + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
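/*
 * In the search above, motion vectors are stored in 1/8-pel units
 * (best_ref_mv1), so the full-pel starting point for the diamond search
 * simply drops the three fractional bits; the fractional part is then
 * refined by find_fractional_mv_step() below. The unit conversion:
 */
static int sketch_to_full_pel(int subpel_mv) {
  return subpel_mv >> 3; /* 1/8-pel units -> integer pixels */
}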
+ // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * 
tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, + &eob, scan_order->scan, scan_order->iscan); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
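/*
 * tpl_model_update_b() above back-propagates each block's dependency
 * cost to the reference area it predicts from. Two pieces of arithmetic
 * worth restating in isolation: (1) pixel positions are snapped to the
 * block grid with floor division (C's `/` truncates toward zero, hence
 * round_floor()'s negative branch), and (2) the propagated share is
 * mc_dep_cost * (1 - inter_cost / intra_cost), split over the up to four
 * overlapped reference blocks in proportion to overlap_area / pix_num:
 */
#include <stdint.h>

static int sketch_floor_div(int pos, int block_pix) {
  /* sketch_floor_div(-1, 16) == -1, whereas plain -1 / 16 == 0 */
  return pos < 0 ? -(1 + (-pos - 1) / block_pix) : pos / block_pix;
}

static int64_t sketch_propagated_flow(int64_t mc_dep_cost, int64_t inter_cost,
                                      int64_t intra_cost, int overlap_area,
                                      int pix_num) {
  const int64_t mc_flow =
      mc_dep_cost - (mc_dep_cost * inter_cost) / intra_cost;
  return (mc_flow * overlap_area) / pix_num; /* overlaps sum to pix_num */
}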
+ switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *sse) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? 
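/*
 * wht_fwd_txfm() above uses a Walsh-Hadamard transform rather than a
 * DCT: it needs only adds and subtracts, which keeps this per-block
 * analysis cheap. The 4-point butterfly below shows the structure (the
 * vpx_hadamard_* kernels are 8x8 and larger, 2-D, and SIMD-optimized;
 * their output ordering may differ from this sketch):
 */
#include <stdint.h>

static void sketch_hadamard4(const int16_t in[4], int16_t out[4]) {
  const int16_t a = in[0] + in[1], b = in[0] - in[1];
  const int16_t c = in[2] + in[3], d = in[2] - in[3];
  out[0] = a + c; /* basis ++++ */
  out[1] = b + d; /* basis +-+- */
  out[2] = a - c; /* basis ++-- */
  out[3] = b - d; /* basis +--+ */
}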
&mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int_mv mv; + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + mv.as_int = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col)->as_int; +#else + motion_compensated_prediction( + cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize, + mi_row, mi_col, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, 
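/*
 * Both the intra and inter costs in the search above are SATD values:
 * the prediction residual is Hadamard-transformed and the absolute
 * coefficients are summed, a cheap proxy for coding rate. vpx_satd()
 * reduces to:
 */
#include <stdint.h>
#include <stdlib.h>

static int sketch_satd(const int16_t *coeff, int count) {
  int i, sum = 0;
  for (i = 0; i < count; ++i) sum += abs(coeff[i]);
  return sum;
}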
best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = *get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, 
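/*
 * The winning costs above are normalized to a per-8x8-unit scale before
 * being stored in tpl_stats: shifted up by TPL_DEP_COST_SCALE_LOG2 for
 * fixed-point headroom, divided by the number of 8x8 units in the block,
 * and floored at 1 so later inter/intra ratios stay well defined:
 */
#include <stdint.h>

static int64_t sketch_normalized_cost(int64_t cost, int scale_log2,
                                      int mi_width, int mi_height) {
  const int64_t per_unit = (cost << scale_log2) / (mi_width * mi_height);
  return per_unit > 1 ? per_unit : 1; /* floor at 1 */
}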
BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, tpl_frame, rf_idx, bsize, mi_row, + mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + int rf_idx, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, tpl_frame, rf_idx, bsize, + mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, tpl_frame, rf_idx, + bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, tpl_frame, rf_idx, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? 
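/*
 * get_mv_diff_cost() above prices a NEW_MV by its distance from the
 * nearest/near predictors, roughly the bit count of the residual:
 * log2(1 + |drow|) + log2(1 + |dcol|), lifted into the encoder's
 * fixed-point cost units. VP9_PROB_COST_SHIFT is defined as 9 in
 * vp9_cost.h (restated here, not shown in this patch), so costs are in
 * 1/512-bit units:
 */
#include <math.h>
#include <stdlib.h>

static double sketch_mv_diff_cost(int drow, int dcol) {
  const double bits = log2(1 + abs(drow)) + log2(1 + abs(dcol));
  return bits * (1 << 9); /* assumes VP9_PROB_COST_SHIFT == 9 */
}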
nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = get_mv_dist(mv_mode, cpi, xd, gf_picture, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, tpl_frame, rf_idx, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagnal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = + find_best_ref_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode(NEW_MV_MODE, cpi, x, gf_picture, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, + &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. 
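/*
 * predict_mv_mode() evaluates a triangular neighborhood in anti-diagonal
 * order: idx enumerates diagonals, r runs 0..idx and c = idx - r. With
 * kMvPreCheckLines == 5 that visits 5 * 6 / 2 == 15 == kMvPreCheckSize
 * cells, which the assert checks. The enumeration on its own:
 */
#include <stdio.h>

static void sketch_diagonal_scan(int lines) {
  int idx, r, count = 0;
  for (idx = 0; idx < lines; ++idx) {
    for (r = 0; r <= idx; ++r) {
      printf("(%d,%d) ", r, idx - r); /* (row, col) offsets in block units */
      ++count;
    }
  }
  printf("\ncells: %d\n", count); /* == lines * (lines + 1) / 2 */
}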
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = + find_best_ref_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, rf_idx, bsize, + mi_row, mi_col); + } + } +} + +static double get_feature_score(uint8_t *buf, ptrdiff_t stride, int rows, + int cols) { + double IxIx = 0; + double IxIy = 0; + double IyIy = 0; + double score; + int r, c; + vpx_clear_system_state(); + for (r = 0; r + 1 < rows; ++r) { + for (c = 0; c + 1 < cols; ++c) { + int diff_x = buf[r * stride + c] - buf[r * stride + c + 1]; + int diff_y = buf[r * stride + c] - buf[(r + 1) * stride + c]; + IxIx += diff_x * diff_x; + IxIy += diff_x * diff_y; + IyIy += diff_y * diff_y; + } + } + IxIx /= (rows - 1) * (cols - 1); + IxIy /= (rows - 1) * (cols - 1); + IyIy /= (rows - 1) * (cols - 1); + score = (IxIx * IyIy - IxIy * IxIy + 0.0001) / (IxIx + IyIy + 0.0001); + return score; +} + +static int compare_feature_score(const void *a, const void *b) { + const FEATURE_SCORE_LOC *aa = *(FEATURE_SCORE_LOC *const *)a; + const FEATURE_SCORE_LOC *bb = *(FEATURE_SCORE_LOC *const *)b; + if (aa->feature_score < bb->feature_score) { + return 1; + } else if (aa->feature_score > bb->feature_score) { + return -1; + } else { + return 0; + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, int frame_idx, + YV12_BUFFER_CONFIG **ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + 
mi_col]; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + int rf_idx; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int_mv *mv = get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + if (ref_frame[rf_idx] == NULL) { + tpl_stats->ready[rf_idx] = 0; + continue; + } else { + tpl_stats->ready[rf_idx] = 1; + } + motion_compensated_prediction( + cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize, + mi_row, mi_col, &mv->as_mv, rf_idx); + } +} + +#define CHANGE_MV_SEARCH_ORDER 1 +#define USE_PQSORT 1 + +#if CHANGE_MV_SEARCH_ORDER +#if USE_PQSORT +static void max_heap_pop(FEATURE_SCORE_LOC **heap, int *size, + FEATURE_SCORE_LOC **output) { + if (*size > 0) { + *output = heap[0]; + --*size; + if (*size > 0) { + int p, l, r; + heap[0] = heap[*size]; + p = 0; + l = 2 * p + 1; + r = 2 * p + 2; + while (l < *size) { + FEATURE_SCORE_LOC *tmp; + int c = l; + if (r < *size && heap[r]->feature_score > heap[l]->feature_score) { + c = r; + } + if (heap[p]->feature_score >= heap[c]->feature_score) { + break; + } + tmp = heap[p]; + heap[p] = heap[c]; + heap[c] = tmp; + p = c; + l = 2 * p + 1; + r = 2 * p + 2; + } + } + } else { + assert(0); + } +} + +static void max_heap_push(FEATURE_SCORE_LOC **heap, int *size, + FEATURE_SCORE_LOC *input) { + int c, p; + FEATURE_SCORE_LOC *tmp; + input->visited = 1; + heap[*size] = input; + ++*size; + c = *size - 1; + while (c > 0) { + // Parent of c in this 0-based heap (children live at 2p + 1 / 2p + 2). + p = (c - 1) >> 1; + if (heap[c]->feature_score <= heap[p]->feature_score) break; + tmp = heap[p]; + heap[p] = heap[c]; + heap[c] = tmp; + c = p; + } +} + +static void add_nb_blocks_to_heap(VP9_COMP *cpi, const TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *heap_size) { + const int mi_unit = num_8x8_blocks_wide_lookup[bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int i; + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0] * mi_unit; + int c = dirs[i][1] * mi_unit; + if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 && + mi_col + c < tpl_frame->mi_cols) { + FEATURE_SCORE_LOC *fs_loc = + &cpi->feature_score_loc_arr[(mi_row + r) * tpl_frame->stride + + (mi_col + c)]; + if (fs_loc->visited == 0) { + max_heap_push(cpi->feature_score_loc_heap, heap_size, fs_loc); + } + } + } +} +#endif // USE_PQSORT +#endif // CHANGE_MV_SEARCH_ORDER + +static void build_motion_field(VP9_COMP *cpi, MACROBLOCKD *xd, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[3], + BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int fs_loc_sort_size; + int fs_loc_heap_size; + int mi_row, mi_col; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + fs_loc_sort_size = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; +
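+      // Record this block's feature score and location so the motion search
+      // below can visit blocks in descending feature-score order (via qsort,
+      // and via the max-heap above in the USE_PQSORT path).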
FEATURE_SCORE_LOC *fs_loc = + &cpi->feature_score_loc_arr[mi_row * tpl_frame->stride + mi_col]; + tpl_stats->feature_score = get_feature_score( + xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh); + fs_loc->visited = 0; + fs_loc->feature_score = tpl_stats->feature_score; + fs_loc->mi_row = mi_row; + fs_loc->mi_col = mi_col; + cpi->feature_score_loc_sort[fs_loc_sort_size] = fs_loc; + ++fs_loc_sort_size; + } + } + + qsort(cpi->feature_score_loc_sort, fs_loc_sort_size, + sizeof(*cpi->feature_score_loc_sort), compare_feature_score); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + tpl_stats->ready[rf_idx] = 0; + } + } + } + +#if CHANGE_MV_SEARCH_ORDER +#if !USE_PQSORT + for (i = 0; i < fs_loc_sort_size; ++i) { + FEATURE_SCORE_LOC *fs_loc = cpi->feature_score_loc_sort[i]; + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row, + fs_loc->mi_col); + } +#else // !USE_PQSORT + fs_loc_heap_size = 0; + max_heap_push(cpi->feature_score_loc_heap, &fs_loc_heap_size, + cpi->feature_score_loc_sort[0]); + + while (fs_loc_heap_size > 0) { + FEATURE_SCORE_LOC *fs_loc; + max_heap_pop(cpi->feature_score_loc_heap, &fs_loc_heap_size, &fs_loc); + + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row, + fs_loc->mi_col); + + add_nb_blocks_to_heap(cpi, tpl_frame, bsize, fs_loc->mi_row, fs_loc->mi_col, + &fs_loc_heap_size); + } +#endif // !USE_PQSORT +#else // CHANGE_MV_SEARCH_ORDER + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, mi_row, mi_col); + } + } +#endif // CHANGE_MV_SEARCH_ORDER +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[3] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; +#if CONFIG_NON_GREEDY_MV + int square_block_idx; + int rf_idx; +#endif + + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( 
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to NULL. + for (idx = 0; idx < 3; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, xd, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + predict_mv_mode_arr(cpi, x, gf_picture, frame_idx, tpl_frame, rf_idx, + bsize); + } + } +#endif + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &sse); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " +
"rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = + *get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +static void init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int sqr_bsize; + int rf_idx; + + // TODO(angiebird): This probably needs further modifications to support + // frame scaling later on. + if (cpi->feature_score_loc_alloc == 0) { + // The smallest block size of motion field is 4x4, but the mi_unit is 8x8, + // therefore the number of units is "mi_rows * mi_cols * 4" here. + CHECK_MEM_ERROR( + cm, cpi->feature_score_loc_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->feature_score_loc_arr))); + CHECK_MEM_ERROR(cm, cpi->feature_score_loc_sort, + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->feature_score_loc_sort))); + CHECK_MEM_ERROR(cm, cpi->feature_score_loc_heap, + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->feature_score_loc_heap))); + + cpi->feature_score_loc_alloc = 1; + } + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + cm, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. 
+ for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize], + vpx_calloc( + mi_rows * mi_cols * 4, + sizeof( + *cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]))); + } + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +static void setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + cpi->tpl_bsize = BLOCK_32X32; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV +} + int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, int64_t *time_end, int flush) { @@ -5077,17 +7177,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; int arf_src_index; + const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_two_pass_svc(cpi)) { -#if CONFIG_SPATIAL_SVC - vp9_svc_start_frame(cpi); - // Use a small empty frame instead of a real frame - if (cpi->svc.encode_empty_frame_state == ENCODING) - source = &cpi->svc.empty_frame; -#endif - if (oxcf->pass == 2) vp9_restore_layer_context(cpi); - } else if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_cbr_svc(cpi)) { vp9_one_pass_cbr_svc_start_layer(cpi); } @@ -5098,10 +7191,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Is multi-arf enabled. // Note that at the moment multi_arf is only configured for 2 pass VBR and // will not work properly with svc. 
- if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf > 1)) - cpi->multi_arf_allowed = 1; + // Enable Jingning's new "multi_layer_arf" code if "enable_auto_arf" + // is greater than or equal to 2. + if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2)) + cpi->multi_layer_arf = 1; else - cpi->multi_arf_allowed = 0; + cpi->multi_layer_arf = 0; // Normal defaults cm->reset_frame_context = 0; @@ -5115,9 +7210,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); - // Skip alt frame if we encode the empty frame - if (is_two_pass_svc(cpi) && source != NULL) arf_src_index = 0; - if (arf_src_index) { for (i = 0; i <= arf_src_index; ++i) { struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i); @@ -5132,25 +7224,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } + // Clear arf index stack before group of pictures processing starts. + if (gf_group_index == 1) { + stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2); + cpi->twopass.gf_group.stack_size = 0; + } + if (arf_src_index) { assert(arf_src_index <= rc->frames_to_key); - if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cpi->alt_ref_source = source; -#if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) { - int i; - // Reference a hidden frame from a lower layer - for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { - if (oxcf->ss_enable_auto_arf[i]) { - cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx; - break; - } - } - } - cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; -#endif #if !CONFIG_REALTIME_ONLY if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { @@ -5192,7 +7276,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. - if (cpi->use_svc) + if (cpi->use_svc || cpi->svc.set_intra_only_frame) source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else source = vp9_lookahead_pop(cpi->lookahead, flush); @@ -5202,8 +7286,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - if ((source->flags & VPX_EFLAG_FORCE_KF) && - cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { + if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc && + cpi->svc.spatial_layer_id > 0) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); } @@ -5227,7 +7311,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, *time_stamp = source->ts_start; *time_end = source->ts_end; *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ?
FRAMEFLAGS_KEY : 0; - } else { *size = 0; #if !CONFIG_REALTIME_ONLY @@ -5249,7 +7332,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // adjust frame rates based on timestamps given if (cm->show_frame) { - adjust_frame_rate(cpi, source); + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.duration[cpi->svc.spatial_layer_id] > 0) + vp9_svc_adjust_frame_rate(cpi); + else + adjust_frame_rate(cpi, source); } if (is_one_pass_cbr_svc(cpi)) { @@ -5268,24 +7355,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - if (!cpi->use_svc && cpi->multi_arf_allowed) { - if (cm->frame_type == KEY_FRAME) { - init_buffer_indices(cpi); - } else if (oxcf->pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; - } - } - // Start with a 0 size frame. *size = 0; cpi->frame_flags = *frame_flags; #if !CONFIG_REALTIME_ONLY - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { set_frame_size(cpi); @@ -5297,9 +7373,39 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, level_rc_framerate(cpi, arf_src_index); if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { - for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } + if (cpi->kmeans_data_arr_alloc == 0) { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&cpi->kmeans_mutex, NULL); +#endif + CHECK_MEM_ERROR( + cm, cpi->kmeans_data_arr, + vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); + cpi->kmeans_data_stride = mi_cols; + cpi->kmeans_data_arr_alloc = 1; + } + + if (gf_group_index == 1 && + cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && + cpi->sf.enable_tpl_model) { + init_tpl_buffer(cpi); + vp9_estimate_qp_gop(cpi); + setup_tpl_stats(cpi); + } + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); +#endif +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); +#endif + cpi->td.mb.fp_src_pred = 0; #if CONFIG_REALTIME_ONLY if (cpi->use_svc) { @@ -5309,7 +7415,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, Pass0Encode(cpi, size, dest, frame_flags); } #else // !CONFIG_REALTIME_ONLY - if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + if (oxcf->pass == 1 && !cpi->use_svc) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) @@ -5324,7 +7430,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif // CONFIG_VP9_HIGHBITDEPTH cpi->td.mb.inv_txfm_add = lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); - } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + } else if (oxcf->pass == 2 && !cpi->use_svc) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -5334,6 +7440,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_REALTIME_ONLY + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -5416,7 +7524,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, ppflags.post_proc_flag = VP9D_DEBLOCK; ppflags.deblocking_level = 0; // not used in vp9_post_proc_frame() ppflags.noise_level = 0; // not used in vp9_post_proc_frame() - vp9_post_proc_frame(cm, pp, &ppflags); + vp9_post_proc_frame(cm, pp, &ppflags, + cpi->un_scaled_source->y_width); } #endif vpx_clear_system_state(); @@ -5462,11 +7571,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->summedp_quality += frame_ssim2 * weight; cpi->summedp_weights += weight; #if 0 - { + if (cm->show_frame) { FILE *f = fopen("q_used.stt", "a"); fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, - frame_psnr2, frame_ssim2); + cpi->common.current_video_frame, psnr2.psnr[1], + psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2); fclose(f); } #endif @@ -5525,21 +7634,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_two_pass_svc(cpi)) { - if (cpi->svc.encode_empty_frame_state == ENCODING) { - cpi->svc.encode_empty_frame_state = ENCODED; - cpi->svc.encode_intra_empty_frame = 0; - } - - if (cm->show_frame) { - ++cpi->svc.spatial_layer_to_encode; - if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) - cpi->svc.spatial_layer_to_encode = 0; - - // May need the empty frame after an visible frame. - cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE; - } - } else if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_cbr_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) @@ -5563,7 +7658,7 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } else { int ret; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(cm, dest, flags); + ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width); #else if (cm->frame_to_show) { *dest = *cm->frame_to_show; diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.h b/libs/libvpx/vp9/encoder/vp9_encoder.h index d723d93cbc..f157fdfc5e 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.h +++ b/libs/libvpx/vp9/encoder/vp9_encoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ENCODER_H_ -#define VP9_ENCODER_VP9_ENCODER_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_ +#define VPX_VP9_ENCODER_VP9_ENCODER_H_ #include @@ -29,7 +29,9 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" @@ -119,9 +121,11 @@ typedef enum { COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, EQUATOR360_AQ = 4, + PERCEPTUAL_AQ = 5, + PSNR_AQ = 6, // AQ based on lookahead temporal // variance (only valid for altref frames) - LOOKAHEAD_AQ = 5, + LOOKAHEAD_AQ = 7, AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; @@ -248,6 +252,8 @@ typedef struct VP9EncoderConfig { int tile_columns; int tile_rows; + int enable_tpl_model; + int max_threads; unsigned int target_level; @@ -278,11 +284,102 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +typedef struct TplDepStats { + int64_t intra_cost; + int64_t inter_cost; + int64_t mc_flow; + int64_t mc_dep_cost; + int64_t mc_ref_cost; + + int ref_frame_index; + int_mv mv; + +#if CONFIG_NON_GREEDY_MV + int ready[3]; + int64_t sse_arr[3]; + double feature_score; +#endif +} TplDepStats; + +#if CONFIG_NON_GREEDY_MV +#define SQUARE_BLOCK_SIZES 4 + +#define ZERO_MV_MODE 0 +#define NEW_MV_MODE 1 +#define NEAREST_MV_MODE 2 +#define NEAR_MV_MODE 3 +#define MAX_MV_MODE 4 +#endif + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_qindex; +#if CONFIG_NON_GREEDY_MV + int lambda; + int_mv *pyramid_mv_arr[3][SQUARE_BLOCK_SIZES]; + int *mv_mode_arr[3]; + double *rd_diff_arr[3]; +#endif +} TplDepFrame; + +#if CONFIG_NON_GREEDY_MV +static INLINE int get_square_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4) { + return 0; + } + if (bsize == BLOCK_8X8) { + return 1; + } + if (bsize == BLOCK_16X16) { + return 2; + } + if (bsize == BLOCK_32X32) { + return 3; + } + assert(0 && "ERROR: non-square block size"); + return -1; +} + +static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) { + if (square_block_idx == 0) { + return BLOCK_4X4; + } + if (square_block_idx == 1) { + return BLOCK_8X8; + } + if (square_block_idx == 2) { + return BLOCK_16X16; + } + if (square_block_idx == 3) { + return BLOCK_32X32; + } + assert(0 && "ERROR: invalid square_block_idx"); + return BLOCK_INVALID; +} + +static INLINE int_mv *get_pyramid_mv(const TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + return &tpl_frame->pyramid_mv_arr[rf_idx][get_square_block_idx(bsize)] + [mi_row * tpl_frame->stride + mi_col]; +} +#endif + +#define TPL_DEP_COST_SCALE_LOG2 4 + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int mode_map[BLOCK_SIZES][MAX_MODES]; +#if CONFIG_CONSISTENT_RECODE + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; +#endif + int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; @@ -450,6 +547,31 @@ typedef struct ARNRFilterData { struct scale_factors sf; } ARNRFilterData; +typedef struct EncFrameBuf { + int mem_valid; + int released; + YV12_BUFFER_CONFIG frame; +} EncFrameBuf; + +// Maximum operating frame buffer size needed for a GOP using ARF reference. +#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) +#if CONFIG_NON_GREEDY_MV +typedef struct FEATURE_SCORE_LOC { + int visited; + double feature_score; + int mi_row; + int mi_col; +} FEATURE_SCORE_LOC; +#endif + +#define MAX_KMEANS_GROUPS 8 + +typedef struct KMEANS_DATA { + double value; + int pos; + int group_idx; +} KMEANS_DATA; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -473,17 +595,43 @@ typedef struct VP9_COMP { #endif YV12_BUFFER_CONFIG *raw_source_frame; + BLOCK_SIZE tpl_bsize; + TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; + EncFrameBuf enc_frame_buf[REF_FRAMES]; +#if CONFIG_MULTITHREAD + pthread_mutex_t kmeans_mutex; +#endif + int kmeans_data_arr_alloc; + KMEANS_DATA *kmeans_data_arr; + int kmeans_data_size; + int kmeans_data_stride; + double kmeans_ctr_ls[MAX_KMEANS_GROUPS]; + double kmeans_boundary_ls[MAX_KMEANS_GROUPS]; + int kmeans_count_ls[MAX_KMEANS_GROUPS]; + int kmeans_ctr_num; +#if CONFIG_NON_GREEDY_MV + int tpl_ready; + int feature_score_loc_alloc; + FEATURE_SCORE_LOC *feature_score_loc_arr; + FEATURE_SCORE_LOC **feature_score_loc_sort; + FEATURE_SCORE_LOC **feature_score_loc_heap; + int_mv *select_mv_arr; +#endif + TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. // For a still frame, this flag is set to 1 to skip partition search. 
int partition_search_skippable_frame; - int scaled_ref_idx[MAX_REF_FRAMES]; + int scaled_ref_idx[REFS_PER_FRAME]; int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; + int ref_fb_idx[REF_FRAMES]; + int refresh_last_frame; int refresh_golden_frame; int refresh_alt_ref_frame; @@ -496,10 +644,15 @@ typedef struct VP9_COMP { int ext_refresh_frame_context_pending; int ext_refresh_frame_context; + int64_t norm_wiener_variance; + int64_t *mb_wiener_variance; + int mb_wiener_var_rows; + int mb_wiener_var_cols; + double *mi_ssim_rdmult_scaling_factors; + YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tile_tok[4][1 << 6]; - uint32_t tok_count[4][1 << 6]; TOKENLIST *tplist[4][1 << 6]; // Ambient reconstruction err target for force key frames @@ -521,7 +674,7 @@ typedef struct VP9_COMP { RATE_CONTROL rc; double framerate; - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]; + int interp_filter_selected[REF_FRAMES][SWITCHABLE]; struct vpx_codec_pkt_list *output_pkt_list; @@ -555,6 +708,7 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; + struct scale_factors me_sf; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -645,10 +799,8 @@ typedef struct VP9_COMP { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; - - int multi_arf_allowed; - int multi_arf_enabled; - int multi_arf_last_grp_enabled; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; @@ -723,6 +875,9 @@ typedef struct VP9_COMP { uint8_t *count_arf_frame_usage; uint8_t *count_lastgolden_frame_usage; + + int multi_layer_arf; + vpx_roi_map_t roi; } VP9_COMP; void vp9_initialize_enc(void); @@ -737,7 +892,7 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); // frame is made and not just a copy of the pointer.. 
int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); + int64_t end_time); int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, @@ -758,9 +913,11 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, int vp9_update_entropy(VP9_COMP *cpi, int update); -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); -int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); @@ -770,6 +927,27 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, void vp9_set_svc(VP9_COMP *cpi, int use_svc); +static INLINE int stack_pop(int *stack, int stack_size) { + int idx; + const int r = stack[0]; + for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx]; + + return r; +} + +static INLINE int stack_top(const int *stack) { return stack[0]; } + +static INLINE void stack_push(int *stack, int new_item, int stack_size) { + int idx; + for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1]; + stack[0] = new_item; +} + +static INLINE void stack_init(int *stack, int length) { + int idx; + for (idx = 0; idx < length; ++idx) stack[idx] = -1; +} + int vp9_get_quantizer(struct VP9_COMP *cpi); static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { @@ -795,9 +973,13 @@ static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi, return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; } +static INLINE RefCntBuffer *get_ref_cnt_buffer(VP9_COMMON *cm, int fb_idx) { + return fb_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[fb_idx] : NULL; +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( - VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) { + const VP9_COMMON *const cm = &cpi->common; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); return buf_idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[buf_idx].buf : NULL; @@ -858,19 +1040,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { - return cpi->use_svc && cpi->oxcf.pass != 0; -} - static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } #if CONFIG_VP9_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && - cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); } #endif @@ -878,12 +1055,10 @@ static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && - (cpi->oxcf.enable_auto_arf && - (!is_two_pass_svc(cpi) || - cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); + cpi->oxcf.enable_auto_arf; } -static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, +static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { xd->block_refs[0] = @@ -938,6 +1113,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]); + void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_set_row_mt(VP9_COMP *cpi); @@ -948,4 +1127,4 @@ void vp9_set_row_mt(VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODER_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODER_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.c b/libs/libvpx/vp9/encoder/vp9_ethread.c index 0bd2e21451..e7f8a537d4 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.c +++ b/libs/libvpx/vp9/encoder/vp9_ethread.c @@ -270,19 +270,19 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, - vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows)); - if (row_mt_sync->mutex_) { + CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); + if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + pthread_mutex_init(&row_mt_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond_, - vpx_malloc(sizeof(*row_mt_sync->cond_) * rows)); - if (row_mt_sync->cond_) { + CHECK_MEM_ERROR(cm, row_mt_sync->cond, + vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); + if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&row_mt_sync->cond_[i], NULL); + pthread_cond_init(&row_mt_sync->cond[i], NULL); } } } @@ -301,17 +301,17 @@ void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { #if CONFIG_MULTITHREAD int i; - if (row_mt_sync->mutex_ != NULL) { + if (row_mt_sync->mutex != NULL) { for (i = 0; i < row_mt_sync->rows; ++i) { - pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + pthread_mutex_destroy(&row_mt_sync->mutex[i]); } - vpx_free(row_mt_sync->mutex_); + vpx_free(row_mt_sync->mutex); } - if (row_mt_sync->cond_ != NULL) { + if (row_mt_sync->cond != NULL) { for (i = 0; i < 
row_mt_sync->rows; ++i) { - pthread_cond_destroy(&row_mt_sync->cond_[i]); + pthread_cond_destroy(&row_mt_sync->cond[i]); } - vpx_free(row_mt_sync->cond_); + vpx_free(row_mt_sync->cond); } #endif // CONFIG_MULTITHREAD vpx_free(row_mt_sync->cur_col); @@ -327,11 +327,11 @@ void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { const int nsync = row_mt_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1]; pthread_mutex_lock(mutex); while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { - pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -365,12 +365,12 @@ void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, } if (sig) { - pthread_mutex_lock(&row_mt_sync->mutex_[r]); + pthread_mutex_lock(&row_mt_sync->mutex[r]); row_mt_sync->cur_col[r] = cur; - pthread_cond_signal(&row_mt_sync->cond_[r]); - pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + pthread_cond_signal(&row_mt_sync->cond[r]); + pthread_mutex_unlock(&row_mt_sync->mutex[r]); } #else (void)row_mt_sync; @@ -390,8 +390,9 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, } #if !CONFIG_REALTIME_ONLY -static int first_pass_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int first_pass_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -470,8 +471,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt, + num_workers); first_tile_col = &cpi->tile_data[0]; for (i = 1; i < tile_cols; i++) { @@ -480,8 +481,9 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) { } } -static int temporal_filter_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int temporal_filter_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -508,8 +510,8 @@ static int temporal_filter_worker_hook(EncWorkerData *const thread_data, tile_col = proc_job->tile_col_id; tile_row = proc_job->tile_row_id; this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; - mb_col_start = (this_tile->tile_info.mi_col_start) >> 1; - mb_col_end = (this_tile->tile_info.mi_col_end + 1) >> 1; + mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT; + mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT; mb_row = proc_job->vert_unit_row_num; vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, @@ -553,13 +555,14 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt, + num_workers); } #endif // !CONFIG_REALTIME_ONLY -static int enc_row_mt_worker_hook(EncWorkerData *const 
thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int enc_row_mt_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -648,8 +651,8 @@ void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt, + num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.h b/libs/libvpx/vp9/encoder/vp9_ethread.h index a396e621d7..cda0293bcf 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.h +++ b/libs/libvpx/vp9/encoder/vp9_ethread.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ETHREAD_H_ -#define VP9_ENCODER_VP9_ETHREAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#define VPX_VP9_ENCODER_VP9_ETHREAD_H_ #ifdef __cplusplus extern "C" { @@ -33,8 +33,8 @@ typedef struct EncWorkerData { // Encoder row synchronization typedef struct VP9RowMTSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the sb/mb block index in each row. int *cur_col; @@ -69,4 +69,4 @@ void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ETHREAD_H_ +#endif // VPX_VP9_ENCODER_VP9_ETHREAD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_extend.h b/libs/libvpx/vp9/encoder/vp9_extend.h index c0dd757159..4ba7fc95e3 100644 --- a/libs/libvpx/vp9/encoder/vp9_extend.h +++ b/libs/libvpx/vp9/encoder/vp9_extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_EXTEND_H_ -#define VP9_ENCODER_VP9_EXTEND_H_ +#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_ +#define VPX_VP9_ENCODER_VP9_EXTEND_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -28,4 +28,4 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_EXTEND_H_ +#endif // VPX_VP9_ENCODER_VP9_EXTEND_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.c b/libs/libvpx/vp9/encoder/vp9_firstpass.c index fb6b132a5b..e0acf563b8 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.c @@ -44,15 +44,11 @@ #define COMPLEXITY_STATS_OUTPUT 0 #define FIRST_PASS_Q 10.0 -#define INTRA_MODE_PENALTY 1024 -#define MIN_ARF_GF_BOOST 240 +#define NORMAL_BOOST 100 +#define MIN_ARF_GF_BOOST 250 #define MIN_DECAY_FACTOR 0.01 #define NEW_MV_MODE_PENALTY 32 #define DARK_THRESH 64 -#define DEFAULT_GRP_WEIGHT 1.0 -#define RC_FACTOR_MIN 0.75 -#define RC_FACTOR_MAX 1.75 -#define SECTION_NOISE_DEF 250.0 #define LOW_I_THRESH 24000 #define NCOUNT_INTRA_THRESH 8192 @@ -105,7 +101,7 @@ static void output_stats(FIRSTPASS_STATS *stats, fprintf(fpfile, "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf" "%12.4lf" "\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, @@ -316,16 +312,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (is_two_pass_svc(cpi)) { - int i; - for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { - output_stats(&cpi->svc.layer_context[i].twopass.total_stats, - cpi->output_pkt_list); - } - } else { - output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); - } - + output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); vpx_free(cpi->twopass.fp_mb_float_stats); cpi->twopass.fp_mb_float_stats = NULL; } @@ -503,11 +490,10 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; case VPX_BITS_10: ret_val = thresh << 4; break; - case VPX_BITS_12: ret_val = thresh << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; } } #else @@ -529,11 +515,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; - case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; } } #else @@ -550,11 +535,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; - case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; } } #else @@ -564,7 +548,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { } #define FP_DN_THRESH 8 -#define FP_MAX_DN_THRESH 16 +#define 
FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 // Baseline Kernal weights for first pass noise metric @@ -731,9 +715,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, - fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -825,6 +808,8 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, fp_acc_data->image_data_start_row); } +#define NZ_MOTION_PENALTY 128 +#define INTRA_MODE_PENALTY 1024 void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, FIRSTPASS_DATA *fp_acc_data, TileDataEnc *tile_data, MV *best_ref_mv, @@ -834,6 +819,8 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile = tile_data->tile_info; + const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1); + const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1); struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; @@ -850,40 +837,19 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; MODE_INFO mi_above, mi_left; double mb_intra_factor; double mb_brightness_factor; double mb_neutral_count; + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); - if (lc != NULL) { - // Use either last frame or alt frame for motion search. - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - } - } else { - gld_yv12 = NULL; - } - } - - xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + - (tile.mi_col_start >> 1); - xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1); + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start; + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start; for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][1]; @@ -897,10 +863,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); // Reset above block coeffs. 
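+  // recon_yoffset / recon_uvoffset are the byte offsets of this MB row's
+  // first macroblock within the luma and chroma planes of the
+  // reconstruction buffer; luma MBs are 16 pixels wide, chroma scales
+  // with uv_mb_height.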
- recon_yoffset = - (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16; - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) + - (tile.mi_col_start >> 1) * uv_mb_height; + recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16; + recon_uvoffset = + (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height; // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. @@ -908,8 +873,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->mv_limits.row_max = ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1); - ++mb_col, c++) { + for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) { int this_error; int this_intra_error; const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); @@ -955,7 +919,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->skip_encode = 0; x->fp_src_pred = 0; // Do intra prediction based on source pixels for tile boundaries - if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) { + if (mb_col == mb_col_start && mb_col != 0) { xd->left_mi = &mi_left; x->fp_src_pred = 1; } @@ -1002,12 +966,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, switch (cm->bit_depth) { case VPX_BITS_8: break; case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1073,30 +1035,34 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; + if (cm->current_video_frame > 0) { + int tmp_err, motion_error, this_motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; struct buf_2d unscaled_last_source_buf_2d; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + this_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8); } else { motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; } #else motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; #endif // CONFIG_VP9_HIGHBITDEPTH // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. + // reconstructed frame if this error is very small. 
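+      // (The gate is raw_motion_error > NZ_MOTION_PENALTY further below:
+      // only blocks whose zero-mv error against the raw last source exceeds
+      // that threshold get the full search on the reconstructed reference.)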
unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer + recon_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; @@ -1113,12 +1079,20 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, &unscaled_last_source_buf_2d); #endif // CONFIG_VP9_HIGHBITDEPTH - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { + if (raw_motion_error > NZ_MOTION_PENALTY) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + this_motion_error = + vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0); + // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. if (!is_zero_mv(best_ref_mv)) { @@ -1128,13 +1102,13 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (tmp_err < motion_error) { motion_error = tmp_err; mv = tmp_mv; + this_motion_error = + vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0); } } // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { + if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { // Assume 0,0 motion with no mv overhead. int gf_motion_error; @@ -1280,7 +1254,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } #endif - // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1306,17 +1279,16 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, else if (mv.col < 0) --(fp_acc_data->sum_in_vectors); } - fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + } + if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error + } else { fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; } } else { // Intra < inter error - int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); - if (motion_error < scaled_low_intra_thresh) { + if (this_motion_error < scaled_low_intra_thresh) { fp_acc_data->intra_count_low += 1.0; } else { fp_acc_data->intra_count_high += 1.0; @@ -1335,7 +1307,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, recon_uvoffset += uv_mb_height; // Accumulate row level stats to the corresponding tile stats - if (cpi->row_mt && mb_col == (tile.mi_col_end >> 1) - 1) + if (cpi->row_mt && mb_col == mb_col_end - 1) accumulate_fp_mb_row_stat(tile_data, fp_acc_data); (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, @@ -1372,9 +1344,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? 
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; BufferPool *const pool = cm->buffer_pool; FIRSTPASS_DATA fp_temp_data; @@ -1386,7 +1355,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -1397,50 +1366,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - if (lc != NULL) { - twopass = &lc->twopass; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - cpi->ref_frame_flags = VP9_LAST_FLAG; - - if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < - REF_FRAMES) { - cpi->gld_fb_idx = - cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; - cpi->ref_frame_flags |= VP9_GOLD_FLAG; - cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); - } else { - cpi->refresh_golden_frame = 0; - } - - if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0; - - vp9_scale_references(cpi); - - // Use either last frame or alt frame for motion search. - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - } - } else { - gld_yv12 = NULL; - } - - set_ref_ptrs(cm, xd, - (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE, - (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE); - - cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0, EIGHTTAP, 0); - } - vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); vp9_setup_src_planes(x, cpi->Source, 0, 0); @@ -1524,18 +1449,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vpx_extend_frame_borders(new_yv12); - if (lc != NULL) { - vp9_update_reference_frames(cpi); - } else { - // The frame we just compressed now becomes the last frame. - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], - cm->new_fb_idx); - } + // The frame we just compressed now becomes the last frame. + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); // Special case for the first frame. Copy into the GF buffer as a second // reference. 
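With the SVC branch gone, post-encode buffer management in vp9_first_pass() reduces to the ref_cnt_fb() calls in this hunk: the just-coded frame becomes the new last frame, and, in the special case that follows, frame 0 is mirrored into the golden slot as a second reference. A minimal sketch of the reference-count bookkeeping ref_cnt_fb() performs, assuming the usual helper in vp9_encoder.c (exact field names may differ between versions):

  static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
    // Release the buffer this reference-map slot currently points at.
    if (*idx >= 0 && bufs[*idx].ref_count > 0) --bufs[*idx].ref_count;
    // Retarget the slot and keep the new buffer alive while referenced.
    *idx = new_idx;
    ++bufs[new_idx].ref_count;
  }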
- if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX && - lc == NULL) { + if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->ref_frame_map[cpi->lst_fb_idx]); } @@ -1560,9 +1480,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } -static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { - 0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25 -}; +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.85, 0.90, 0.90, + 0.90, 1.00, 1.25 }; static double calc_correction_factor(double err_per_mb, double err_divisor, int q) { @@ -1583,7 +1503,26 @@ static double calc_correction_factor(double err_per_mb, double err_divisor, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -#define ERR_DIVISOR 115.0 +static double wq_err_divisor(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 115.0; + } else if (screen_area < 1280 * 720) { + return 125.0; + } else if (screen_area <= 1920 * 1080) { + return 130.0; + } else if (screen_area < 3840 * 2160) { + return 150.0; + } + + // Fall through to here only for 4K and above. + return 200.0; +} + #define NOISE_FACTOR_MIN 0.9 #define NOISE_FACTOR_MAX 1.1 static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, @@ -1643,7 +1582,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = - calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q); + calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, @@ -1690,14 +1629,9 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, } void vp9_init_second_pass(VP9_COMP *cpi) { - SVC *const svc = &cpi->svc; VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = - is_two_pass_svc ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; double frame_rate; FIRSTPASS_STATS *stats; @@ -1774,18 +1708,9 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // encoded in the second pass is a guess. However, the sum duration is not. // It is calculated based on the actual durations of all frames from the // first pass. - - if (is_two_pass_svc) { - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - vp9_new_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + vp9_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); // This variable monitors how far behind the second ref update is lagging. 
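wq_err_divisor() replaces the fixed ERR_DIVISOR of 115.0 with a resolution-dependent divisor. Working the tiers through by screen area (the mix of < and <= in the code means exactly 1280x720 lands in the 1080p bucket):

  area <= 230,400 (640x360)      -> 115.0
  area <  921,600 (1280x720)     -> 125.0   e.g. 960x540
  area <= 2,073,600 (1920x1080)  -> 130.0   includes exactly 1280x720
  area <  8,294,400 (3840x2160)  -> 150.0   e.g. 2560x1440
  otherwise                      -> 200.0   4K and above

A larger divisor damps the err_per_mb term inside calc_correction_factor(), so the same measured per-mb error moves the estimated worst-case Q less at higher resolutions.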
twopass->sr_update_lag = 1; @@ -1913,10 +1838,12 @@ static int detect_flash(const TWO_PASS *twopass, int offset) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // compared to pcnt_inter. + // usage or a second ref coded error notably lower than the last + // frame coded error. return next_frame != NULL && - next_frame->pcnt_second_ref > next_frame->pcnt_inter && - next_frame->pcnt_second_ref >= 0.5; + ((next_frame->sr_coded_error < next_frame->coded_error) || + ((next_frame->pcnt_second_ref > next_frame->pcnt_inter) && + (next_frame->pcnt_second_ref >= 0.5))); } // Update the motion related elements to the GF arf boost calculation. @@ -1971,7 +1898,20 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); } -#define KF_BASELINE_ERR_PER_MB 12500.0 +static double kf_err_per_mb(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area < 1280 * 720) { + return 2000.0; + } else if (screen_area < 1920 * 1080) { + return 500.0; + } + return 250.0; +} + static double calc_kf_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, @@ -1984,7 +1924,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) / + frame_boost = (kf_err_per_mb(cpi) * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. @@ -1997,8 +1937,11 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // Q correction and scalling - frame_boost = frame_boost * boost_q_correction; + // Q correction and scaling + // The 40.0 value here is an experimentally derived baseline minimum. + // This value is in line with the minimum per frame boost in the alt_ref + // boost calculation. + frame_boost = ((frame_boost + 40.0) * boost_q_correction); return VPXMIN(frame_boost, max_boost * boost_q_correction); } @@ -2105,10 +2048,15 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, // Calculate the total bits to allocate in this GF/ARF group. static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, double gf_group_err) { + VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const TWO_PASS *const twopass = &cpi->twopass; const int max_bits = frame_max_bits(rc, &cpi->oxcf); int64_t total_group_bits; + const int is_key_frame = frame_is_intra_only(cm); + const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; // Calculate the bits to be allocated to the group as a whole. if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { @@ -2126,8 +2074,8 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, : total_group_bits; // Clip based on user supplied data rate variability limit.
- if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + if (total_group_bits > (int64_t)max_bits * gop_frames) + total_group_bits = (int64_t)max_bits * gop_frames; return total_group_bits; } @@ -2140,7 +2088,7 @@ static int calculate_boost_bits(int frame_count, int boost, // return 0 for invalid inputs (could arise e.g. through rounding errors) if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; - allocation_chunks = (frame_count * 100) + boost; + allocation_chunks = (frame_count * NORMAL_BOOST) + boost; // Prevent overflow. if (boost > 1023) { @@ -2154,18 +2102,6 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. -static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; -} - // Used in corpus vbr: Calculates the total normalized group complexity score // for a given number of frames starting at the current position in the stats // file. @@ -2185,11 +2121,129 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, ++s; ++i; } - assert(i == frame_count); return score_total; } +static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group, + int *index_counter, int depth, int start, int end) { + TWO_PASS *twopass = &cpi->twopass; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FIRSTPASS_STATS fpf_frame; + const int mid = (start + end + 1) >> 1; + const int min_frame_interval = 2; + int idx; + + // Process regular P frames + if ((end - start < min_frame_interval) || + (depth > gf_group->allowed_max_layer_depth)) { + for (idx = start; idx <= end; ++idx) { + gf_group->update_type[*index_counter] = LF_UPDATE; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = idx; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + gf_group->gfu_boost[*index_counter] = NORMAL_BOOST; + ++(*index_counter); + } + gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth); + return; + } + + assert(abs(mid - start) >= 1 && abs(mid - end) >= 1); + + // Process ARF frame + gf_group->layer_depth[*index_counter] = depth; + gf_group->update_type[*index_counter] = ARF_UPDATE; + gf_group->arf_src_offset[*index_counter] = mid - start; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = GF_ARF_LOW; + + for (idx = 0; idx <= mid; ++idx) + if (EOF == input_stats(twopass, &fpf_frame)) break; + + gf_group->gfu_boost[*index_counter] = + VPXMAX(MIN_ARF_GF_BOOST, + calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth); + + reset_fpf_position(twopass, start_pos); + + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1); + + gf_group->update_type[*index_counter] = USE_BUF_FRAME; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end); 
+} + +static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group, + int frame_index, + int source_alt_ref_active) { + if (source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; + gf_group->gfu_boost[frame_index] = NORMAL_BOOST; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 0; + } +} + +static void define_gf_group_structure(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frame_index = 0; + int key_frame = cpi->common.frame_type == KEY_FRAME; + int layer_depth = 1; + int gop_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + + gf_group->frame_start = cpi->common.current_video_frame; + gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval; + gf_group->max_layer_depth = 0; + gf_group->allowed_max_layer_depth = 0; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + if (!key_frame) + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active); + + ++frame_index; + + // === [frame_index == 1] === + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = layer_depth; + gf_group->arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + gf_group->max_layer_depth = 1; + ++frame_index; + ++layer_depth; + gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf; + } + + find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames); + + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending); + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + + // Set the frame ops number. + gf_group->gf_group_size = frame_index; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -2198,17 +2252,12 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS frame_stats; int i; - int frame_index = 1; + int frame_index = 0; int target_frame_size; int key_frame; const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; - int mid_boost_bits = 0; int mid_frame_idx; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; - int alt_frame_index = frame_index; - int has_temporal_layers = - is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; int last_frame_reduction = 0; @@ -2216,81 +2265,97 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, double tot_norm_frame_score = 1.0; double this_frame_score = 1.0; - // Only encode alt reference frame in temporal base layer. 
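Taken together, find_arf_order() and define_gf_group_structure() above lay the GOP out as a binary ARF pyramid: each call codes the midpoint of [start, end] as an ARF, recurses into the left half, re-lists the ARF as a USE_BUF_FRAME (show-existing-frame) entry, then recurses into the right half, with each ARF's gfu_boost attenuated by its layer depth (the >> depth). An illustrative encode order, assuming baseline_gf_interval = 16, an alt ref pending (so gop_frames = 15) and allowed_max_layer_depth = 3:

  idx 0:  GF/overlay entry for the previous group
  idx 1:  ARF, gop position 16, depth 1 (the top-level alt ref)
          ARF @ 8 (depth 2)
            ARF @ 4 (depth 3)
              LF 1 2 3 (depth 4) | show 4 | LF 5 6 7 (depth 4)
          show 8
            ARF @ 12 (depth 3)
              LF 9 10 11 (depth 4) | show 12 | LF 13 14 15 (depth 4)
  final:  overlay entry for position 16, via set_gf_overlay_frame_type()

where mid = (start + end + 1) >> 1 picks frames 8, 4 and 12, and "show n" is the USE_BUF_FRAME re-listing of an ARF that is already sitting in a reference buffer.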
- if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; + // The GF group structure has already been defined; get its frame count. + int gop_frames = gf_group->gf_group_size; - key_frame = - cpi->common.frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi); - - get_arf_buffer_indices(arf_buffer_indices); + key_frame = cpi->common.frame_type == KEY_FRAME; // For key frames the frame target rate is already set and it // is also the golden frame. + // === [frame_index == 0] === if (!key_frame) { - if (rc->source_alt_ref_active) { - gf_group->update_type[0] = OVERLAY_UPDATE; - gf_group->rf_level[0] = INTER_NORMAL; - gf_group->bit_allocation[0] = 0; - } else { - gf_group->update_type[0] = GF_UPDATE; - gf_group->rf_level[0] = GF_ARF_STD; - gf_group->bit_allocation[0] = gf_arf_bits; - } - gf_group->arf_update_idx[0] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; - - // Step over the golden frame / overlay frame - if (EOF == input_stats(twopass, &frame_stats)) return; + gf_group->bit_allocation[frame_index] = + rc->source_alt_ref_active ? 0 : gf_arf_bits; } // Deduct the boost bits for arf (or gf if it is not a key frame) // from the group total. if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + ++frame_index; + + // === [frame_index == 1] === // Store the bits to spend on the ARF if there is one. if (rc->source_alt_ref_pending) { - gf_group->update_type[alt_frame_index] = ARF_UPDATE; - gf_group->rf_level[alt_frame_index] = GF_ARF_STD; - gf_group->bit_allocation[alt_frame_index] = gf_arf_bits; + gf_group->bit_allocation[frame_index] = gf_arf_bits; - if (has_temporal_layers) - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - - cpi->svc.number_temporal_layers); - else - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - 1); - - gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[alt_frame_index] = - arf_buffer_indices[cpi->multi_arf_last_grp_enabled && - rc->source_alt_ref_active]; - if (!has_temporal_layers) ++frame_index; - - if (cpi->multi_arf_enabled) { - // Set aside a slot for a level 1 arf. - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = - (unsigned char)((rc->baseline_gf_interval >> 1) - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - ++frame_index; - } + ++frame_index; } - // Note index of the first normal inter frame int eh group (not gf kf arf) - gf_group->first_inter_index = frame_index; - // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); + normal_frames = (rc->baseline_gf_interval - 1); if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); else normal_frame_bits = (int)total_group_bits; + gf_group->gfu_boost[1] = rc->gfu_boost; + + if (cpi->multi_layer_arf) { + int idx; + int arf_depth_bits[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_count[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_boost[MAX_ARF_LAYERS] = { 0 }; + int total_arfs = 1; // Account for the base layer ARF.
+ + for (idx = 0; idx < gop_frames; ++idx) { + if (gf_group->update_type[idx] == ARF_UPDATE) { + arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx]; + ++arf_depth_count[gf_group->layer_depth[idx]]; + } + } + + for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) { + if (arf_depth_boost[idx] == 0) break; + arf_depth_bits[idx] = calculate_boost_bits( + rc->baseline_gf_interval - total_arfs - arf_depth_count[idx], + arf_depth_boost[idx], total_group_bits); + + total_group_bits -= arf_depth_bits[idx]; + total_arfs += arf_depth_count[idx]; + } + + // offset the base layer arf + normal_frames -= (total_arfs - 1); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + target_frame_size = normal_frame_bits; + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + // The first layer ARF has its bit allocation assigned. + for (idx = frame_index; idx < gop_frames; ++idx) { + switch (gf_group->update_type[idx]) { + case ARF_UPDATE: + gf_group->bit_allocation[idx] = + (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] * + gf_group->gfu_boost[idx]) / + arf_depth_boost[gf_group->layer_depth[idx]]); + break; + case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break; + default: gf_group->bit_allocation[idx] = target_frame_size; break; + } + } + gf_group->bit_allocation[idx] = 0; + + return; + } + if (oxcf->vbr_corpus_complexity) { av_score = get_distribution_av_err(cpi, twopass); tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); @@ -2298,13 +2363,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Allocate bits to the other frames in the group. for (i = 0; i < normal_frames; ++i) { - int arf_idx = 0; if (EOF == input_stats(twopass, &frame_stats)) break; - - if (has_temporal_layers && frame_index == alt_frame_index) { - ++frame_index; - } - if (oxcf->vbr_corpus_complexity) { this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, &frame_stats, av_score); @@ -2318,21 +2377,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, target_frame_size -= last_frame_reduction; } - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - mid_boost_bits += (target_frame_size >> 4); - target_frame_size -= (target_frame_size >> 4); - - if (frame_index <= mid_frame_idx) arf_idx = 1; - } - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; - target_frame_size = clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); - gf_group->update_type[frame_index] = LF_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->bit_allocation[frame_index] = target_frame_size; ++frame_index; } @@ -2344,27 +2391,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to // vp9_rc_get_second_pass_params() the data will be undefined. - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - - // Final setup for second arf and its overlay. 
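The per-depth budgets above come from calculate_boost_bits(), which shares the group budget out in proportion to boost, counting each ordinary frame at NORMAL_BOOST (100). A worked example with purely illustrative numbers (frame_count = 15, boost = 400, total_group_bits = 1,900,000):

  allocation_chunks = 15 * 100 + 400 = 1,900
  boosted bits      = 400 * 1,900,000 / 1,900 = 400,000

i.e. the boosted frame is budgeted like four normal frames. In the multi-layer loop, each ARF depth receives such a chunk sized by that depth's summed gfu_boost, and the switch that follows splits a depth's chunk across its ARFs pro rata by each frame's own gfu_boost.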
- if (cpi->multi_arf_enabled) { - gf_group->bit_allocation[2] = - gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; - gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; - gf_group->bit_allocation[mid_frame_idx] = 0; - } - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - // Note whether multi-arf was enabled this group for next time. - cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } // Adjusts the ARNF filter for a GF group. @@ -2376,15 +2402,19 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, twopass->arnr_strength_adjustment = 0; - if ((section_zeromv < 0.10) || (section_noise <= (SECTION_NOISE_DEF * 0.75))) + if (section_noise < 150) { twopass->arnr_strength_adjustment -= 1; + if (section_noise < 75) twopass->arnr_strength_adjustment -= 1; + } else if (section_noise > 250) + twopass->arnr_strength_adjustment += 1; + if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1; } // Analyse and define a gf/arf group. -#define ARF_DECAY_BREAKOUT 0.10 #define ARF_ABS_ZOOM_THRESH 4.0 +#define MAX_GF_BOOST 5400 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2425,6 +2455,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int gf_arf_bits; const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int is_alt_ref_flash = 0; + + double gop_intra_factor = 1.0; + int gop_frames; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. @@ -2465,36 +2499,51 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { { int int_max_q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, cpi->common.bit_depth)); - int int_lbq = (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, - cpi->common.bit_depth)); + int q_term = (cm->current_video_frame == 0) + ? int_max_q / 32 + : (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, + cpi->common.bit_depth) / + 6); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); active_min_gf_interval = VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf); - if (cpi->multi_arf_allowed) { - active_max_gf_interval = rc->max_gf_interval; + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = 11 + arf_active_or_kf + VPXMIN(5, q_term); + + // Force max GF interval to be odd. + active_max_gf_interval = active_max_gf_interval | 0x01; + + // We have: active_min_gf_interval <= + // rc->max_gf_interval + arf_active_or_kf. + if (active_max_gf_interval < active_min_gf_interval) { + active_max_gf_interval = active_min_gf_interval; } else { - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= - // rc->max_gf_interval + arf_active_or_kf. 
- if (active_max_gf_interval < active_min_gf_interval) { - active_max_gf_interval = active_min_gf_interval; - } else { - active_max_gf_interval = VPXMIN(active_max_gf_interval, - rc->max_gf_interval + arf_active_or_kf); - } - - // Would the active max drop us out just before the near the next kf? - if ((active_max_gf_interval <= rc->frames_to_key) && - (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) - active_max_gf_interval = rc->frames_to_key / 2; + active_max_gf_interval = VPXMIN(active_max_gf_interval, + rc->max_gf_interval + arf_active_or_kf); } + + // Would the active max drop us out just before the next kf? + if ((active_max_gf_interval <= rc->frames_to_key) && + (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) + active_max_gf_interval = rc->frames_to_key / 2; + } + active_max_gf_interval = + VPXMAX(active_max_gf_interval, active_min_gf_interval); + + if (cpi->multi_layer_arf) { + int layers = 0; + int max_layers = VPXMIN(MAX_ARF_LAYERS, cpi->oxcf.enable_auto_arf); + + // Adapt the intra_error factor to the active_max_gf_interval limit. + for (i = active_max_gf_interval; i > 0; i >>= 1) ++layers; + + layers = VPXMIN(max_layers, layers); + gop_intra_factor += (layers * 0.25); } i = 0; @@ -2523,15 +2572,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + // Monitor for static sections. + if ((rc->frames_since_key + i - 1) > 1) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } + // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, @@ -2551,18 +2602,27 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - if ( - // Break at active_max_gf_interval unless almost totally static. - ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // Break at the maximum of active_max_gf_interval unless almost totally static. + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if (((i >= active_max_gf_interval) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || ( // Don't break out with a very short interval.
(i >= active_min_gf_interval) && // If possible dont break very close to a kf - ((rc->frames_to_key - i) >= rc->min_gf_interval) && + ((rc->frames_to_key - i) >= rc->min_gf_interval) && (i & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (sr_accumulator > next_frame.intra_error)))) { + (sr_accumulator > gop_intra_factor * next_frame.intra_error)))) { break; } @@ -2573,8 +2633,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { + if ((zero_motion_accumulator < 0.995) && allow_alt_ref && + (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2582,18 +2643,23 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Calculate the boost for alt ref. rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) - ? 1 - : 0; } else { - rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); + reset_fpf_position(twopass, start_pos); + rc->gfu_boost = VPXMIN(MAX_GF_BOOST, calc_arf_boost(cpi, (i - 1), 0)); rc->source_alt_ref_pending = 0; } +#define LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR 0.2 + rc->arf_active_best_quality_adjustment_factor = 1.0; + if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf) && + rc->frames_to_key <= rc->arf_active_best_quality_adjustment_window) { + rc->arf_active_best_quality_adjustment_factor = + LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR + + (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) * + (rc->frames_to_key - i) / + VPXMAX(1, (rc->arf_active_best_quality_adjustment_window - i)); + } + #ifdef AGGRESSIVE_VBR // Limit maximum boost based on interval length. rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 140); @@ -2601,53 +2667,47 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + // Cap the ARF boost when perceptual quality AQ mode is enabled. This is + // designed to improve the perceptual quality of high value content and to + // make consistent quality across consecutive frames. It will hurt objective + // quality. + if (oxcf->aq_mode == PERCEPTUAL_AQ) + rc->gfu_boost = VPXMIN(rc->gfu_boost, MIN_ARF_GF_BOOST); - // Only encode alt reference frame in temporal base layer. 
So - // baseline_gf_interval should be multiple of a temporal layer group - // (typically the frame distance between two base layer frames) - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_gf_interval = (rc->baseline_gf_interval + count) & (~count); - int j; - for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); - gf_group_raw_error += this_frame->coded_error; - gf_group_noise += this_frame->frame_noise_energy; - gf_group_skip_pct += this_frame->intra_skip_pct; - gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; - gf_group_inter += this_frame->pcnt_inter; - gf_group_motion += this_frame->pcnt_motion; - } - rc->baseline_gf_interval = new_gf_interval; - } - - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->baseline_gf_interval = i - rc->source_alt_ref_pending; // Reset the file position. reset_fpf_position(twopass, start_pos); + if (rc->source_alt_ref_pending) + is_alt_ref_flash = detect_flash(twopass, rc->baseline_gf_interval); + // Calculate the bits to be allocated to the gf/arf group as a whole gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; + + // Store the average noise level measured for the group + // TODO(any): Experiment with removal of else condition (gop_frames = 0) so + // that consumption of group noise energy is based on previous gf group + if (gop_frames > 0) + twopass->gf_group.group_noise_energy = (int)(gf_group_noise / gop_frames); + else + twopass->gf_group.group_noise_energy = 0; + // Calculate an estimate of the maxq needed for the group. // We are more aggressive about correcting for sections // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget.
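The gop_frames value computed above is intended to count coded frames in the group: a pending alt ref adds one coded frame to baseline_gf_interval, while arf_active_or_kf subtracts the slot already accounted to a key frame or to the previous group's still-active ARF. With, say, baseline_gf_interval = 12, an ARF pending for this group and an ARF still active from the last group:

  gop_frames = 12 + 1 - 1 = 12

The group noise bookkeeping here, and the per-frame averages in the next hunk, are therefore normalized by coded frames rather than by the raw interval length used previously.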
if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) { - const int vbr_group_bits_per_frame = - (int)(gf_group_bits / rc->baseline_gf_interval); - const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; - const double group_av_noise = gf_group_noise / rc->baseline_gf_interval; - const double group_av_skip_pct = - gf_group_skip_pct / rc->baseline_gf_interval; - const double group_av_inactive_zone = - ((gf_group_inactive_zone_rows * 2) / - (rc->baseline_gf_interval * (double)cm->mb_rows)); + const int vbr_group_bits_per_frame = (int)(gf_group_bits / gop_frames); + const double group_av_err = gf_group_raw_error / gop_frames; + const double group_av_noise = gf_group_noise / gop_frames; + const double group_av_skip_pct = gf_group_skip_pct / gop_frames; + const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) / + (gop_frames * (double)cm->mb_rows)); int tmp_q = get_twopass_worst_quality( cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), group_av_noise, vbr_group_bits_per_frame); @@ -2663,20 +2723,23 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Context Adjustment of ARNR filter strength if (rc->baseline_gf_interval > 1) { - adjust_group_arnr_filter(cpi, (gf_group_noise / rc->baseline_gf_interval), - (gf_group_inter / rc->baseline_gf_interval), - (gf_group_motion / rc->baseline_gf_interval)); + adjust_group_arnr_filter(cpi, (gf_group_noise / gop_frames), + (gf_group_inter / gop_frames), + (gf_group_motion / gop_frames)); } else { twopass->arnr_strength_adjustment = 0; } // Calculate the extra bits to be used for boosted frame(s) - gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, - gf_group_bits); + gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1), + rc->gfu_boost, gf_group_bits); // Adjust KF group bits and error remaining. twopass->kf_group_error_left -= gf_group_err; + // Decide GOP structure. + define_gf_group_structure(cpi); + // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2684,10 +2747,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { reset_fpf_position(twopass, start_pos); // Calculate a section intra ratio used in setting max loop filter. - if (cpi->common.frame_type != KEY_FRAME) { - twopass->section_intra_rating = calculate_section_intra_ratio( - start_pos, twopass->stats_in_end, rc->baseline_gf_interval); - } + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_in_end, rc->baseline_gf_interval); if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to starting GF groups at normal frame size. @@ -2698,19 +2759,82 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; #endif + rc->preserve_arf_as_gld = rc->preserve_next_arf_as_gld; + rc->preserve_next_arf_as_gld = 0; + // If the alt ref frame is a flash do not set preserve_arf_as_gld + if (!is_lossless_requested(&cpi->oxcf) && !cpi->use_svc && + cpi->oxcf.aq_mode == NO_AQ && cpi->multi_layer_arf && !is_alt_ref_flash) + rc->preserve_next_arf_as_gld = 1; +} + +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// For clean slide transitions we expect a sharp single frame spike in error. +#define ERROR_SPIKE 5.0 + +// Slide show transition detection.
+// Tests for the case where there is very low error on either side of the current frame +// but much higher just for this frame. This can help detect key frames in +// slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. +// It will not help if the transition is a fade or other multi-frame effect. +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); +} + +// This test looks for anomalous changes in the nature of the intra signal +// related to the previous and next frame as an indicator for coding a key +// frame. This test serves to detect some additional scene cuts, +// especially in lowish motion and low contrast sections, that are missed +// by the other tests. +static int intra_step_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + double last_ii_ratio; + double this_ii_ratio; + double next_ii_ratio; + double last_pcnt_intra = 1.0 - last_frame->pcnt_inter; + double this_pcnt_intra = 1.0 - this_frame->pcnt_inter; + double next_pcnt_intra = 1.0 - next_frame->pcnt_inter; + double mod_this_intra = this_pcnt_intra + this_frame->pcnt_neutral; + + // Calculate the ii ratio for this frame, the last frame and the next frame. + last_ii_ratio = + last_frame->intra_error / DOUBLE_DIVIDE_CHECK(last_frame->coded_error); + this_ii_ratio = + this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + next_ii_ratio = + next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); + + // Return true if the intra/inter ratio for the current frame is + // low but better in the next and previous frame and the relative usage of + // intra in the current frame is markedly higher than the last and next frame. + if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && + (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && + (this_pcnt_intra > (3 * next_pcnt_intra)) && + ((this_pcnt_intra > 0.075) || (mod_this_intra > 0.85))) { + return 1; + // Very low inter intra ratio (i.e. not much gain from inter coding), most + // blocks neutral on coding method and better inter prediction on either side + } else if ((this_ii_ratio < 1.25) && (mod_this_intra > 0.85) && + (this_ii_ratio < last_ii_ratio * 0.9) && + (this_ii_ratio < next_ii_ratio * 0.9)) { + return 1; + } else { + return 0; + } } -// Threshold for use of the lagging second reference frame. High second ref -// usage may point to a transient event like a flash or occlusion rather than -// a real scene cut. -#define SECOND_REF_USEAGE_THRESH 0.1 // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 -// Minimum ratio between the % of intra coding and inter coding in the first -// pass after discounting neutral blocks (discounting neutral blocks in this -// way helps catch scene cuts in clips with very flat areas or letter box -// format clips with image padding. -#define INTRA_VS_INTER_THRESH 2.0 +// Threshold for use of the lagging second reference frame. Scene cuts do not +// usually have a high second ref usage.
+#define SECOND_REF_USEAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -2718,12 +2842,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Maximum threshold for the relative ratio of intra error score vs best // inter error score. #define KF_II_ERR_THRESHOLD 2.5 -// In real scene cuts there is almost always a sharp change in the intra -// or inter error score. -#define ERR_CHANGE_THRESHOLD 0.4 -// For real scene cuts we expect an improvment in the intra inter error -// ratio in the next frame. -#define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 #define II_FACTOR 12.5 // Test for very low intra complexity which could cause false key frames @@ -2735,29 +2853,22 @@ static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *next_frame) { int is_viable_kf = 0; double pcnt_intra = 1.0 - this_frame->pcnt_inter; - double modified_pcnt_inter = - this_frame->pcnt_inter - this_frame->pcnt_neutral; // Does the frame satisfy the primary criteria of a key frame? // See above for an explanation of the test criteria. // If so, then examine how well it predicts subsequent frames. - if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && - (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + if (!detect_flash(twopass, -1) && !detect_flash(twopass, 0) && + (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - ((pcnt_intra > MIN_INTRA_LEVEL) && - (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + (slide_transition(this_frame, last_frame, next_frame)) || + (intra_step_transition(this_frame, last_frame, next_frame)) || + (((this_frame->coded_error > (next_frame->coded_error * 1.2)) && + (this_frame->coded_error > (last_frame->coded_error * 1.2))) && + (pcnt_intra > MIN_INTRA_LEVEL) && + ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < - KF_II_ERR_THRESHOLD) && - ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - ERR_CHANGE_THRESHOLD) || - (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - ERR_CHANGE_THRESHOLD) || - ((next_frame->intra_error / - DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > - II_IMPROVEMENT_THRESHOLD))))) { + KF_II_ERR_THRESHOLD)))) { int i; const FIRSTPASS_STATS *start_pos = twopass->stats_in; FIRSTPASS_STATS local_next_frame = *next_frame; @@ -2814,7 +2925,10 @@ static int test_candidate_kf(TWO_PASS *twopass, #define FRAMES_TO_CHECK_DECAY 8 #define MIN_KF_TOT_BOOST 300 -#define KF_BOOST_SCAN_MAX_FRAMES 32 +#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32 +#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48 +#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32 +#define KF_ABS_ZOOM_THRESH 6.0 #ifdef AGGRESSIVE_VBR #define KF_MAX_FRAME_BOOST 80.0 @@ -2835,17 +2949,27 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; + int64_t max_kf_bits; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; + double zero_motion_sum = 0.0; + double zero_motion_avg; + double motion_compensable_sum = 0.0; + double motion_compensable_avg; + int num_frames = 0; + int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST; double boost_score = 0.0; 
double kf_mod_err = 0.0; + double kf_raw_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double sr_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; // Reset the GF group data structures. vp9_zero(*gf_group); @@ -2856,7 +2980,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clear the alt ref active flag and last group multi arf flags as they // can never be set for a key frame. rc->source_alt_ref_active = 0; - cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; @@ -2866,6 +2989,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0.0; // Group modified error score. + kf_raw_err = this_frame->intra_error; kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); @@ -2950,18 +3074,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->next_key_frame_forced = 0; } - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_frame_to_key = (rc->frames_to_key + count) & (~count); - int j; - for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); - } - rc->frames_to_key = new_frame_to_key; - } - // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. @@ -2998,16 +3110,46 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // how many bits to spend on it. boost_score = 0.0; + for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1)); + ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion; + motion_compensable_sum += + 1 - (double)next_frame.coded_error / next_frame.intra_error; + num_frames++; + } + + if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) { + zero_motion_avg = zero_motion_sum / num_frames; + motion_compensable_avg = motion_compensable_sum / num_frames; + kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16, + 160 * motion_compensable_avg - 112)); + kf_boost_scan_frames = + VPXMAX(VPXMIN(kf_boost_scan_frames, MAX_SCAN_FRAMES_FOR_KF_BOOST), + MIN_SCAN_FRAMES_FOR_KF_BOOST); + } + reset_fpf_position(twopass, start_position); + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + // The zero motion test here ensures that if we mark a kf group as static + // it is static throughout, not just in the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) { double frame_boost; double zm_factor; // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + // For the first frame in the kf group the second ref indicator is invalid.
+ if (i > 0) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -3021,7 +3163,15 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { KF_MAX_FRAME_BOOST * zm_factor); boost_score += frame_boost; - if (frame_boost < 25.00) break; + + // Measure of zoom. Large zoom tends to indicate reduced boost. + abs_mv_in_out_accumulator += + fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion); + + if ((frame_boost < 25.00) || + (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) || + (sr_accumulator > (kf_raw_err * 1.50))) + break; } else { break; } @@ -3033,17 +3183,30 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); // Calculate a section intra ratio used in setting max loop filter. - twopass->section_intra_rating = calculate_section_intra_ratio( + twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = MAX_KF_TOT_BOOST; + } else { + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + } // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); + // Based on the spatial complexity, increase the bits allocated to the key frame. + kf_bits += + (int)((twopass->kf_group_bits - kf_bits) * (kf_mod_err / kf_group_err)); + max_kf_bits = + twopass->kf_group_bits - (rc->frames_to_key - 1) * FRAME_OVERHEAD_BITS; + max_kf_bits = lclamp(max_kf_bits, 0, INT_MAX); + kf_bits = VPXMIN(kf_bits, (int)max_kf_bits); twopass->kf_group_bits -= kf_bits; @@ -3064,51 +3227,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to normal-sized frame on keyframes. cpi->rc.next_frame_size_selector = UNSCALED; } -} - -// Define the reference buffers that will be updated post encode.
-static void configure_buffer_updates(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - - cpi->rc.is_src_frame_alt_ref = 0; - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; - break; - case LF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - case GF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - break; - case OVERLAY_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - cpi->rc.is_src_frame_alt_ref = 1; - break; - case ARF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - default: assert(0); break; - } - if (is_two_pass_svc(cpi)) { - if (cpi->svc.temporal_layer_id > 0) { - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - } - if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) - cpi->refresh_golden_frame = 0; - if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0; - } +#define ARF_ACTIVE_BEST_QUALITY_ADJUSTMENT_WINDOW_SIZE 64 + // TODO(ravi.chaudhary@ittiam.com): Experiment without the below min + // condition. This might be helpful for small key frame intervals. + rc->arf_active_best_quality_adjustment_window = + VPXMIN(ARF_ACTIVE_BEST_QUALITY_ADJUSTMENT_WINDOW_SIZE, rc->frames_to_key); } static int is_skippable_frame(const VP9_COMP *cpi) { @@ -3116,10 +3239,7 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned // according to the variance - const SVC *const svc = &cpi->svc; - const TWO_PASS *const twopass = - is_two_pass_svc(cpi) ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + const TWO_PASS *const twopass = &cpi->twopass; return (!frame_is_intra_only(&cpi->common) && twopass->stats_in - 2 > twopass->stats_in_start && @@ -3140,41 +3260,38 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS this_frame; - int target_rate; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : 0; - if (!twopass->stats_in) return; // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; - configure_buffer_updates(cpi); + + vp9_zero(this_frame); + this_frame = + cpi->twopass.stats_in_start[cm->current_video_frame + + gf_group->arf_src_offset[gf_group->index]]; + + vp9_configure_buffer_updates(cpi, gf_group->index); + target_rate = gf_group->bit_allocation[gf_group->index]; target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; cm->frame_type = INTER_FRAME; - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - } - } - // Do the firstpass stats indicate that this frame is skippable for the // partition search? 
if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { + !cpi->use_svc) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; + return; } @@ -3182,12 +3299,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; - } else if (cm->current_video_frame == 0 || - (lc != NULL && lc->current_video_frame_in_layer == 0)) { + } else if (cm->current_video_frame == 0) { const int frames_left = - (int)(twopass->total_stats.count - - ((lc != NULL) ? lc->current_video_frame_in_layer - : cm->current_video_frame)); + (int)(twopass->total_stats.count - cm->current_video_frame); // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -3236,59 +3350,36 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; } - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = (cm->frame_type == KEY_FRAME); - if (lc->is_key_frame) { - cpi->ref_frame_flags &= - (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - lc->frames_from_key_frame = 0; - // Encode an intra only empty frame since we have a key frame. - cpi->svc.encode_intra_empty_frame = 1; - } - } else { - cm->frame_type = INTER_FRAME; - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) { - cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - lc->frames_from_key_frame = 0; - } - } - } - // Define a new GF/ARF group. (Should always enter here for key frames). if (rc->frames_till_gf_update_due == 0) { define_gf_group(cpi, &this_frame); rc->frames_till_gf_update_due = rc->baseline_gf_interval; - if (lc != NULL) cpi->refresh_golden_frame = 1; #if ARF_STATS_OUTPUT { FILE *fpfile; fpfile = fopen("arf.stt", "a"); ++arf_count; - fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->frames_till_gf_update_due, + rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); fclose(fpfile); } #endif } - configure_buffer_updates(cpi); + vp9_configure_buffer_updates(cpi, gf_group->index); // Do the firstpass stats indicate that this frame is skippable for the // partition search? if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { + !cpi->use_svc) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } - target_rate = gf_group->bit_allocation[gf_group->index]; - rc->base_frame_target = target_rate; + rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. 
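A minimal sketch (aside, not part of the patch) of the energy measure assigned above: first-pass MB error values are combined with a (>> 8) scale, so the frame total is multiplied by 256 to undo that before taking the log, and the +1.0 keeps log() finite for a zero-error frame. The helper name is hypothetical.

    #include <math.h>

    /* Hypothetical helper mirroring "mb_av_energy = log(intra_error * 256.0 + 1.0)". */
    static double mb_av_energy(double frame_intra_error) {
      return log(frame_intra_error * 256.0 + 1.0);
    }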
@@ -3329,8 +3420,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
     rc->rate_error_estimate = 0;
   }

-  if (cpi->common.frame_type != KEY_FRAME &&
-      !vp9_is_upper_layer_key_frame(cpi)) {
+  if (cpi->common.frame_type != KEY_FRAME) {
     twopass->kf_group_bits -= bits_used;
     twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
   }
@@ -3350,7 +3440,8 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {

     // Extend min or Max Q range to account for imbalance from the base
     // value when using AQ.
-    if (cpi->oxcf.aq_mode != NO_AQ) {
+    if (cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != PSNR_AQ &&
+        cpi->oxcf.aq_mode != PERCEPTUAL_AQ) {
       if (cm->seg.aq_av_offset < 0) {
         // The balance of the AQ map tends towards lowering the average Q.
         aq_extend_min = 0;
diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.h b/libs/libvpx/vp9/encoder/vp9_firstpass.h
index 000ecd7792..a0a96e6ef6 100644
--- a/libs/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/libs/libvpx/vp9/encoder/vp9_firstpass.h
@@ -8,8 +8,10 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
-#define VP9_ENCODER_VP9_FIRSTPASS_H_
+#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
+#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
+
+#include <assert.h>

 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_ratectrl.h"
@@ -39,7 +41,10 @@ typedef struct {
 } FIRSTPASS_MB_STATS;
 #endif

-#define INVALID_ROW -1
+#define INVALID_ROW (-1)
+
+#define MAX_ARF_LAYERS 6
+#define SECTION_NOISE_DEF 250.0

 typedef struct {
   double frame_mb_intra_factor;
@@ -107,7 +112,9 @@ typedef enum {
   GF_UPDATE = 2,
   ARF_UPDATE = 3,
   OVERLAY_UPDATE = 4,
-  FRAME_UPDATE_TYPES = 5
+  MID_OVERLAY_UPDATE = 5,
+  USE_BUF_FRAME = 6,  // Use show existing frame, no ref buffer update
+  FRAME_UPDATE_TYPES = 7
 } FRAME_UPDATE_TYPE;

 #define FC_ANIMATION_THRESH 0.15
@@ -119,17 +126,29 @@ typedef enum {

 typedef struct {
   unsigned char index;
-  unsigned char first_inter_index;
-  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
-  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
-  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+  RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2];
+
+  int frame_start;
+  int frame_end;
+  // TODO(jingning): The array size of arf_stack could be reduced.
+ int arf_index_stack[MAX_LAG_BUFFERS * 2]; + int top_arf_idx; + int stack_size; + int gf_group_size; + int max_layer_depth; + int allowed_max_layer_depth; + int group_noise_energy; } GF_GROUP; typedef struct { unsigned int section_intra_rating; + unsigned int key_frame_section_intra_rating; FIRSTPASS_STATS total_stats; FIRSTPASS_STATS this_frame_stats; const FIRSTPASS_STATS *stats_in; @@ -182,7 +201,6 @@ struct ThreadData; struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); -void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); @@ -194,7 +212,6 @@ void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); -void vp9_twopass_postencode_update(struct VP9_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); @@ -206,4 +223,4 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_FIRSTPASS_H_ +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_job_queue.h b/libs/libvpx/vp9/encoder/vp9_job_queue.h index 89c08f207a..ad09c11198 100644 --- a/libs/libvpx/vp9/encoder/vp9_job_queue.h +++ b/libs/libvpx/vp9/encoder/vp9_job_queue.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_ -#define VP9_ENCODER_VP9_JOB_QUEUE_H_ +#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ typedef enum { FIRST_PASS_JOB, @@ -43,4 +43,4 @@ typedef struct { int num_jobs_acquired; } JobQueueHandle; -#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_ +#endif // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_lookahead.h b/libs/libvpx/vp9/encoder/vp9_lookahead.h index 88be0ffcd5..c627bede23 100644 --- a/libs/libvpx/vp9/encoder/vp9_lookahead.h +++ b/libs/libvpx/vp9/encoder/vp9_lookahead.h @@ -8,17 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_ -#define VP9_ENCODER_VP9_LOOKAHEAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ +#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_encoder.h" #include "vpx/vpx_integer.h" -#if CONFIG_SPATIAL_SVC -#include "vpx/vp8cx.h" -#endif - #ifdef __cplusplus extern "C" { #endif @@ -115,4 +111,4 @@ unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_LOOKAHEAD_H_ +#endif // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.c b/libs/libvpx/vp9/encoder/vp9_mbgraph.c index 46d626def1..831c79c175 100644 --- a/libs/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.c @@ -57,11 +57,12 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, { uint32_t distortion; uint32_t sse; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. 
cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + &v_fn_ptr, 0, mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + 0, USE_2_TAPS); } xd->mi[0]->mode = NEWMV; diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.h b/libs/libvpx/vp9/encoder/vp9_mbgraph.h index df2fb98efa..7b629861d5 100644 --- a/libs/libvpx/vp9/encoder/vp9_mbgraph.h +++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MBGRAPH_H_ -#define VP9_ENCODER_VP9_MBGRAPH_H_ +#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_ +#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_ #ifdef __cplusplus extern "C" { @@ -25,7 +25,9 @@ typedef struct { } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct VP9_COMP; @@ -35,4 +37,4 @@ void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MBGRAPH_H_ +#endif // VPX_VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.c b/libs/libvpx/vp9/encoder/vp9_mcomp.c index 44f01be25a..d1688f9938 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.c @@ -29,11 +29,6 @@ // #define NEW_DIAMOND_SEARCH -static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, - const MV *mv) { - return &buf->buf[mv->row * buf->stride + mv->col]; -} - void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); @@ -263,27 +258,6 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of -// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten -// later in the same way. 
-#define SECOND_LEVEL_CHECKS_BEST \ - { \ - unsigned int second; \ - int br0 = br; \ - int bc0 = bc; \ - assert(tr == br || tc == bc); \ - if (tr == br && tc != bc) { \ - kc = bc - tc; \ - } else if (tr != br && tc == bc) { \ - kr = br - tr; \ - } \ - CHECK_BETTER(second, br0 + kr, bc0); \ - CHECK_BETTER(second, br0, bc0 + kc); \ - if (br0 != br || bc0 != bc) { \ - CHECK_BETTER(second, br0 + kr, bc0 + kc); \ - } \ - } - #define SETUP_SUBPEL_SEARCH \ const uint8_t *const z = x->plane[0].src.buf; \ const int src_stride = x->plane[0].src.stride; \ @@ -329,8 +303,8 @@ static unsigned int setup_center_error( if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, CONVERT_TO_SHORTPTR(y + offset), y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { @@ -388,14 +362,12 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { *ir = (int)divide_and_round(x1 * b, y1); } -uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, - const MV *ref_mv, int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - uint32_t *distortion, uint32_t *sse1, - const uint8_t *second_pred, int w, int h) { +uint32_t vp9_skip_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -418,6 +390,7 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)sse; (void)thismse; (void)cost_list; + (void)use_accurate_subpel_search; return besterr; } @@ -427,7 +400,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -439,6 +412,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; + (void)use_accurate_subpel_search; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && @@ -492,8 +466,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -552,8 +528,10 @@ uint32_t 
vp9_find_best_sub_pixel_tree_pruned( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -638,12 +616,119 @@ static const MV search_step_table[12] = { }; /* clang-format on */ +static int accurate_sub_pel_search( + const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf, + const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src_address, const int src_stride, + const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred, + int w, int h, uint32_t *sse) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride, + pred16, w, this_mv, sf, w, h, 0, kernel, + MV_PRECISION_Q3, 0, 0, xd->bd); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, pred16, w); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address, + src_stride, sse); + } else { + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse); + } + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + } + if (besterr >= UINT_MAX) return UINT_MAX; + return (int)besterr; +#else + int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + (void)xd; + + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// TODO(yunqing): this part can be further refactored. 
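accurate_sub_pel_search() above relies on vpx_comp_avg_pred() for the compound (second_pred != NULL) case. A minimal sketch (aside, not part of the patch) of its per-pixel behaviour, matching the C reference implementation as far as I can tell: a rounded average of the two predictors, with comp_pred and pred tightly packed (stride == width). The function name below is hypothetical.

    #include <stdint.h>

    /* Illustrative re-statement of the comp-avg averaging, not the real kernel. */
    static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                     int width, int height, const uint8_t *ref,
                                     int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j) {
          comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1); /* (a+b+1)>>1 */
        }
        comp_pred += width;
        pred += width;
        ref += ref_stride;
      }
    }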
+#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#endif + uint32_t vp9_find_best_sub_pixel_tree( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { const uint8_t *const z = x->plane[0].src.buf; const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; @@ -671,6 +756,17 @@ uint32_t vp9_find_best_sub_pixel_tree( int kr, kc; MvLimits subpel_mv_limits; + // TODO(yunqing): need to add 4-tap filter optimization to speed up the + // encoder. + const InterpKernel *kernel = + (use_accurate_subpel_search > 0) + ? ((use_accurate_subpel_search == USE_4_TAPS) + ? vp9_filter_kernels[FOURTAP] + : ((use_accurate_subpel_search == USE_8_TAPS) + ? 
vp9_filter_kernels[EIGHTTAP] + : vp9_filter_kernels[EIGHTTAP_SHARP])) + : vp9_filter_kernels[BILINEAR]; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); minc = subpel_mv_limits.col_min; maxc = subpel_mv_limits.col_max; @@ -695,16 +791,25 @@ uint32_t vp9_find_best_sub_pixel_tree( tr = br + search_step[idx].row; tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv; this_mv.row = tr; this_mv.col = tc; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -726,14 +831,21 @@ uint32_t vp9_find_best_sub_pixel_tree( tc = bc + kc; tr = br + kr; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv = { tr, tc }; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse, second_pred); + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, y_stride, + second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -755,10 +867,48 @@ uint32_t vp9_find_best_sub_pixel_tree( bc = tc; } - if (iters_per_step > 1 && best_idx != -1) SECOND_LEVEL_CHECKS_BEST; + if (iters_per_step > 0 && best_idx != -1) { + unsigned int second; + const int br0 = br; + const int bc0 = bc; + assert(tr == br || tc == bc); - tr = br; - tc = bc; + if (tr == br && tc != bc) { + kc = bc - tc; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0, bc0 + kc); + } else { + CHECK_BETTER(second, br0, bc0 + kc); + } + } + } else if (tr != br && tc == bc) { + kr = br - tr; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + } else { + CHECK_BETTER(second, br0 + kr, bc0); + } + } + } + + if (iters_per_step > 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + CHECK_BETTER1(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER1(second, br0 + kr, bc0 + kc); + } + } else { + CHECK_BETTER(second, br0 + kr, bc0); + CHECK_BETTER(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { 
+ CHECK_BETTER(second, br0 + kr, bc0 + kc); + } + } + } + } search_step += 4; hstep >>= 1; @@ -780,6 +930,7 @@ uint32_t vp9_find_best_sub_pixel_tree( } #undef CHECK_BETTER +#undef CHECK_BETTER1 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -1490,7 +1641,7 @@ static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, // Exhuastive motion search around a given centre position with a given // step size. -static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, +static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int range, int step, int sad_per_bit, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { @@ -1576,6 +1727,510 @@ static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, return best_sad; } +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 +#if CONFIG_NON_GREEDY_MV + +#define LOG2_TABLE_SIZE 1024 +static const int log2_table[LOG2_TABLE_SIZE] = { + 0, // This is a dummy value + 0, 1048576, 1661954, 2097152, 2434718, 2710530, 2943725, + 3145728, 3323907, 3483294, 3627477, 3759106, 3880192, 3992301, + 4096672, 4194304, 4286015, 4372483, 4454275, 4531870, 4605679, + 4676053, 4743299, 4807682, 4869436, 4928768, 4985861, 5040877, + 5093962, 5145248, 5194851, 5242880, 5289431, 5334591, 5378443, + 5421059, 5462508, 5502851, 5542146, 5580446, 5617800, 5654255, + 5689851, 5724629, 5758625, 5791875, 5824409, 5856258, 5887450, + 5918012, 5947969, 5977344, 6006160, 6034437, 6062195, 6089453, + 6116228, 6142538, 6168398, 6193824, 6218829, 6243427, 6267632, + 6291456, 6314910, 6338007, 6360756, 6383167, 6405252, 6427019, + 6448477, 6469635, 6490501, 6511084, 6531390, 6551427, 6571202, + 6590722, 6609993, 6629022, 6647815, 6666376, 6684713, 6702831, + 6720734, 6738427, 6755916, 6773205, 6790299, 6807201, 6823917, + 6840451, 6856805, 6872985, 6888993, 6904834, 6920510, 6936026, + 6951384, 6966588, 6981641, 6996545, 7011304, 7025920, 7040397, + 7054736, 7068940, 7083013, 7096956, 7110771, 7124461, 7138029, + 7151476, 7164804, 7178017, 7191114, 7204100, 7216974, 7229740, + 7242400, 7254954, 7267405, 7279754, 7292003, 7304154, 7316208, + 7328167, 7340032, 7351805, 7363486, 7375079, 7386583, 7398000, + 7409332, 7420579, 7431743, 7442826, 7453828, 7464751, 7475595, + 7486362, 7497053, 7507669, 7518211, 7528680, 7539077, 7549404, + 7559660, 7569847, 7579966, 7590017, 7600003, 7609923, 7619778, + 7629569, 7639298, 7648964, 7658569, 7668114, 7677598, 7687023, + 7696391, 7705700, 7714952, 7724149, 7733289, 7742375, 7751407, + 7760385, 7769310, 7778182, 7787003, 7795773, 7804492, 7813161, + 7821781, 7830352, 7838875, 7847350, 7855777, 7864158, 7872493, + 7880782, 7889027, 7897226, 7905381, 7913492, 7921561, 7929586, + 7937569, 7945510, 7953410, 7961268, 7969086, 7976864, 7984602, + 7992301, 7999960, 8007581, 8015164, 8022709, 8030217, 8037687, + 8045121, 8052519, 8059880, 8067206, 8074496, 8081752, 8088973, + 8096159, 8103312, 8110431, 8117516, 8124569, 8131589, 8138576, + 8145532, 8152455, 8159347, 8166208, 8173037, 8179836, 8186605, + 8193343, 8200052, 8206731, 8213380, 8220001, 8226593, 8233156, + 8239690, 8246197, 8252676, 8259127, 8265550, 8271947, 8278316, + 8284659, 8290976, 8297266, 8303530, 8309768, 8315981, 8322168, + 8328330, 8334467, 8340579, 8346667, 8352730, 8358769, 8364784, + 8370775, 8376743, 8382687, 8388608, 8394506, 8400381, 8406233, + 8412062, 8417870, 8423655, 8429418, 8435159, 8440878, 8446576, + 8452252, 8457908, 8463542, 
8469155, 8474748, 8480319, 8485871, + 8491402, 8496913, 8502404, 8507875, 8513327, 8518759, 8524171, + 8529564, 8534938, 8540293, 8545629, 8550947, 8556245, 8561525, + 8566787, 8572031, 8577256, 8582464, 8587653, 8592825, 8597980, + 8603116, 8608236, 8613338, 8618423, 8623491, 8628542, 8633576, + 8638593, 8643594, 8648579, 8653547, 8658499, 8663434, 8668354, + 8673258, 8678145, 8683017, 8687874, 8692715, 8697540, 8702350, + 8707145, 8711925, 8716690, 8721439, 8726174, 8730894, 8735599, + 8740290, 8744967, 8749628, 8754276, 8758909, 8763528, 8768134, + 8772725, 8777302, 8781865, 8786415, 8790951, 8795474, 8799983, + 8804478, 8808961, 8813430, 8817886, 8822328, 8826758, 8831175, + 8835579, 8839970, 8844349, 8848715, 8853068, 8857409, 8861737, + 8866053, 8870357, 8874649, 8878928, 8883195, 8887451, 8891694, + 8895926, 8900145, 8904353, 8908550, 8912734, 8916908, 8921069, + 8925220, 8929358, 8933486, 8937603, 8941708, 8945802, 8949885, + 8953957, 8958018, 8962068, 8966108, 8970137, 8974155, 8978162, + 8982159, 8986145, 8990121, 8994086, 8998041, 9001986, 9005920, + 9009844, 9013758, 9017662, 9021556, 9025440, 9029314, 9033178, + 9037032, 9040877, 9044711, 9048536, 9052352, 9056157, 9059953, + 9063740, 9067517, 9071285, 9075044, 9078793, 9082533, 9086263, + 9089985, 9093697, 9097400, 9101095, 9104780, 9108456, 9112123, + 9115782, 9119431, 9123072, 9126704, 9130328, 9133943, 9137549, + 9141146, 9144735, 9148316, 9151888, 9155452, 9159007, 9162554, + 9166092, 9169623, 9173145, 9176659, 9180165, 9183663, 9187152, + 9190634, 9194108, 9197573, 9201031, 9204481, 9207923, 9211357, + 9214784, 9218202, 9221613, 9225017, 9228412, 9231800, 9235181, + 9238554, 9241919, 9245277, 9248628, 9251971, 9255307, 9258635, + 9261956, 9265270, 9268577, 9271876, 9275169, 9278454, 9281732, + 9285002, 9288266, 9291523, 9294773, 9298016, 9301252, 9304481, + 9307703, 9310918, 9314126, 9317328, 9320523, 9323711, 9326892, + 9330067, 9333235, 9336397, 9339552, 9342700, 9345842, 9348977, + 9352106, 9355228, 9358344, 9361454, 9364557, 9367654, 9370744, + 9373828, 9376906, 9379978, 9383043, 9386102, 9389155, 9392202, + 9395243, 9398278, 9401306, 9404329, 9407345, 9410356, 9413360, + 9416359, 9419351, 9422338, 9425319, 9428294, 9431263, 9434226, + 9437184, 9440136, 9443082, 9446022, 9448957, 9451886, 9454809, + 9457726, 9460638, 9463545, 9466446, 9469341, 9472231, 9475115, + 9477994, 9480867, 9483735, 9486597, 9489454, 9492306, 9495152, + 9497993, 9500828, 9503659, 9506484, 9509303, 9512118, 9514927, + 9517731, 9520530, 9523324, 9526112, 9528895, 9531674, 9534447, + 9537215, 9539978, 9542736, 9545489, 9548237, 9550980, 9553718, + 9556451, 9559179, 9561903, 9564621, 9567335, 9570043, 9572747, + 9575446, 9578140, 9580830, 9583514, 9586194, 9588869, 9591540, + 9594205, 9596866, 9599523, 9602174, 9604821, 9607464, 9610101, + 9612735, 9615363, 9617987, 9620607, 9623222, 9625832, 9628438, + 9631040, 9633637, 9636229, 9638818, 9641401, 9643981, 9646556, + 9649126, 9651692, 9654254, 9656812, 9659365, 9661914, 9664459, + 9666999, 9669535, 9672067, 9674594, 9677118, 9679637, 9682152, + 9684663, 9687169, 9689672, 9692170, 9694665, 9697155, 9699641, + 9702123, 9704601, 9707075, 9709545, 9712010, 9714472, 9716930, + 9719384, 9721834, 9724279, 9726721, 9729159, 9731593, 9734024, + 9736450, 9738872, 9741291, 9743705, 9746116, 9748523, 9750926, + 9753326, 9755721, 9758113, 9760501, 9762885, 9765266, 9767642, + 9770015, 9772385, 9774750, 9777112, 9779470, 9781825, 9784175, + 9786523, 9788866, 9791206, 9793543, 9795875, 9798204, 9800530, + 
9802852, 9805170, 9807485, 9809797, 9812104, 9814409, 9816710, + 9819007, 9821301, 9823591, 9825878, 9828161, 9830441, 9832718, + 9834991, 9837261, 9839527, 9841790, 9844050, 9846306, 9848559, + 9850808, 9853054, 9855297, 9857537, 9859773, 9862006, 9864235, + 9866462, 9868685, 9870904, 9873121, 9875334, 9877544, 9879751, + 9881955, 9884155, 9886352, 9888546, 9890737, 9892925, 9895109, + 9897291, 9899469, 9901644, 9903816, 9905985, 9908150, 9910313, + 9912473, 9914629, 9916783, 9918933, 9921080, 9923225, 9925366, + 9927504, 9929639, 9931771, 9933900, 9936027, 9938150, 9940270, + 9942387, 9944502, 9946613, 9948721, 9950827, 9952929, 9955029, + 9957126, 9959219, 9961310, 9963398, 9965484, 9967566, 9969645, + 9971722, 9973796, 9975866, 9977934, 9980000, 9982062, 9984122, + 9986179, 9988233, 9990284, 9992332, 9994378, 9996421, 9998461, + 10000498, 10002533, 10004565, 10006594, 10008621, 10010644, 10012665, + 10014684, 10016700, 10018713, 10020723, 10022731, 10024736, 10026738, + 10028738, 10030735, 10032729, 10034721, 10036710, 10038697, 10040681, + 10042662, 10044641, 10046617, 10048591, 10050562, 10052530, 10054496, + 10056459, 10058420, 10060379, 10062334, 10064287, 10066238, 10068186, + 10070132, 10072075, 10074016, 10075954, 10077890, 10079823, 10081754, + 10083682, 10085608, 10087532, 10089453, 10091371, 10093287, 10095201, + 10097112, 10099021, 10100928, 10102832, 10104733, 10106633, 10108529, + 10110424, 10112316, 10114206, 10116093, 10117978, 10119861, 10121742, + 10123620, 10125495, 10127369, 10129240, 10131109, 10132975, 10134839, + 10136701, 10138561, 10140418, 10142273, 10144126, 10145976, 10147825, + 10149671, 10151514, 10153356, 10155195, 10157032, 10158867, 10160699, + 10162530, 10164358, 10166184, 10168007, 10169829, 10171648, 10173465, + 10175280, 10177093, 10178904, 10180712, 10182519, 10184323, 10186125, + 10187925, 10189722, 10191518, 10193311, 10195103, 10196892, 10198679, + 10200464, 10202247, 10204028, 10205806, 10207583, 10209357, 10211130, + 10212900, 10214668, 10216435, 10218199, 10219961, 10221721, 10223479, + 10225235, 10226989, 10228741, 10230491, 10232239, 10233985, 10235728, + 10237470, 10239210, 10240948, 10242684, 10244417, 10246149, 10247879, + 10249607, 10251333, 10253057, 10254779, 10256499, 10258217, 10259933, + 10261647, 10263360, 10265070, 10266778, 10268485, 10270189, 10271892, + 10273593, 10275292, 10276988, 10278683, 10280376, 10282068, 10283757, + 10285444, 10287130, 10288814, 10290495, 10292175, 10293853, 10295530, + 10297204, 10298876, 10300547, 10302216, 10303883, 10305548, 10307211, + 10308873, 10310532, 10312190, 10313846, 10315501, 10317153, 10318804, + 10320452, 10322099, 10323745, 10325388, 10327030, 10328670, 10330308, + 10331944, 10333578, 10335211, 10336842, 10338472, 10340099, 10341725, + 10343349, 10344971, 10346592, 10348210, 10349828, 10351443, 10353057, + 10354668, 10356279, 10357887, 10359494, 10361099, 10362702, 10364304, + 10365904, 10367502, 10369099, 10370694, 10372287, 10373879, 10375468, + 10377057, 10378643, 10380228, 10381811, 10383393, 10384973, 10386551, + 10388128, 10389703, 10391276, 10392848, 10394418, 10395986, 10397553, + 10399118, 10400682, 10402244, 10403804, 10405363, 10406920, 10408476, + 10410030, 10411582, 10413133, 10414682, 10416230, 10417776, 10419320, + 10420863, 10422404, 10423944, 10425482, 10427019, 10428554, 10430087, + 10431619, 10433149, 10434678, 10436206, 10437731, 10439256, 10440778, + 10442299, 10443819, 10445337, 10446854, 10448369, 10449882, 10451394, + 10452905, 10454414, 10455921, 10457427, 
10458932, 10460435, 10461936, + 10463436, 10464935, 10466432, 10467927, 10469422, 10470914, 10472405, + 10473895, 10475383, 10476870, 10478355, 10479839, 10481322, 10482802, + 10484282, +}; + +#define LOG2_PRECISION 20 +static int64_t log2_approximation(int64_t v) { + assert(v > 0); + if (v < LOG2_TABLE_SIZE) { + return log2_table[v]; + } else { + // use linear approximation when v >= 2^10 + const int slope = + 1477; // slope = 1 / (log(2) * 1024) * (1 << LOG2_PRECISION) + assert(LOG2_TABLE_SIZE == 1 << 10); + + return slope * (v - LOG2_TABLE_SIZE) + (10 << LOG2_PRECISION); + } +} + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, + int mv_num) { + int i; + int update = 0; + int64_t best_cost = 0; + vpx_clear_system_state(); + for (i = 0; i < mv_num; ++i) { + if (nb_mvs[i].as_int != INVALID_MV) { + MV nb_mv = nb_mvs[i].as_mv; + const int64_t row_diff = abs(mv->row - nb_mv.row); + const int64_t col_diff = abs(mv->col - nb_mv.col); + const int64_t cost = + log2_approximation(1 + row_diff * row_diff + col_diff * col_diff); + if (update == 0) { + best_cost = cost; + update = 1; + } else { + best_cost = cost < best_cost ? cost : best_cost; + } + } + } + return best_cost; +} + +static int64_t exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv, + int range, int step, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + MV fcenter_mv = { center_mv->row, center_mv->col }; + int64_t best_sad; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *best_mv = fcenter_mv; + best_sad = + ((int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &fcenter_mv), + in_what->stride) + << LOG2_PRECISION) + + lambda * vp9_nb_mvs_inconsistency(&fcenter_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(-range, x->mv_limits.row_min - fcenter_mv.row); + start_col = VPXMAX(-range, x->mv_limits.col_min - fcenter_mv.col); + end_row = VPXMIN(range, x->mv_limits.row_max - fcenter_mv.row); + end_col = VPXMIN(range, x->mv_limits.col_max - fcenter_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. 
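A minimal sketch (aside, not part of the patch) of the fixed-point convention behind log2_table and log2_approximation() above: entry v stores log2(v) scaled by 2^20 (LOG2_PRECISION), so log2_table[2] == 1048576, and the tail slope 1477 is the derivative of that function at v == 1024, i.e. 2^20 / (1024 * ln 2) ~= 1477.3. The helper name is hypothetical.

    #include <math.h>
    #include <stdint.h>

    /* Floating-point reference for the integer table: round(log2(v) * 2^20). */
    static int64_t log2_fixed(int64_t v) {
      return (int64_t)(log2((double)v) * (double)(1 << 20) + 0.5);
    }
    /* log2_fixed(2) == 1048576, log2_fixed(3) == 1661954 */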
+ if (step > 1) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c }; + int64_t sad = + (int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } else { + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + addrs[i] = get_buf_from_mv(in_what, &mv); + } + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); + + for (i = 0; i < 4; ++i) { + int64_t sad = (int64_t)sads[i] << LOG2_PRECISION; + if (sad < best_sad) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + sad += lambda * + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + int64_t sad = (int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), + in_what->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += lambda * + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } + } + } + } + + return best_sad; +} + +static int64_t full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *centre_mv_full, + const vp9_variance_fn_ptr_t *fn_ptr, + MV *dst_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + int64_t bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + const MV dummy_mv = { 0, 0 }; + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) { + printf("ERROR: invalid range\n"); + assert(0); + } + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = VPXMIN(range, MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = + exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv, + lambda, nb_full_mvs, full_mv_num); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. 
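A minimal sketch (aside, not part of the patch) of the double "sad < best_sad" test used throughout exhaustive_mesh_search_new() above: the lambda-weighted inconsistency term is non-negative (log2_approximation() of a value >= 1), so a candidate whose raw SAD already fails to beat the best cannot win, and the comparatively expensive neighbour-MV cost is only evaluated for survivors. Names below are hypothetical.

    #include <stdint.h>

    static int64_t consider_candidate(int64_t raw_sad, int64_t best_sad,
                                      int64_t (*mv_cost)(void *ctx), void *ctx) {
      if (raw_sad < best_sad) {   /* cheap SAD test first */
        raw_sad += mv_cost(ctx);  /* cost term is >= 0, so the order is safe */
        if (raw_sad < best_sad) return raw_sad;
      }
      return best_sad;
    }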
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhaustive_mesh_search_new(
+          x, &temp_mv, sf->mesh_patterns[i].range,
+          sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs,
+          full_mv_num);
+
+      if (sf->mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  bestsme = vp9_get_mvpred_var(x, &temp_mv, &dummy_mv, fn_ptr, 0);
+  *dst_mv = temp_mv;
+
+  return bestsme;
+}
+
+static double diamond_search_sad_new(const MACROBLOCK *x,
+                                     const search_site_config *cfg,
+                                     const MV *init_full_mv, MV *best_full_mv,
+                                     int search_param, int lambda, int *num00,
+                                     const vp9_variance_fn_ptr_t *fn_ptr,
+                                     const int_mv *nb_full_mvs,
+                                     int full_mv_num) {
+  int i, j, step;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address;
+
+  double bestsad;
+  int best_site = -1;
+  int last_site = -1;
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+  const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+  const int tot_steps = cfg->total_steps - search_param;
+  vpx_clear_system_state();
+
+  *best_full_mv = *init_full_mv;
+  clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  *num00 = 0;
+
+  // Work out the start point for the search
+  in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride +
+            best_full_mv->col;
+  best_address = in_what;
+
+  // Check the starting position
+  {
+    const double mv_dist =
+        fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+    const double mv_cost =
+        vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num) /
+        (double)(1 << LOG2_PRECISION);
+    bestsad = mv_dist + lambda * mv_cost;
+  }
+
+  i = 0;
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // All_in is true if every one of the points we are checking is within
+    // the bounds of the image.
+    all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min);
+    all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max);
+    all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min);
+    all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max);
+
+    // If all the pixels are within the bounds we don't check whether the
+    // search point is valid in this loop, otherwise we check each point
+    // for validity.
+ if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + if (sad_array[t] < bestsad) { + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + const double mv_dist = sad_array[t]; + const double mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + double thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[i] + best_address; + const double mv_dist = + fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + if (mv_dist < bestsad) { + const double mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + double thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + if (best_site != last_site) { + best_full_mv->row += ss_mv[best_site].row; + best_full_mv->col += ss_mv[best_site].col; + best_address += ss_os[best_site]; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +void vp9_prepare_nb_full_mvs(const TplDepFrame *tpl_frame, int mi_row, + int mi_col, int rf_idx, BLOCK_SIZE bsize, + int_mv *nb_full_mvs) { + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int i; + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0] * mi_height; + int c = dirs[i][1] * mi_width; + if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 && + mi_col + c < tpl_frame->mi_cols) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[(mi_row + r) * tpl_frame->stride + mi_col + c]; + int_mv *mv = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row + r, mi_col + c); + if (tpl_ptr->ready[rf_idx]) { + nb_full_mvs[i].as_mv = get_full_mv(&mv->as_mv); + } else { + nb_full_mvs[i].as_int = INVALID_MV; + } + } else { + nb_full_mvs[i].as_int = INVALID_MV; + } + } +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, @@ -1785,12 +2440,15 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { } static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, const MV *ref_mv) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; @@ -1812,6 +2470,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + 
MvLimits subpel_mv_limits; if (scaled_ref_frame) { int i; @@ -1876,7 +2535,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, { const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, }; cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); @@ -1911,6 +2573,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; @@ -1919,11 +2585,78 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, return best_sad; } +#if CONFIG_NON_GREEDY_MV // Runs sequence of diamond searches in smaller steps for RD. /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ -static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, +double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int lambda, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv) { + int n, num00 = 0; + double thissme; + double bestsme; + const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param; + const MV center_mv = { 0, 0 }; + vpx_clear_system_state(); + bestsme = + diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, best_mv, step_param, + lambda, &n, fn_ptr, nb_full_mvs, full_mv_num); + + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + if (num00) { + num00--; + } else { + MV temp_mv; + thissme = diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param + n, lambda, &num00, fn_ptr, + nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV temp_mv = *best_mv; + thissme = vp9_refining_search_sad_new(x, &temp_mv, lambda, search_range, + fn_ptr, nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + + bestsme = (double)full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, + lambda, nb_full_mvs, full_mv_num); + return bestsme; +} +#endif // CONFIG_NON_GREEDY_MV + +// Runs sequence of diamond searches in smaller steps for RD. 
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+   point as the best match, we will do a final 1-away diamond
+   refining search */
+static int full_pixel_diamond(const VP9_COMP *const cpi,
+                              const MACROBLOCK *const x, MV *mvp_full,
                               int step_param, int sadpb, int further_steps,
                               int do_refine, int *cost_list,
                               const vp9_variance_fn_ptr_t *fn_ptr,
@@ -1983,13 +2716,11 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
   return bestsme;
 }

-#define MIN_RANGE 7
-#define MAX_RANGE 256
-#define MIN_INTERVAL 1
 // Runs a limited range exhaustive mesh search using a pattern set
 // according to the encode speed profile.
-static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
-                                 MV *centre_mv_full, int sadpb, int *cost_list,
+static int full_pixel_exhaustive(const VP9_COMP *const cpi,
+                                 const MACROBLOCK *const x, MV *centre_mv_full,
+                                 int sadpb, int *cost_list,
                                  const vp9_variance_fn_ptr_t *fn_ptr,
                                  const MV *ref_mv, MV *dst_mv) {
   const SPEED_FEATURES *const sf = &cpi->sf;
@@ -2015,7 +2746,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
   interval = VPXMAX(interval, range / baseline_interval_divisor);

   // initial search
-  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+  bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
                                    sadpb, fn_ptr, &temp_mv);

@@ -2023,7 +2754,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
     // Progressive searches with range and step size decreasing each time
     // till we reach a step size of 1. Then break out.
     for (i = 1; i < MAX_MESH_STEP; ++i) {
       // First pass with coarser step and longer range
-      bestsme = exhuastive_mesh_search(
+      bestsme = exhaustive_mesh_search(
           x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
           sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);

@@ -2042,6 +2773,90 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
   return bestsme;
 }

+#if CONFIG_NON_GREEDY_MV
+double vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
+                                   int lambda, int search_range,
+                                   const vp9_variance_fn_ptr_t *fn_ptr,
+                                   const int_mv *nb_full_mvs, int full_mv_num) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv);
+  double best_sad;
+  int i, j;
+  vpx_clear_system_state();
+  {
+    const double mv_dist =
+        fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride);
+    const double mv_cost =
+        vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num) /
+        (double)(1 << LOG2_PRECISION);
+    best_sad = mv_dist + lambda * mv_cost;
+  }
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+    const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) &
+                       ((best_full_mv->row + 1) < x->mv_limits.row_max) &
+                       ((best_full_mv->col - 1) > x->mv_limits.col_min) &
+                       ((best_full_mv->col + 1) < x->mv_limits.col_max);
+
+    if (all_in) {
+      unsigned int sads[4];
+      const uint8_t *const positions[4] = { best_address - in_what->stride,
+                                            best_address - 1, best_address + 1,
+                                            best_address + in_what->stride };
+
+      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+      for (j = 0; j < 4; ++j) {
+        const MV mv = { best_full_mv->row + neighbors[j].row,
+                        best_full_mv->col + neighbors[j].col };
+        const double mv_dist = sads[j];
+        const double mv_cost =
vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + const double thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + const double mv_dist = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + const double mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + const double thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_full_mv->row += neighbors[best_site].row; + best_full_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, best_full_mv); + } + } + + return best_sad; +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2167,14 +2982,16 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int search_method, - int error_per_bit, int *cost_list, const MV *ref_mv, - MV *tmp_mv, int var_max, int rd) { +int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int search_method, int error_per_bit, int *cost_list, + const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; const SEARCH_METHODS method = (SEARCH_METHODS)search_method; - vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; + int run_exhaustive_search = 0; + if (cost_list) { cost_list[0] = INT_MAX; cost_list[1] = INT_MAX; @@ -2205,35 +3022,38 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, fn_ptr, 1, ref_mv, tmp_mv); break; case NSTEP: + case MESH: var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); - - // Should we allow a follow on exhaustive search? - if ((sf->exhaustive_searches_thresh < INT_MAX) && - !cpi->rc.is_src_frame_alt_ref) { - int64_t exhuastive_thr = sf->exhaustive_searches_thresh; - exhuastive_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - // Threshold variance for an exhaustive full search. 
- if (var > exhuastive_thr) { - int var_ex; - MV tmp_mv_ex; - var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, - cost_list, fn_ptr, ref_mv, &tmp_mv_ex); - - if (var_ex < var) { - var = var_ex; - *tmp_mv = tmp_mv_ex; - } - } - } break; - default: assert(0 && "Invalid search method."); + default: assert(0 && "Unknown search method"); } - if (method != NSTEP && rd && var < var_max) + if (method == NSTEP) { + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhaustive_thr = + sf->exhaustive_searches_thresh >> + (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); + if (var > exhaustive_thr) run_exhaustive_search = 1; + } + } else if (method == MESH) { + run_exhaustive_search = 1; + } + + if (run_exhaustive_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list, + fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + *tmp_mv = tmp_mv_ex; + } + } + + if (method != NSTEP && method != MESH && rd && var < var_max) var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); return var; @@ -2274,7 +3094,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, (void)tc; \ (void)sse; \ (void)thismse; \ - (void)cost_list; + (void)cost_list; \ + (void)use_accurate_subpel_search; // Return the maximum MV. uint32_t vp9_return_max_sub_pixel_mv( @@ -2282,7 +3103,7 @@ uint32_t vp9_return_max_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)minr; @@ -2304,7 +3125,7 @@ uint32_t vp9_return_min_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)maxr; diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.h b/libs/libvpx/vp9/encoder/vp9_mcomp.h index b8db2c3536..cafa2d1504 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MCOMP_H_ -#define VP9_ENCODER_VP9_MCOMP_H_ +#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_ +#define VPX_VP9_ENCODER_VP9_MCOMP_H_ #include "vp9/encoder/vp9_block.h" #include "vpx_dsp/variance.h" @@ -38,6 +38,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); void vp9_init3smotion_compensation(search_site_config *cfg, int stride); @@ -59,14 +64,15 @@ struct SPEED_FEATURES; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, + int error_per_bit, int search_range, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
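A minimal sketch (aside, not part of the patch) of the refactored exhaustive-search trigger above: b_width_log2_lookup/b_height_log2_lookup count 4-pel units, so the variance threshold is scaled down in proportion to block area relative to 64x64 (shift 0 for 64x64, 6 for 8x8, 8 for 4x4). The helper name is hypothetical.

    #include <stdint.h>

    /* Mirrors "exhaustive_searches_thresh >> (8 - (bwl + bhl))" above. */
    static int64_t scaled_exhaustive_thr(int64_t thresh, int bwl, int bhl) {
      return thresh >> (8 - (bwl + bhl)); /* 64x64 -> thresh, 8x8 -> thresh >> 6 */
    }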
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); + int mi_row, int mi_col, + const MV *ref_mv); typedef uint32_t(fractional_mv_step_fp)( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, @@ -74,7 +80,7 @@ typedef uint32_t(fractional_mv_step_fp)( int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h); + int h, int use_accurate_subpel_search); extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; @@ -106,7 +112,11 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; -int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, +// "mvp_full" is the MV search starting point; +// "ref_mv" is the context reference MV; +// "tmp_mv" is the searched best MV. +int vp9_full_pixel_search(const struct VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int search_method, int error_per_bit, int *cost_list, const MV *ref_mv, MV *tmp_mv, int var_max, int rd); @@ -115,8 +125,60 @@ void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, const MvLimits *umv_window_limits, const MV *ref_mv); +#if CONFIG_NON_GREEDY_MV +#define NB_MVS_NUM 4 +struct TplDepStats; +double vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num); + +double vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int lambda, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv); + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, + int mv_num); +static INLINE MV get_full_mv(const MV *mv) { + MV out_mv; + out_mv.row = mv->row >> 3; + out_mv.col = mv->col >> 3; + return out_mv; +} +struct TplDepFrame; +void vp9_prepare_nb_full_mvs(const struct TplDepFrame *tpl_frame, int mi_row, + int mi_col, int rf_idx, BLOCK_SIZE bsize, + int_mv *nb_full_mvs); + +static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) { + BLOCK_SIZE square_bsize; + switch (bsize) { + case BLOCK_4X4: + case BLOCK_4X8: + case BLOCK_8X4: square_bsize = BLOCK_4X4; break; + case BLOCK_8X8: + case BLOCK_8X16: + case BLOCK_16X8: square_bsize = BLOCK_8X8; break; + case BLOCK_16X16: + case BLOCK_16X32: + case BLOCK_32X16: square_bsize = BLOCK_16X16; break; + case BLOCK_32X32: + case BLOCK_32X64: + case BLOCK_64X32: + case BLOCK_64X64: square_bsize = BLOCK_32X32; break; + default: + square_bsize = BLOCK_INVALID; + assert(0 && "ERROR: invalid block size"); + break; + } + return square_bsize; +} +#endif // CONFIG_NON_GREEDY_MV #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MCOMP_H_ +#endif // VPX_VP9_ENCODER_VP9_MCOMP_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.c b/libs/libvpx/vp9/encoder/vp9_multi_thread.c index da06fb151d..c66c035492 100644 --- a/libs/libvpx/vp9/encoder/vp9_multi_thread.c +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.c @@ -13,6 +13,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" void 
*vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, int tile_id) { @@ -50,6 +51,20 @@ void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, return job_info; } +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile) { + VP9_COMMON *const cm = &cpi->common; + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + + this_tile->row_base_thresh_freq_fact = + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; +} + void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { struct VP9Common *cm = &cpi->common; MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; @@ -59,6 +74,8 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; int jobs_per_tile_col, total_jobs; + // Allocate memory that is large enough for all row_mt stages. First pass + // uses 16x16 block size. jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); // Calculate the total number of jobs total_jobs = jobs_per_tile_col * tile_cols; @@ -83,14 +100,11 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_col]; vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); if (cpi->sf.adaptive_rd_thresh_row_mt) { - const int sb_rows = - (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; - int i; - this_tile->row_base_thresh_freq_fact = - (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, - sizeof(*(this_tile->row_base_thresh_freq_fact))); - for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) - this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); } } @@ -146,11 +160,9 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + tile_col]; - if (cpi->sf.adaptive_rd_thresh_row_mt) { - if (this_tile->row_base_thresh_freq_fact != NULL) { - vpx_free(this_tile->row_base_thresh_freq_fact); - this_tile->row_base_thresh_freq_fact = NULL; - } + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; } } } @@ -219,11 +231,19 @@ void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; JobQueue *job_queue = multi_thread_ctxt->job_queue; const int tile_cols = 1 << cm->log2_tile_cols; - int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs; + int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs; const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; int tile_col, i; - jobs_per_tile_col = (job_type != ENCODE_JOB) ? 
cm->mb_rows : sb_rows; + switch (job_type) { + case ENCODE_JOB: jobs_per_tile_col = sb_rows; break; + case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break; + case ARNR_JOB: + jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT); + break; + default: assert(0); + } + total_jobs = jobs_per_tile_col * tile_cols; multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.h b/libs/libvpx/vp9/encoder/vp9_multi_thread.h index bfc0c0ae4f..a2276f4fe6 100644 --- a/libs/libvpx/vp9/encoder/vp9_multi_thread.h +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H -#define VP9_ENCODER_VP9_MULTI_THREAD_H +#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ +#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_job_queue.h" @@ -29,10 +29,13 @@ void vp9_multi_thread_tile_init(VP9_COMP *cpi); void vp9_row_mt_mem_alloc(VP9_COMP *cpi); +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile); + void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, int *tile_completion_status, int *cur_tile_id, int tile_cols); -#endif // VP9_ENCODER_VP9_MULTI_THREAD_H +#endif // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c index 276a0c7852..9696529c50 100644 --- a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -32,7 +32,7 @@ static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; - ne->level = kLowLow; + ne->level = (width * height < 1280 * 720) ? kLowLow : kLow; ne->value = 0; ne->count = 0; ne->thresh = 90; @@ -46,6 +46,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->thresh = 115; } ne->num_frames_estimate = 15; + ne->adapt_thresh = (3 * ne->thresh) >> 1; } static int enable_noise_estimation(VP9_COMP *const cpi) { @@ -97,7 +98,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { } else { if (ne->value > ne->thresh) noise_level = kMedium; - else if (ne->value > ((9 * ne->thresh) >> 4)) + else if (ne->value > (ne->thresh >> 1)) noise_level = kLow; else noise_level = kLowLow; @@ -112,10 +113,6 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Estimate of noise level every frame_period frames. int frame_period = 8; int thresh_consec_zeromv = 6; - unsigned int thresh_sum_diff = 100; - unsigned int thresh_sum_spatial = (200 * 200) << 8; - unsigned int thresh_spatial_var = (32 * 32) << 8; - int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7; int frame_counter = cm->current_video_frame; // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; @@ -124,11 +121,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { last_source = &cpi->denoiser.last_source; // Tune these thresholds for different resolutions when denoising is // enabled. 
- if (cm->width > 640 && cm->width < 1920) { - thresh_consec_zeromv = 4; - thresh_sum_diff = 200; - thresh_sum_spatial = (120 * 120) << 8; - thresh_spatial_var = (48 * 48) << 8; + if (cm->width > 640 && cm->width <= 1920) { + thresh_consec_zeromv = 2; } } #endif @@ -148,8 +142,10 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cm->current_video_frame > 60 && - cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) { + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; ne->count = 0; @@ -157,17 +153,19 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->svc.current_superframe > 1) { - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + vp9_denoiser_set_noise_level(cpi, ne->level); copy_frame(&cpi->denoiser.last_source, cpi->Source); } #endif return; } else { - int num_samples = 0; - uint64_t avg_est = 0; + unsigned int bin_size = 100; + unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; + unsigned int hist_avg[MAX_VAR_HIST_BINS]; + unsigned int max_bin = 0; + unsigned int max_bin_count = 0; + unsigned int bin_cnt; int bsize = BLOCK_16X16; - static const unsigned char const_source[16] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have // been encoded as zero/small mv at least x consecutive frames, compute // the variance to update estimate of noise in the source. @@ -207,8 +205,11 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Only consider blocks that are likely steady background. i.e, have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. - if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) { + // 4 sub-blocks for 16x16 block. And exclude this frame if + // high_source_sad is true (i.e., scene/content change). + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && + !cpi->rc.high_source_sad && + !cpi->svc.high_source_sad_superframe) { int is_skin = 0; if (cpi->use_skin_detection) { is_skin = @@ -217,25 +218,15 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { } if (!is_skin) { unsigned int sse; - // Compute variance. + // Compute variance between co-located blocks from current and + // last input frames. unsigned int variance = cpi->fn_ptr[bsize].vf( src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting - // change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += low_res ? 
variance >> 4 - : variance / ((spatial_variance >> 9) + 1); - num_samples++; - } - } + unsigned int hist_index = variance / bin_size; + if (hist_index < MAX_VAR_HIST_BINS) + hist[hist_index]++; + else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) + hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail } } } @@ -251,26 +242,58 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { } ne->last_w = cm->width; ne->last_h = cm->height; - // Update noise estimate if we have at a minimum number of block samples, - // and avg_est > 0 (avg_est == 0 can happen if the application inputs - // duplicate frames). - if (num_samples > min_blocks_estimate && avg_est > 0) { - // Normalize. - avg_est = avg_est / num_samples; - // Update noise estimate. - ne->value = (int)((15 * ne->value + avg_est) >> 4); - ne->count++; - if (ne->count == ne->num_frames_estimate) { - // Reset counter and check noise level condition. - ne->num_frames_estimate = 30; - ne->count = 0; - ne->level = vp9_noise_estimate_extract_level(ne); -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); -#endif + // Adjust histogram to account for effect that histogram flattens + // and shifts to zero as scene darkens. + if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { + hist[0] = 0; + hist[1] >>= 2; + hist[2] >>= 2; + hist[3] >>= 2; + hist[4] >>= 1; + hist[5] >>= 1; + hist[6] = 3 * hist[6] >> 1; + hist[MAX_VAR_HIST_BINS - 1] >>= 1; + } + + // Average hist[] and find largest bin + for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { + if (bin_cnt == 0) + hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; + else if (bin_cnt == MAX_VAR_HIST_BINS - 1) + hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; + else if (bin_cnt == MAX_VAR_HIST_BINS - 2) + hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + + (hist[bin_cnt + 1] >> 1) + 2) >> + 2; + else + hist_avg[bin_cnt] = + (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> + 2; + + if (hist_avg[bin_cnt] > max_bin_count) { + max_bin_count = hist_avg[bin_cnt]; + max_bin = bin_cnt; } } + + // Scale by 40 to work with existing thresholds + ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); + // Quickly increase VNR strength when the noise level increases suddenly. + if (ne->level < kMedium && ne->value > ne->adapt_thresh) { + ne->count = ne->num_frames_estimate; + } else { + ne->count++; + } + if (ne->count == ne->num_frames_estimate) { + // Reset counter and check noise level condition. + ne->num_frames_estimate = 30; + ne->count = 0; + ne->level = vp9_noise_estimate_extract_level(ne); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + vp9_denoiser_set_noise_level(cpi, ne->level); +#endif + } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.h b/libs/libvpx/vp9/encoder/vp9_noise_estimate.h index 335cdbe643..7fc94ff8c9 100644 --- a/libs/libvpx/vp9/encoder/vp9_noise_estimate.h +++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_ -#define VP9_ENCODER_NOISE_ESTIMATE_H_ +#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ +#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -23,6 +23,8 @@ extern "C" { #endif +#define MAX_VAR_HIST_BINS 20 + typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; typedef struct noise_estimate { @@ -30,6 +32,7 @@ typedef struct noise_estimate { NOISE_LEVEL level; int value; int thresh; + int adapt_thresh; int count; int last_w; int last_h; @@ -48,4 +51,4 @@ void vp9_update_noise_estimate(struct VP9_COMP *const cpi); } // extern "C" #endif -#endif // VP9_ENCODER_NOISE_ESTIMATE_H_ +#endif // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_partition_models.h b/libs/libvpx/vp9/encoder/vp9_partition_models.h new file mode 100644 index 0000000000..09c0e30a47 --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_partition_models.h @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ +#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +// Neural net model config. It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. +typedef struct { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. + const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +} NN_CONFIG; + +// Partition search breakout model. 
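Each breakout table row below holds FEATURES linear weights plus one trailing bias per (resolution, q) context; the biases are written as a trained constant minus a hand-tuned offset (e.g. 1.94261885f - 2.1f) so the tuning stays visible. The consuming code lives in the encoder proper, so the scorer below is only a plausible sketch: a positive dot-product-plus-bias score would mean further partitioning is unlikely to pay off and the search can break out early.

/* Hypothetical scorer for the 4-feature breakout rows below; the last
 * element of each row is the bias. Named to avoid the header's own
 * FEATURES define. */
#define BREAKOUT_FEATURES 4
static float breakout_score(const float weights[BREAKOUT_FEATURES + 1],
                            const float features[BREAKOUT_FEATURES]) {
  float score = weights[BREAKOUT_FEATURES]; /* bias */
  int i;
  for (i = 0; i < BREAKOUT_FEATURES; ++i) score += weights[i] * features[i];
  return score;
}
#undef BREAKOUT_FEATURES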
+#define FEATURES 4 +#define Q_CTX 3 +#define RESOLUTION_CTX 2 +static const float + vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.016673f, + -0.001025f, + -0.000032f, + 0.000833f, + 1.94261885f - 2.1f, + }, + { + -0.160867f, + -0.002101f, + 0.000011f, + 0.002448f, + 1.65738142f - 2.5f, + }, + { + -0.628934f, + -0.011459f, + -0.000009f, + 0.013833f, + 1.47982645f - 1.6f, + }, + }, + { + { + -0.064309f, + -0.006121f, + 0.000232f, + 0.005778f, + 0.7989465f - 5.0f, + }, + { + -0.314957f, + -0.009346f, + -0.000225f, + 0.010072f, + 2.80695581f - 5.5f, + }, + { + -0.635535f, + -0.015135f, + 0.000091f, + 0.015247f, + 2.90381241f - 5.0f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.010554f, + -0.003081f, + -0.000134f, + 0.004491f, + 1.68445992f - 3.5f, + }, + { + -0.051489f, + -0.007609f, + 0.000016f, + 0.009792f, + 1.28089404f - 2.5f, + }, + { + -0.163097f, + -0.013081f, + 0.000022f, + 0.019006f, + 1.36129403f - 3.2f, + }, + }, + { + { + -0.024629f, + -0.006492f, + -0.000254f, + 0.004895f, + 1.27919173f - 4.5f, + }, + { + -0.083936f, + -0.009827f, + -0.000200f, + 0.010399f, + 2.73731065f - 4.5f, + }, + { + -0.279052f, + -0.013334f, + 0.000289f, + 0.023203f, + 2.43595719f - 3.5f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.013154f, + -0.002404f, + -0.000977f, + 0.008450f, + 2.57404566f - 5.5f, + }, + { + -0.019146f, + -0.004018f, + 0.000064f, + 0.008187f, + 2.15043926f - 2.5f, + }, + { + -0.075755f, + -0.010858f, + 0.000030f, + 0.024505f, + 2.06848121f - 2.5f, + }, + }, + { + { + -0.007636f, + -0.002751f, + -0.000682f, + 0.005968f, + 0.19225763f - 4.5f, + }, + { + -0.047306f, + -0.009113f, + -0.000518f, + 0.016007f, + 2.61068869f - 4.0f, + }, + { + -0.069336f, + -0.010448f, + -0.001120f, + 0.023083f, + 1.47591054f - 5.5f, + }, + }, + }; + +static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX] + [FEATURES + 1] = { + { + { + -0.011807f, + -0.009873f, + -0.000931f, + 0.034768f, + 1.32254851f - 2.0f, + }, + { + -0.003861f, + -0.002701f, + 0.000100f, + 0.013876f, + 1.96755111f - 1.5f, + }, + { + -0.013522f, + -0.008677f, + -0.000562f, + 0.034468f, + 1.53440356f - 1.5f, + }, + }, + { + { + -0.003221f, + -0.002125f, + 0.000993f, + 0.012768f, + 0.03541421f - 2.0f, + }, + { + -0.006069f, + -0.007335f, + 0.000229f, + 0.026104f, + 0.17135315f - 1.5f, + }, + { + -0.039894f, + -0.011419f, + 0.000070f, + 0.061817f, + 0.6739977f - 1.5f, + }, + }, + }; +#undef FEATURES +#undef Q_CTX +#undef RESOLUTION_CTX + +// Rectangular partition search pruning model. 
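The rectangular pruning models that follow are small fully connected networks described by the NN_CONFIG struct above. The encoder evaluates such configs with a plain dense forward pass; the sketch below assumes ReLU activations on hidden layers, a linear output layer, and a row-major weight layout (num_inputs weights per output node), which matches how the single-hidden-layer tables here are sized.

/* Sketch of an NN_CONFIG evaluator (the encoder has its own equivalent).
 * Uses NN_CONFIG and NN_MAX_NODES_PER_LAYER from this header. */
static void nn_forward_sketch(const NN_CONFIG *cfg, const float *features,
                              float *output) {
  float buf[2][NN_MAX_NODES_PER_LAYER];
  const float *input = features;
  int num_inputs = cfg->num_inputs;
  int layer, node, i;

  for (layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
    const int is_output = (layer == cfg->num_hidden_layers);
    const int num_outputs =
        is_output ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
    const float *weights = cfg->weights[layer];
    const float *bias = cfg->bias[layer];
    float *out = is_output ? output : buf[layer & 1];

    for (node = 0; node < num_outputs; ++node) {
      float val = bias[node];
      for (i = 0; i < num_inputs; ++i)
        val += weights[node * num_inputs + i] * input[i];
      if (!is_output && val < 0.0f) val = 0.0f; /* ReLU on hidden layers */
      out[node] = val;
    }
    input = out;
    num_inputs = num_outputs;
  }
}

For vp9_rect_part_nnconfig_16 below this yields LABELS = 4 raw scores, presumably one per rectangular partition candidate.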
+#define FEATURES 8 +#define LABELS 4 +#define NODES 16 +static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * NODES] = { + -0.432522f, 0.133070f, -0.169187f, 0.768340f, 0.891228f, 0.554458f, + 0.356000f, 0.403621f, 0.809165f, 0.778214f, -0.520357f, 0.301451f, + -0.386972f, -0.314402f, 0.021878f, 1.148746f, -0.462258f, -0.175524f, + -0.344589f, -0.475159f, -0.232322f, 0.471147f, -0.489948f, 0.467740f, + -0.391550f, 0.208601f, 0.054138f, 0.076859f, -0.309497f, -0.095927f, + 0.225917f, 0.011582f, -0.520730f, -0.585497f, 0.174036f, 0.072521f, + 0.120771f, -0.517234f, -0.581908f, -0.034003f, -0.694722f, -0.364368f, + 0.290584f, 0.038373f, 0.685654f, 0.394019f, 0.759667f, 1.257502f, + -0.610516f, -0.185434f, 0.211997f, -0.172458f, 0.044605f, 0.145316f, + -0.182525f, -0.147376f, 0.578742f, 0.312412f, -0.446135f, -0.389112f, + 0.454033f, 0.260490f, 0.664285f, 0.395856f, -0.231827f, 0.215228f, + 0.014856f, -0.395462f, 0.479646f, -0.391445f, -0.357788f, 0.166238f, + -0.056818f, -0.027783f, 0.060880f, -1.604710f, 0.531268f, 0.282184f, + 0.714944f, 0.093523f, -0.218312f, -0.095546f, -0.285621f, -0.190871f, + -0.448340f, -0.016611f, 0.413913f, -0.286720f, -0.158828f, -0.092635f, + -0.279551f, 0.166509f, -0.088162f, 0.446543f, -0.276830f, -0.065642f, + -0.176346f, -0.984754f, 0.338738f, 0.403809f, 0.738065f, 1.154439f, + 0.750764f, 0.770959f, -0.269403f, 0.295651f, -0.331858f, 0.367144f, + 0.279279f, 0.157419f, -0.348227f, -0.168608f, -0.956000f, -0.647136f, + 0.250516f, 0.858084f, 0.809802f, 0.492408f, 0.804841f, 0.282802f, + 0.079395f, -0.291771f, -0.024382f, -1.615880f, -0.445166f, -0.407335f, + -0.483044f, 0.141126f, +}; + +static const float vp9_rect_part_nn_bias_16_layer0[NODES] = { + 0.275384f, -0.053745f, 0.000000f, 0.000000f, -0.178103f, 0.513965f, + -0.161352f, 0.228551f, 0.000000f, 1.013712f, 0.000000f, 0.000000f, + -1.144009f, -0.000006f, -0.241727f, 2.048764f, +}; + +static const float vp9_rect_part_nn_weights_16_layer1[NODES * LABELS] = { + -1.435278f, 2.204691f, -0.410718f, 0.202708f, 0.109208f, 1.059142f, + -0.306360f, 0.845906f, 0.489654f, -1.121915f, -0.169133f, -0.003385f, + 0.660590f, -0.018711f, 1.227158f, -2.967504f, 1.407345f, -1.293243f, + -0.386921f, 0.300492f, 0.338824f, -0.083250f, -0.069454f, -1.001827f, + -0.327891f, 0.899353f, 0.367397f, -0.118601f, -0.171936f, -0.420646f, + -0.803319f, 2.029634f, 0.940268f, -0.664484f, 0.339916f, 0.315944f, + 0.157374f, -0.402482f, -0.491695f, 0.595827f, 0.015031f, 0.255887f, + -0.466327f, -0.212598f, 0.136485f, 0.033363f, -0.796921f, 1.414304f, + -0.282185f, -2.673571f, -0.280994f, 0.382658f, -0.350902f, 0.227926f, + 0.062602f, -1.000199f, 0.433731f, 1.176439f, -0.163216f, -0.229015f, + -0.640098f, -0.438852f, -0.947700f, 2.203434f, +}; + +static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = { + -0.875510f, + 0.982408f, + 0.560854f, + -0.415209f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_16_layer0, + vp9_rect_part_nn_weights_16_layer1, + }, + { + vp9_rect_part_nn_bias_16_layer0, + vp9_rect_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * NODES] = { + -0.147312f, -0.753248f, 0.540206f, 0.661415f, 0.484117f, -0.341609f, + 0.016183f, 0.064177f, 0.781580f, 0.902232f, -0.505342f, 0.325183f, + -0.231072f, -0.120107f, -0.076216f, 0.120038f, 0.403695f, -0.463301f, + -0.192158f, 0.407442f, 0.106633f, 1.072371f, 
-0.446779f, 0.467353f, + 0.318812f, -0.505996f, -0.008768f, -0.239598f, 0.085480f, 0.284640f, + -0.365045f, -0.048083f, -0.112090f, -0.067089f, 0.304138f, -0.228809f, + 0.383651f, -0.196882f, 0.477039f, -0.217978f, -0.506931f, -0.125675f, + 0.050456f, 1.086598f, 0.732128f, 0.326941f, 0.103952f, 0.121769f, + -0.154487f, -0.255514f, 0.030591f, -0.382797f, -0.019981f, -0.326570f, + 0.149691f, -0.435633f, -0.070795f, 0.167691f, 0.251413f, -0.153405f, + 0.160347f, 0.455107f, -0.968580f, -0.575879f, 0.623115f, -0.069793f, + -0.379768f, -0.965807f, -0.062057f, 0.071312f, 0.457098f, 0.350372f, + -0.460659f, -0.985393f, 0.359963f, -0.093677f, 0.404272f, -0.326896f, + -0.277752f, 0.609322f, -0.114193f, -0.230701f, 0.089208f, 0.645381f, + 0.494485f, 0.467876f, -0.166187f, 0.251044f, -0.394661f, 0.192895f, + -0.344777f, -0.041893f, -0.111163f, 0.066347f, 0.378158f, -0.455465f, + 0.339839f, -0.418207f, -0.356515f, -0.227536f, -0.211091f, -0.122945f, + 0.361772f, -0.338095f, 0.004564f, -0.398510f, 0.060876f, -2.132504f, + -0.086776f, -0.029166f, 0.039241f, 0.222534f, -0.188565f, -0.288792f, + -0.160789f, -0.123905f, 0.397916f, -0.063779f, 0.167210f, -0.445004f, + 0.056889f, 0.207280f, 0.000101f, 0.384507f, -1.721239f, -2.036402f, + -2.084403f, -2.060483f, +}; + +static const float vp9_rect_part_nn_bias_32_layer0[NODES] = { + -0.859251f, -0.109938f, 0.091838f, 0.187817f, -0.728265f, 0.253080f, + 0.000000f, -0.357195f, -0.031290f, -1.373237f, -0.761086f, 0.000000f, + -0.024504f, 1.765711f, 0.000000f, 1.505390f, +}; + +static const float vp9_rect_part_nn_weights_32_layer1[NODES * LABELS] = { + 0.680940f, 1.367178f, 0.403075f, 0.029957f, 0.500917f, 1.407776f, + -0.354002f, 0.011667f, 1.663767f, 0.959155f, 0.428323f, -0.205345f, + -0.081850f, -3.920103f, -0.243802f, -4.253933f, -0.034020f, -1.361057f, + 0.128236f, -0.138422f, -0.025790f, -0.563518f, -0.148715f, -0.344381f, + -1.677389f, -0.868332f, -0.063792f, 0.052052f, 0.359591f, 2.739808f, + -0.414304f, 3.036597f, -0.075368f, -1.019680f, 0.642501f, 0.209779f, + -0.374539f, -0.718294f, -0.116616f, -0.043212f, -1.787809f, -0.773262f, + 0.068734f, 0.508309f, 0.099334f, 1.802239f, -0.333538f, 2.708645f, + -0.447682f, -2.355555f, -0.506674f, -0.061028f, -0.310305f, -0.375475f, + 0.194572f, 0.431788f, -0.789624f, -0.031962f, 0.358353f, 0.382937f, + 0.232002f, 2.321813f, -0.037523f, 2.104652f, +}; + +static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = { + -0.693383f, + 0.773661f, + 0.426878f, + -0.070619f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_32_layer0, + vp9_rect_part_nn_weights_32_layer1, + }, + { + vp9_rect_part_nn_bias_32_layer0, + vp9_rect_part_nn_bias_32_layer1, + }, +}; +#undef NODES + +#define NODES 24 +static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * NODES] = { + 0.024671f, -0.220610f, -0.284362f, -0.069556f, -0.315700f, 0.187861f, + 0.139782f, 0.063110f, 0.796561f, 0.172868f, -0.662194f, -1.393074f, + 0.085003f, 0.393381f, 0.358477f, -0.187268f, -0.370745f, 0.218287f, + 0.027271f, -0.254089f, -0.048236f, -0.459137f, 0.253171f, 0.122598f, + -0.550107f, -0.568456f, 0.159866f, -0.246534f, 0.096384f, -0.255460f, + 0.077864f, -0.334837f, 0.026921f, -0.697252f, 0.345262f, 1.343578f, + 0.815984f, 1.118211f, 1.574016f, 0.578476f, -0.285967f, -0.508672f, + 0.118137f, 0.037695f, 1.540510f, 1.256648f, 1.163819f, 1.172027f, + 0.661551f, -0.111980f, -0.434204f, -0.894217f, 
0.570524f, 0.050292f, + -0.113680f, 0.000784f, -0.211554f, -0.369394f, 0.158306f, -0.512505f, + -0.238696f, 0.091498f, -0.448490f, -0.491268f, -0.353112f, -0.303315f, + -0.428438f, 0.127998f, -0.406790f, -0.401786f, -0.279888f, -0.384223f, + 0.026100f, 0.041621f, -0.315818f, -0.087888f, 0.353497f, 0.163123f, + -0.380128f, -0.090334f, -0.216647f, -0.117849f, -0.173502f, 0.301871f, + 0.070854f, 0.114627f, -0.050545f, -0.160381f, 0.595294f, 0.492696f, + -0.453858f, -1.154139f, 0.126000f, 0.034550f, 0.456665f, -0.236618f, + -0.112640f, 0.050759f, -0.449162f, 0.110059f, 0.147116f, 0.249358f, + -0.049894f, 0.063351f, -0.004467f, 0.057242f, -0.482015f, -0.174335f, + -0.085617f, -0.333808f, -0.358440f, -0.069006f, 0.099260f, -1.243430f, + -0.052963f, 0.112088f, -2.661115f, -2.445893f, -2.688174f, -2.624232f, + 0.030494f, 0.161311f, 0.012136f, 0.207564f, -2.776856f, -2.791940f, + -2.623962f, -2.918820f, 1.231619f, -0.376692f, -0.698078f, 0.110336f, + -0.285378f, 0.258367f, -0.180159f, -0.376608f, -0.034348f, -0.130206f, + 0.160020f, 0.852977f, 0.580573f, 1.450782f, 1.357596f, 0.787382f, + -0.544004f, -0.014795f, 0.032121f, -0.557696f, 0.159994f, -0.540908f, + 0.180380f, -0.398045f, 0.705095f, 0.515103f, -0.511521f, -1.271374f, + -0.231019f, 0.423647f, 0.064907f, -0.255338f, -0.877748f, -0.667205f, + 0.267847f, 0.135229f, 0.617844f, 1.349849f, 1.012623f, 0.730506f, + -0.078571f, 0.058401f, 0.053221f, -2.426146f, -0.098808f, -0.138508f, + -0.153299f, 0.149116f, -0.444243f, 0.301807f, 0.065066f, 0.092929f, + -0.372784f, -0.095540f, 0.192269f, 0.237894f, 0.080228f, -0.214074f, + -0.011426f, -2.352367f, -0.085394f, -0.190361f, -0.001177f, 0.089197f, +}; + +static const float vp9_rect_part_nn_bias_64_layer0[NODES] = { + 0.000000f, -0.057652f, -0.175413f, -0.175389f, -1.084097f, -1.423801f, + -0.076307f, -0.193803f, 0.000000f, -0.066474f, -0.050318f, -0.019832f, + -0.038814f, -0.144184f, 2.652451f, 2.415006f, 0.197464f, -0.729842f, + -0.173774f, 0.239171f, 0.486425f, 2.463304f, -0.175279f, 2.352637f, +}; + +static const float vp9_rect_part_nn_weights_64_layer1[NODES * LABELS] = { + -0.063237f, 1.925696f, -0.182145f, -0.226687f, 0.602941f, -0.941140f, + 0.814598f, -0.117063f, 0.282988f, 0.066369f, 0.096951f, 1.049735f, + -0.188188f, -0.281227f, -4.836746f, -5.047797f, 0.892358f, 0.417145f, + -0.279849f, 1.335945f, 0.660338f, -2.757938f, -0.115714f, -1.862183f, + -0.045980f, -1.597624f, -0.586822f, -0.615589f, -0.330537f, 1.068496f, + -0.167290f, 0.141290f, -0.112100f, 0.232761f, 0.252307f, -0.399653f, + 0.353118f, 0.241583f, 2.635241f, 4.026119f, -1.137327f, -0.052446f, + -0.139814f, -1.104256f, -0.759391f, 2.508457f, -0.526297f, 2.095348f, + -0.444473f, -1.090452f, 0.584122f, 0.468729f, -0.368865f, 1.041425f, + -1.079504f, 0.348837f, 0.390091f, 0.416191f, 0.212906f, -0.660255f, + 0.053630f, 0.209476f, 3.595525f, 2.257293f, -0.514030f, 0.074203f, + -0.375862f, -1.998307f, -0.930310f, 1.866686f, -0.247137f, 1.087789f, + 0.100186f, 0.298150f, 0.165265f, 0.050478f, 0.249167f, 0.371789f, + -0.294497f, 0.202954f, 0.037310f, 0.193159f, 0.161551f, 0.301597f, + 0.299286f, 0.185946f, 0.822976f, 2.066130f, -1.724588f, 0.055977f, + -0.330747f, -0.067747f, -0.475801f, 1.555958f, -0.025808f, -0.081516f, +}; + +static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = { + -0.090723f, + 0.894968f, + 0.844754f, + -3.496194f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + 
vp9_rect_part_nn_weights_64_layer0, + vp9_rect_part_nn_weights_64_layer1, + }, + { + vp9_rect_part_nn_bias_64_layer0, + vp9_rect_part_nn_bias_64_layer1, + }, +}; +#undef FEATURES +#undef LABELS +#undef NODES + +#define FEATURES 7 +// Partition pruning model(neural nets). +static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = { + -3.571348f, 0.014835f, -3.255393f, -0.098090f, -0.013120f, 0.000221f, + 0.056273f, 0.190179f, -0.268130f, -1.828242f, -0.010655f, 0.937244f, + -0.435120f, 0.512125f, 1.610679f, 0.190816f, -0.799075f, -0.377348f, + -0.144232f, 0.614383f, -0.980388f, 1.754150f, -0.185603f, -0.061854f, + -0.807172f, 1.240177f, 1.419531f, -0.438544f, -5.980774f, 0.139045f, + -0.032359f, -0.068887f, -1.237918f, 0.115706f, 0.003164f, 2.924212f, + 1.246838f, -0.035833f, 0.810011f, -0.805894f, 0.010966f, 0.076463f, + -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f, + -0.168961f, -3.326450f, -2.731094f, 0.002518f, 0.018840f, -1.656815f, + 0.068039f, 0.010586f, +}; + +static const float vp9_partition_nn_bias_64x64_layer0[8] = { + -3.469882f, 0.683989f, 0.194010f, 0.313782f, + -3.153335f, 2.245849f, -1.946190f, -3.740020f, +}; + +static const float vp9_partition_nn_weights_64x64_layer1[8] = { + -8.058566f, 0.108306f, -0.280620f, -0.818823f, + -6.445117f, 0.865364f, -1.127127f, -8.808660f, +}; + +static const float vp9_partition_nn_bias_64x64_layer1[1] = { + 6.46909416f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_64x64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_64x64_layer0, + vp9_partition_nn_weights_64x64_layer1, + }, + { + vp9_partition_nn_bias_64x64_layer0, + vp9_partition_nn_bias_64x64_layer1, + }, +}; + +static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = { + -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f, 0.027221f, + -0.039137f, -0.907724f, -3.151662f, 0.007106f, 0.018726f, -0.534928f, + 0.022744f, 0.000159f, -1.717189f, -3.229031f, -0.027311f, 0.269863f, + -0.400747f, -0.394366f, -0.108878f, 0.603027f, 0.455369f, -0.197170f, + 1.241746f, -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f, + -0.138347f, -0.030754f, -0.200774f, 0.453795f, 0.055625f, -3.163116f, + -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f, -0.036439f, + -0.801228f, 0.313409f, -0.159942f, 0.031267f, 0.886454f, -1.531644f, + -0.089655f, 0.037683f, -0.163441f, -0.130454f, -0.058344f, 0.060011f, + 0.275387f, 1.552226f, +}; + +static const float vp9_partition_nn_bias_32x32_layer0[8] = { + -0.838372f, -2.609089f, -0.055763f, 1.329485f, + -1.297638f, -2.636622f, -0.826909f, 1.012644f, +}; + +static const float vp9_partition_nn_weights_32x32_layer1[8] = { + -1.792632f, -7.322353f, -0.683386f, 0.676564f, + -1.488118f, -7.527719f, 1.240163f, 0.614309f, +}; + +static const float vp9_partition_nn_bias_32x32_layer1[1] = { + 4.97422546f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_32x32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_32x32_layer0, + vp9_partition_nn_weights_32x32_layer1, + }, + { + vp9_partition_nn_bias_32x32_layer0, + vp9_partition_nn_bias_32x32_layer1, + }, +}; + +static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = { + -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f, + 0.130891f, -3.096753f, 0.174968f, -0.188769f, -0.640796f, 1.305661f, + 1.700638f, -0.073806f, -4.006781f, 
-1.630999f, -0.064863f, -0.086410f, + -0.148617f, 0.172733f, -0.018619f, 2.152595f, 0.778405f, -0.156455f, + 0.612995f, -0.467878f, 0.152022f, -0.236183f, 0.339635f, -0.087119f, + -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f, -0.793705f, + -6.399260f, 0.010624f, -0.064199f, -0.650621f, 0.338087f, -0.001531f, + 1.023655f, -3.700272f, -0.055281f, -0.386884f, 0.375504f, -0.898678f, + 0.281156f, -0.314611f, 0.863354f, -0.040582f, -0.145019f, 0.029329f, + -2.197880f, -0.108733f, +}; + +static const float vp9_partition_nn_bias_16x16_layer0[8] = { + 0.411516f, -2.143737f, -3.693192f, 2.123142f, + -1.356910f, -3.561016f, -0.765045f, -2.417082f, +}; + +static const float vp9_partition_nn_weights_16x16_layer1[8] = { + -0.619755f, -2.202391f, -4.337171f, 0.611319f, + 0.377677f, -4.998723f, -1.052235f, 1.949922f, +}; + +static const float vp9_partition_nn_bias_16x16_layer1[1] = { + 3.20981717f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_16x16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_16x16_layer0, + vp9_partition_nn_weights_16x16_layer1, + }, + { + vp9_partition_nn_bias_16x16_layer0, + vp9_partition_nn_bias_16x16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 6 +static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f, + -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f, + -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f, + -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f, + 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f, + -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f, + -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f, + -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f, +}; + +static const float vp9_var_part_nn_bias_64_layer0[8] = { + -0.044396f, -0.938166f, 0.000000f, -0.916375f, + 1.242299f, 0.000000f, -0.405734f, 0.014206f, +}; + +static const float vp9_var_part_nn_weights_64_layer1[8] = { + 1.635945f, 0.979557f, 0.455315f, 1.197199f, + -2.251024f, -0.464953f, 1.378676f, -0.111927f, +}; + +static const float vp9_var_part_nn_bias_64_layer1[1] = { + -0.37972447f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_64_layer0, + vp9_var_part_nn_weights_64_layer1, + }, + { + vp9_var_part_nn_bias_64_layer0, + vp9_var_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f, + 0.323736f, -0.510199f, 2.960693f, 2.937661f, 2.888476f, 2.938315f, + -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f, + 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f, + -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f, + 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f, + -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f, + 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f, +}; + +static const float vp9_var_part_nn_bias_32_layer0[8] = { + 0.368242f, 0.736617f, 0.000000f, 0.757287f, + 0.000000f, 0.613248f, -0.776390f, 0.928497f, +}; + +static const float vp9_var_part_nn_weights_32_layer1[8] = { + 0.939884f, 
-2.420850f, -0.410489f, -0.186690f, + 0.063287f, -0.522011f, 0.484527f, -0.639625f, +}; + +static const float vp9_var_part_nn_bias_32_layer1[1] = { + -0.6455006f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_32_layer0, + vp9_var_part_nn_weights_32_layer1, + }, + { + vp9_var_part_nn_bias_32_layer0, + vp9_var_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f, + -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f, + -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f, + 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f, + 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f, + -0.463801f, 0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f, + -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f, + 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f, +}; + +static const float vp9_var_part_nn_bias_16_layer0[8] = { + 2.901234f, -1.940932f, -0.198970f, -0.406524f, + 0.059422f, -1.879207f, -0.232340f, 2.979821f, +}; + +static const float vp9_var_part_nn_weights_16_layer1[8] = { + -0.528731f, 0.375234f, -0.088422f, 0.668629f, + 0.870449f, 0.578735f, 0.546103f, -1.957207f, +}; + +static const float vp9_var_part_nn_bias_16_layer1[1] = { + -1.95769405f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_16_layer0, + vp9_var_part_nn_weights_16_layer1, + }, + { + vp9_var_part_nn_bias_16_layer0, + vp9_var_part_nn_bias_16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 12 +#define LABELS 1 +#define NODES 8 +static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = { + -0.609728f, -0.409099f, -0.472449f, 0.183769f, -0.457740f, 0.081089f, + 0.171003f, 0.578696f, -0.019043f, -0.856142f, 0.557369f, -1.779424f, + -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f, + 0.200620f, 0.038013f, -0.430093f, 0.235083f, -0.487442f, 0.424814f, + -0.232758f, -0.442943f, 0.229397f, -0.540301f, -0.648421f, -0.649747f, + -0.171638f, 0.603824f, 0.468497f, -0.421580f, 0.178840f, -0.533838f, + -0.029471f, -0.076296f, 0.197426f, -0.187908f, -0.003950f, -0.065740f, + 0.085165f, -0.039674f, -5.640702f, 1.909538f, -1.434604f, 3.294606f, + -0.788812f, 0.196864f, 0.057012f, -0.019757f, 0.336233f, 0.075378f, + 0.081503f, 0.491864f, -1.899470f, -1.764173f, -1.888137f, -1.762343f, + 0.845542f, 0.202285f, 0.381948f, -0.150996f, 0.556893f, -0.305354f, + 0.561482f, -0.021974f, -0.703117f, 0.268638f, -0.665736f, 1.191005f, + -0.081568f, -0.115653f, 0.272029f, -0.140074f, 0.072683f, 0.092651f, + -0.472287f, -0.055790f, -0.434425f, 0.352055f, 0.048246f, 0.372865f, + 0.111499f, -0.338304f, 0.739133f, 0.156519f, -0.594644f, 0.137295f, + 0.613350f, -0.165102f, -1.003731f, 0.043070f, -0.887896f, -0.174202f, +}; + +static const float vp9_part_split_nn_bias_64_layer0[NODES] = { + 1.182714f, 0.000000f, 0.902019f, 0.953115f, + -1.372486f, -1.288740f, -0.155144f, -3.041362f, +}; + +static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = { + 0.841214f, 0.456016f, 0.869270f, 1.692999f, + -1.700494f, -0.911761f, 0.030111f, -1.447548f, +}; + +static const float 
vp9_part_split_nn_bias_64_layer1[LABELS] = { + 1.17782545f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_64_layer0, + vp9_part_split_nn_weights_64_layer1, + }, + { + vp9_part_split_nn_bias_64_layer0, + vp9_part_split_nn_bias_64_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = { + -0.105488f, -0.218662f, 0.010980f, -0.226979f, 0.028076f, 0.743430f, + 0.789266f, 0.031907f, -1.464200f, 0.222336f, -1.068493f, -0.052712f, + -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f, + 0.271346f, 0.133005f, 1.674203f, 0.689567f, 0.657133f, 0.283524f, + 0.115529f, 0.738327f, 0.317184f, -0.179736f, 0.403691f, 0.679350f, + 0.048925f, 0.271338f, -1.538921f, -0.900737f, -1.377845f, 0.084245f, + 0.803122f, -0.107806f, 0.103045f, -0.023335f, -0.098116f, -0.127809f, + 0.037665f, -0.523225f, 1.622185f, 1.903999f, 1.358889f, 1.680785f, + 0.027743f, 0.117906f, -0.158810f, 0.057775f, 0.168257f, 0.062414f, + 0.086228f, -0.087381f, -3.066082f, 3.021855f, -4.092155f, 2.550104f, + -0.230022f, -0.207445f, -0.000347f, 0.034042f, 0.097057f, 0.220088f, + -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f, 2.643355f, + 0.319912f, 0.585531f, -1.018225f, -0.699606f, 1.026490f, 0.169952f, + -0.093579f, -0.142352f, -0.107256f, 0.059598f, 0.043190f, 0.507543f, + -0.138617f, 0.030197f, 0.059574f, -0.634051f, -0.586724f, -0.148020f, + -0.334380f, 0.459547f, 1.620600f, 0.496850f, 0.639480f, -0.465715f, +}; + +static const float vp9_part_split_nn_bias_32_layer0[NODES] = { + -1.125885f, 0.753197f, -0.825808f, 0.004839f, + 0.583920f, 0.718062f, 0.976741f, 0.796188f, +}; + +static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = { + -0.458745f, 0.724624f, -0.479720f, -2.199872f, + 1.162661f, 1.194153f, -0.716896f, 0.824080f, +}; + +static const float vp9_part_split_nn_bias_32_layer1[LABELS] = { + 0.71644074f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_32_layer0, + vp9_part_split_nn_weights_32_layer1, + }, + { + vp9_part_split_nn_bias_32_layer0, + vp9_part_split_nn_bias_32_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = { + -0.003629f, -0.046852f, 0.220428f, -0.033042f, 0.049365f, 0.112818f, + -0.306149f, -0.005872f, 1.066947f, -2.290226f, 2.159505f, -0.618714f, + -0.213294f, 0.451372f, -0.199459f, 0.223730f, -0.321709f, 0.063364f, + 0.148704f, -0.293371f, 0.077225f, -0.421947f, -0.515543f, -0.240975f, + -0.418516f, 1.036523f, -0.009165f, 0.032484f, 1.086549f, 0.220322f, + -0.247585f, -0.221232f, -0.225050f, 0.993051f, 0.285907f, 1.308846f, + 0.707456f, 0.335152f, 0.234556f, 0.264590f, -0.078033f, 0.542226f, + 0.057777f, 0.163471f, 0.039245f, -0.725960f, 0.963780f, -0.972001f, + 0.252237f, -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f, + -0.621108f, 0.486405f, -0.221923f, 1.519426f, -0.857871f, 0.411595f, + 0.947188f, 0.203339f, 0.174526f, 0.016382f, 0.256879f, 0.049818f, + 0.057836f, -0.659096f, 0.459894f, 0.174695f, 0.379359f, 0.062530f, + -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f, + -0.109342f, 0.002455f, -0.078746f, -0.391871f, 0.149892f, -0.239615f, + -0.520709f, 0.118568f, -0.437975f, 0.118116f, -0.565426f, -0.206446f, + 0.113407f, 
0.558894f, 0.534627f, 1.154350f, -0.116833f, 1.723311f, +}; + +static const float vp9_part_split_nn_bias_16_layer0[NODES] = { + 0.013109f, -0.034341f, 0.679845f, -0.035781f, + -0.104183f, 0.098055f, -0.041130f, 0.160107f, +}; + +static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = { + 1.499564f, -0.403259f, 1.366532f, -0.469868f, + 0.482227f, -2.076697f, 0.527691f, 0.540495f, +}; + +static const float vp9_part_split_nn_bias_16_layer1[LABELS] = { + 0.01134653f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_16_layer0, + vp9_part_split_nn_weights_16_layer1, + }, + { + vp9_part_split_nn_bias_16_layer0, + vp9_part_split_nn_bias_16_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = { + -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f, + -0.589145f, 0.203704f, -0.051007f, -0.113769f, -0.477511f, -0.122603f, + -1.329890f, 1.403386f, 0.199636f, -0.161139f, 2.182090f, -0.014307f, + 0.015755f, -0.208468f, 0.884353f, 0.815920f, 0.632464f, 0.838225f, + 1.369483f, -0.029068f, 0.570213f, -0.573546f, 0.029617f, 0.562054f, + -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f, + 0.173047f, -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f, + -0.406704f, 0.411230f, -0.144023f, 0.745651f, -0.595091f, 0.111787f, + 0.840651f, 0.030123f, -0.242155f, 0.101486f, -0.017889f, -0.254467f, + -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f, + 1.623949f, -0.286369f, 0.170976f, 0.016442f, -0.598353f, -0.038540f, + 0.202597f, -0.933582f, 0.599510f, 0.362273f, 0.577722f, 0.477603f, + 0.767097f, 0.431532f, 0.457034f, 0.223279f, 0.381349f, 0.033777f, + 0.423923f, -0.664762f, 0.385662f, 0.075744f, 0.182681f, 0.024118f, + 0.319408f, -0.528864f, 0.976537f, -0.305971f, -0.189380f, -0.241689f, + -1.318092f, 0.088647f, -0.109030f, -0.945654f, 1.082797f, 0.184564f, +}; + +static const float vp9_part_split_nn_bias_8_layer0[NODES] = { + -0.237472f, 2.051396f, 0.297062f, -0.730194f, + 0.060472f, -0.565959f, 0.560869f, -0.395448f, +}; + +static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = { + 0.568121f, 1.575915f, -0.544309f, 0.751595f, + -0.117911f, -1.340730f, -0.739671f, 0.661216f, +}; + +static const float vp9_part_split_nn_bias_8_layer1[LABELS] = { + -0.63375306f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_8_layer0, + vp9_part_split_nn_weights_8_layer1, + }, + { + vp9_part_split_nn_bias_8_layer0, + vp9_part_split_nn_bias_8_layer1, + }, +}; +#undef NODES +#undef FEATURES +#undef LABELS + +// Partition pruning model(linear). 
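The mean/std/weight tables that follow hold three 8-entry feature contexts back to back (24 floats each). A hedged sketch of how such a linear pruner would score a block: z-score each raw feature against the matching mean/std entry, then take the dot product with the weights. The zero-std slots are guarded since they appear to mark unused features; selecting a context by an offset of 0, 8, or 16 into the tables is an assumption about the layout.

/* Hypothetical linear-pruning scorer; callers would pass, e.g.,
 * &vp9_partition_feature_mean[offset], &vp9_partition_feature_std[offset]
 * and &vp9_partition_linear_weights[offset] from the tables below. */
static float linear_prune_score_sketch(const float *raw, const float *mean,
                                       const float *std, const float *w) {
  float score = 0.0f;
  int i;
  for (i = 0; i < 8; ++i) {
    const float f = (std[i] > 0.0f) ? (raw[i] - mean[i]) / std[i] : raw[i];
    score += w[i] * f;
  }
  return score;
}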
+static const float vp9_partition_feature_mean[24] = { + 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, + 689.413511f, 162.027012f, 1.478213f, 0.0f, + 135382.260230f, 912738.513263f, 28.845217f, 1.515230f, + 544.158492f, 131.807995f, 1.436863f, 0.0f, + 43682.377587f, 208131.711766f, 28.084737f, 1.356677f, + 138.254122f, 119.522553f, 1.252322f, 0.0f, +}; + +static const float vp9_partition_feature_std[24] = { + 673689.212982f, 5996652.516628f, 0.024449f, 1.989792f, + 985.880847f, 0.014638f, 2.001898f, 0.0f, + 208798.775332f, 1812548.443284f, 0.018693f, 1.838009f, + 396.986910f, 0.015657f, 1.332541f, 0.0f, + 55888.847031f, 448587.962714f, 0.017900f, 1.904776f, + 98.652832f, 0.016598f, 1.320992f, 0.0f, +}; + +// Error tolerance: 0.01%-0.05%-0.1% +static const float vp9_partition_linear_weights[24] = { + 0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f, + 0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f, + 0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f, + 0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.c b/libs/libvpx/vp9/encoder/vp9_picklpf.c index 1c2c55b9e4..3a620df693 100644 --- a/libs/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libs/libvpx/vp9/encoder/vp9_picklpf.c @@ -24,10 +24,20 @@ #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_quantize.h" +static unsigned int get_section_intra_rating(const VP9_COMP *cpi) { + unsigned int section_intra_rating; + + section_intra_rating = (cpi->common.frame_type == KEY_FRAME) + ? cpi->twopass.key_frame_section_intra_rating + : cpi->twopass.section_intra_rating; + + return section_intra_rating; +} + static int get_max_filter_level(const VP9_COMP *cpi) { if (cpi->oxcf.pass == 2) { - return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 - : MAX_LOOP_FILTER; + unsigned int section_intra_rating = get_section_intra_rating(cpi); + return section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; } else { return MAX_LOOP_FILTER; } @@ -81,6 +91,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; + unsigned int section_intra_rating = get_section_intra_rating(cpi); // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); @@ -99,8 +110,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Bias against raising loop filter in favor of lowering it. int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) - bias = (bias * cpi->twopass.section_intra_rating) / 20; + if ((cpi->oxcf.pass == 2) && (section_intra_rating < 20)) + bias = (bias * section_intra_rating) / 20; // yx, bias less for large block size if (cm->tx_mode != ONLY_4X4) bias >>= 1; @@ -150,7 +161,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; struct loopfilter *const lf = &cm->lf; - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 
0 : cpi->oxcf.sharpness; + lf->sharpness_level = 0; if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { lf->filter_level = 0; @@ -169,20 +180,17 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, case VPX_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; - default: - assert(0 && - "bit_depth should be VPX_BITS_8, VPX_BITS_10 " - "or VPX_BITS_12"); - return; } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); #endif // CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + (cm->base_qindex < 200 || cm->width * cm->height > 320 * 240) && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) filt_guess = 5 * filt_guess >> 3; diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.h b/libs/libvpx/vp9/encoder/vp9_picklpf.h index cecca058b4..8881b44daa 100644 --- a/libs/libvpx/vp9/encoder/vp9_picklpf.h +++ b/libs/libvpx/vp9/encoder/vp9_picklpf.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_PICKLPF_H_ -#define VP9_ENCODER_VP9_PICKLPF_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_ +#define VPX_VP9_ENCODER_VP9_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -26,4 +26,4 @@ void vp9_pick_filter_level(const struct yv12_buffer_config *sd, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKLPF_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.c b/libs/libvpx/vp9/encoder/vp9_pickmode.c index f2f323a282..513b9f678c 100644 --- a/libs/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libs/libvpx/vp9/encoder/vp9_pickmode.c @@ -41,6 +41,17 @@ typedef struct { int in_use; } PRED_BUFFER; +typedef struct { + PRED_BUFFER *best_pred; + PREDICTION_MODE best_mode; + TX_SIZE best_tx_size; + TX_SIZE best_intra_tx_size; + MV_REFERENCE_FRAME best_ref_frame; + MV_REFERENCE_FRAME best_second_ref_frame; + uint8_t best_mode_skip_txfm; + INTERP_FILTER best_pred_filter; +} BEST_PICKMODE; + static const int pos_shift_16x16[4][4] = { { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } }; @@ -222,13 +233,22 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } if (rv && search_subpel) { - int subpel_force_stop = cpi->sf.mv.subpel_force_stop; - if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; + SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); *rate_mv = 
vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -326,6 +346,82 @@ static int ac_thr_factor(const int speed, const int width, const int height, return 1; } +static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCKD *const xd, unsigned int var, + unsigned int sse, int64_t ac_thr, + unsigned int source_variance, int is_intra) { + // TODO(marpan): Tune selection for intra-modes, screen content, etc. + TX_SIZE tx_size; + unsigned int var_thresh = is_intra ? (unsigned int)ac_thr : 1; + int limit_tx = 1; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + (source_variance == 0 || var < var_thresh)) + limit_tx = 0; + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + tx_size = TX_8X8; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && limit_tx && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16 && limit_tx) + tx_size = TX_16X16; + // For screen-content force 4X4 tx_size over 8X8, for large variance. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 && + bsize <= BLOCK_16X16 && ((var >> 5) > (unsigned int)ac_thr)) + tx_size = TX_4X4; + } else { + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + return tx_size; +} + +static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int row, col; + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, + x->skip_encode ? p->src.buf : pd->dst.buf, + x->skip_encode ? 
src_stride : dst_stride, pd->dst.buf, dst_stride, col, row, 0); + } + } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; +} + static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, @@ -342,7 +438,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, struct macroblockd_plane *const pd = &xd->plane[0]; const uint32_t dc_quant = pd->dequant[0]; const uint32_t ac_quant = pd->dequant[1]; - const int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t dc_thr = dc_quant * dc_quant >> 6; int64_t ac_thr = ac_quant * ac_quant >> 6; unsigned int var; int sum; @@ -386,26 +482,17 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.height, abs(sum) >> (bw + bh)); #endif - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - tx_size = TX_8X8; - else if (tx_size > TX_16X16) - tx_size = TX_16X16; - } else { - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } - - assert(tx_size >= TX_8X8); + tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, 0); + // The code below for setting the skip flag assumes a transform size of at least 8x8, + // so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; xd->mi[0]->tx_size = tx_size; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source && + x->source_variance == 0) + dc_thr = dc_thr << 1; + // Evaluate if the partition block is a skippable block in Y plane. { unsigned int sse16x16[16] = { 0 }; @@ -473,33 +560,29 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes.
for (i = 1; i <= 2; i++) { - if (cpi->oxcf.speed < 8 || x->color_sensitivity[i - 1]) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); - const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); - const int uv_bw = b_width_log2_lookup[uv_bsize]; - const int uv_bh = b_height_log2_lookup[uv_bsize]; - const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + - (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); - int j = i - 1; + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const int uv_bw = b_width_log2_lookup[uv_bsize]; + const int uv_bh = b_height_log2_lookup[uv_bsize]; + const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + + (uv_bh - b_height_log2_lookup[unit_size]); + const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + int j = i - 1; - vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); - flag_preduv_computed[i - 1] = 1; - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; + var_uv[j] = cpi->fn_ptr[uv_bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); - if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && - (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) - skip_uv[j] = 1; - else - break; - } else { - skip_uv[i - 1] = 1; - } + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; } // If the transforms in the YUV planes are skippable, the mode search checks @@ -543,7 +626,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, - unsigned int *sse_y) { + unsigned int *sse_y, int is_intra) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function.
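/* The model_rd_for_sb_y() note above says transform coefficients are 8 times
 * an orthogonal transform, so the stored quantizer step must be divided by 8
 * before modeling. A minimal standalone sketch of that relationship, using
 * hypothetical dequant values (not part of the patch): the q*q >> 6 skip
 * thresholds computed in model_rd_for_sb_y_large() equal the square of the
 * effective step q >> 3 whenever q is a multiple of 8. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t dc_quant = 40; /* hypothetical pd->dequant[0] */
  const uint32_t ac_quant = 48; /* hypothetical pd->dequant[1] */
  /* Thresholds in the same form as the encoder code above. */
  const int64_t dc_thr = (int64_t)dc_quant * dc_quant >> 6; /* 1600 >> 6 == 25 */
  const int64_t ac_thr = (int64_t)ac_quant * ac_quant >> 6; /* 2304 >> 6 == 36 */
  /* Effective steps: 40 >> 3 == 5 and 48 >> 3 == 6; 5*5 == 25, 6*6 == 36. */
  printf("dc: step=%u thr=%lld\n", dc_quant >> 3, (long long)dc_thr);
  printf("ac: step=%u thr=%lld\n", ac_quant >> 3, (long long)ac_thr);
  return 0;
}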
@@ -563,24 +646,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *var_y = var; *sse_y = sse; - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - xd->mi[0]->tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - xd->mi[0]->tx_size = TX_8X8; - else if (xd->mi[0]->tx_size > TX_16X16) - xd->mi[0]->tx_size = TX_16X16; - } else { - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, is_intra); // Evaluate if the partition block is a skippable block in Y plane. { @@ -641,7 +708,7 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size, int rd_computed) { + TX_SIZE tx_size, int rd_computed, int is_intra) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; @@ -658,25 +725,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; -#if CONFIG_VP9_HIGHBITDEPTH - // TODO(jingning): Implement the high bit-depth Hadamard transforms and - // remove this check condition. - // TODO(marpan): Use this path (model_rd) for 8bit under certain conditions - // for now, as the vp9_quantize_fp below for highbitdepth build is slow. - if (xd->bd != 8 || - (cpi->oxcf.speed > 5 && cpi->common.frame_type != KEY_FRAME && - bsize < BLOCK_32X32)) { - unsigned int var_y, sse_y; - (void)tx_size; - if (!rd_computed) - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, - &var_y, &sse_y); - *sse = INT_MAX; - *skippable = 0; - return; - } -#endif - if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && (bsize < BLOCK_32X32 || (cpi->use_svc && @@ -685,7 +733,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, (void)tx_size; if (!rd_computed) model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, - &var_y, &sse_y); + &var_y, &sse_y, is_intra); *sse = INT_MAX; *skippable = 0; return; @@ -695,9 +743,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, // The max tx_size passed in is TX_16X16. assert(tx_size != TX_32X32); - +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + } else { + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); +#endif *skippable = 1; // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. 
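/* A self-contained sketch (not part of the patch) of the subtract dispatch
 * that the block_yrd() hunk above introduces: in a CONFIG_VP9_HIGHBITDEPTH
 * build the frame-buffer flags now select the high-bit-depth or 8-bit
 * subtract path at runtime, replacing the removed early return. The flag
 * value and stub bodies here are illustrative assumptions; the real routines
 * are vpx_subtract_block() and vpx_highbd_subtract_block() in vpx_dsp. */
#include <stdio.h>

#define YV12_FLAG_HIGHBITDEPTH 8 /* assumed value, for illustration only */

static void subtract_8bit(int rows, int cols) {
  printf("8-bit subtract, %dx%d\n", rows, cols);
}

static void subtract_highbd(int rows, int cols, int bd) {
  printf("high-bit-depth subtract, %dx%d at %d bits\n", rows, cols, bd);
}

static void subtract_dispatch(unsigned int buf_flags, int bh, int bw, int bd) {
#if CONFIG_VP9_HIGHBITDEPTH
  /* Per-buffer dispatch, as the patched block_yrd() does. */
  if (buf_flags & YV12_FLAG_HIGHBITDEPTH)
    subtract_highbd(bh, bw, bd);
  else
    subtract_8bit(bh, bw);
#else
  /* 8-bit-only build: always take the 8-bit path. */
  (void)buf_flags;
  (void)bd;
  subtract_8bit(bh, bw);
#endif
}

int main(void) {
  subtract_dispatch(YV12_FLAG_HIGHBITDEPTH, 16, 16, 10); /* highbd path */
  subtract_dispatch(0, 16, 16, 8);                       /* 8-bit path */
  return 0;
}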
@@ -726,13 +784,13 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } *skippable &= (*eob == 0); eob_cost += 1; @@ -876,6 +934,7 @@ static void encode_breakout_test( // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) @@ -981,8 +1040,8 @@ static void estimate_block_intra(int plane, int block, int row, int col, VP9_COMP *const cpi = args->cpi; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; uint8_t *const src_buf_base = p->src.buf; uint8_t *const dst_buf_base = pd->dst.buf; @@ -992,8 +1051,8 @@ static void estimate_block_intra(int plane, int block, int row, int col, (void)block; - p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; - pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; // Use source buffer as an approximation for the fully reconstructed buffer. vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize], tx_size, args->mode, x->skip_encode ? p->src.buf : pd->dst.buf, @@ -1002,13 +1061,12 @@ static void estimate_block_intra(int plane, int block, int row, int col, if (plane == 0) { int64_t this_sse = INT64_MAX; - // TODO(jingning): This needs further refactoring. 
block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, - VPXMIN(tx_size, TX_16X16), 0); + VPXMIN(tx_size, TX_16X16), 0, 1); } else { unsigned int var = 0; unsigned int sse = 0; - model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane, + model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &var, &sse, plane, plane); } @@ -1292,18 +1350,16 @@ static void vp9_pickmode_ctx_den_update( VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, int ref_frame_cost[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred, - TX_SIZE best_tx_size, PREDICTION_MODE best_mode, - MV_REFERENCE_FRAME best_ref_frame, INTERP_FILTER best_pred_filter, - uint8_t best_mode_skip_txfm) { + BEST_PICKMODE *bp) { ctx_den->zero_last_cost_orig = zero_last_cost_orig; ctx_den->ref_frame_cost = ref_frame_cost; ctx_den->frame_mv = frame_mv; ctx_den->reuse_inter_pred = reuse_inter_pred; - ctx_den->best_tx_size = best_tx_size; - ctx_den->best_mode = best_mode; - ctx_den->best_ref_frame = best_ref_frame; - ctx_den->best_pred_filter = best_pred_filter; - ctx_den->best_mode_skip_txfm = best_mode_skip_txfm; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; } static void recheck_zeromv_after_denoising( @@ -1322,6 +1378,7 @@ static void recheck_zeromv_after_denoising( cpi->svc.number_spatial_layers == 1 && decision == FILTER_ZEROMV_BLOCK))) { // Check if we should pick ZEROMV on denoised signal. + VP9_COMMON *const cm = &cpi->common; int rate = 0; int64_t dist = 0; uint32_t var_y = UINT_MAX; @@ -1330,11 +1387,13 @@ static void recheck_zeromv_after_denoising( mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; + if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y, 0); this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] + cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]] [INTER_OFFSET(ZEROMV)]; @@ -1346,6 +1405,7 @@ static void recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1416,27 +1476,217 @@ static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, return force_skip_low_temp_var; } +static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, + int mi_row, int mi_col, PRED_BUFFER *tmp, + BLOCK_SIZE bsize, int reuse_inter_pred, + PRED_BUFFER **this_mode_pred, unsigned int *var_y, + unsigned int *sse_y, int force_smooth_filter, + int *this_early_term, int *flag_preduv_computed, + int use_model_yrd_large) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + + int pf_rate[3] = { 0 }; + int64_t pf_dist[3] = { 0 }; + int curr_rate[3] = { 0 }; + unsigned int 
pf_var[3] = { 0 }; + unsigned int pf_sse[3] = { 0 }; + TX_SIZE pf_tx_size[3] = { 0 }; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = *this_mode_pred; + uint8_t skip_txfm = SKIP_TXFM_NONE; + int best_early_term = 0; + int best_flag_preduv_computed[2] = { 0 }; + INTERP_FILTER filter_start = force_smooth_filter ? EIGHTTAP_SMOOTH : EIGHTTAP; + for (filter = filter_start; filter <= EIGHTTAP_SMOOTH; ++filter) { + int64_t cost; + mi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + // For large partition blocks, extra testing is done. + if (use_model_yrd_large) + model_rd_for_sb_y_large(cpi, bsize, x, xd, &pf_rate[filter], + &pf_dist[filter], &pf_var[filter], + &pf_sse[filter], mi_row, mi_col, this_early_term, + flag_preduv_computed); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], + &pf_var[filter], &pf_sse[filter], 0); + curr_rate[filter] = pf_rate[filter]; + pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); + cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); + pf_tx_size[filter] = mi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + best_early_term = *this_early_term; + best_flag_preduv_computed[0] = flag_preduv_computed[0]; + best_flag_preduv_computed[1] = flag_preduv_computed[1]; + + if (reuse_inter_pred) { + if (*this_mode_pred != current_pred) { + free_pred_buffer(*this_mode_pred); + *this_mode_pred = current_pred; + } + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + + if (reuse_inter_pred && *this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mi->interp_filter = best_filter; + mi->tx_size = pf_tx_size[best_filter]; + this_rdc->rate = curr_rate[best_filter]; + this_rdc->dist = pf_dist[best_filter]; + *var_y = pf_var[best_filter]; + *sse_y = pf_sse[best_filter]; + x->skip_txfm[0] = skip_txfm; + *this_early_term = best_early_term; + flag_preduv_computed[0] = best_flag_preduv_computed[0]; + flag_preduv_computed[1] = best_flag_preduv_computed[1]; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } +} + +static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][MAX_REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, + unsigned int best_sse_sofar, RD_COST *best_rdc) { + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + SPEED_FEATURES *const sf = &cpi->sf; + + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == VPX_CBR) { + int tmp_sad; + uint32_t dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + 
cpi->find_fractional_mv_step( + x, &frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + } else if (svc->use_base_mv && svc->spatial_layer_id) { + if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { + const int pre_stride = xd->plane[0].pre[0].stride; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; + const uint8_t *const pre_buf = + xd->plane[0].pre[0].buf + + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + return -1; + if (base_mv_sse < (best_sse_sofar << 1)) { + // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 1)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_mode = ZEROMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_tx_size = TX_SIZES; + bp->best_intra_tx_size = TX_SIZES; + bp->best_pred_filter = EIGHTTAP; + bp->best_mode_skip_txfm = SKIP_TXFM_NONE; + bp->best_second_ref_frame = NONE; + bp->best_pred = NULL; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const SVC *const svc = &cpi->svc; + SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - PREDICTION_MODE best_mode = ZEROMV; - MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; + + BEST_PICKMODE best_pickmode; + + MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; - TX_SIZE best_tx_size = TX_SIZES; - INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv 
frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; - uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; @@ -1451,15 +1701,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (cpi->sf.adaptive_rd_thresh_row_mt) ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) : tile_data->thresh_freq_fact[bsize]; - +#if CONFIG_VP9_TEMPORAL_DENOISING + const int denoise_recheck_zeromv = 1; +#endif INTERP_FILTER filter_ref; - const int bsl = mi_width_log2_lookup[bsize]; - const int pred_filter_search = - cm->interp_filter == SWITCHABLE - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + int pred_filter_search = cm->interp_filter == SWITCHABLE; int const_motion[MAX_REF_FRAMES] = { 0 }; const int bh = num_4x4_blocks_high_lookup[bsize] << 2; const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; @@ -1472,7 +1718,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]); #endif struct buf_2d orig_dst = pd->dst; - PRED_BUFFER *best_pred = NULL; PRED_BUFFER *this_mode_pred = NULL; const int pixels_in_block = bh * bw; int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready; @@ -1488,22 +1733,84 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX; - unsigned int thresh_svc_skip_golden = 500; + int gf_temporal_ref = 0; + int force_test_gf_zeromv = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; int denoise_svc_pickmode = 1; #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; - MV_REFERENCE_FRAME best_second_ref_frame = NONE; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; + const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + int no_scaling = 0; + int large_block = 0; + int use_model_yrd_large = 0; + unsigned int thresh_svc_skip_golden = 500; + unsigned int thresh_skip_golden = 500; + int force_smooth_filter = cpi->sf.force_smooth_interpol; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + + init_best_pickmode(&best_pickmode); + + x->encode_breakout = seg->enabled + ? 
cpi->segment_encode_breakout[mi->segment_id] + : cpi->encode_breakout; + + x->source_variance = UINT_MAX; + if (cpi->sf.default_interp_filter == BILINEAR) { + best_pickmode.best_pred_filter = BILINEAR; + filter_gf_svc = BILINEAR; + } + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) + thresh_svc_skip_golden = 0; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; + + if (!cpi->use_svc || + (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf; + struct scale_factors *const sf_golden = + &cm->frame_refs[GOLDEN_FRAME - 1].sf; + gf_temporal_ref = 1; + // For temporal long term prediction, check that the golden reference + // is the same scale as the last reference, otherwise disable. + if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) || + (sf_last->y_scale_fp != sf_golden->y_scale_fp)) { + gf_temporal_ref = 0; + } else { + if (cpi->rc.avg_frame_low_motion > 70) + thresh_svc_skip_golden = 500; + else + thresh_svc_skip_golden = 0; + } + } init_ref_frame_cost(cm, xd, ref_frame_cost); - memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); if (reuse_inter_pred) { @@ -1528,16 +1835,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; x->skip = 0; + if (cpi->sf.cb_pred_filter_search) { + const int bsl = mi_width_log2_lookup[bsize]; + pred_filter_search = cm->interp_filter == SWITCHABLE + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1 + : 0; + } // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign // filter_ref, we use a less strict condition on assigning filter_ref. // This is to reduce the probability of entering the flow of not assigning // filter_ref and then skipping the filter search.
- if (xd->above_mi && is_inter_block(xd->above_mi)) - filter_ref = xd->above_mi->interp_filter; - else if (xd->left_mi && is_inter_block(xd->left_mi)) - filter_ref = xd->left_mi->interp_filter; - else - filter_ref = cm->interp_filter; + filter_ref = cm->interp_filter; + if (cpi->sf.default_interp_filter != BILINEAR) { + if (xd->above_mi && is_inter_block(xd->above_mi)) + filter_ref = xd->above_mi->interp_filter; + else if (xd->left_mi && is_inter_block(xd->left_mi)) + filter_ref = xd->left_mi->interp_filter; + } // initialize mode decisions vp9_rd_cost_reset(&best_rdc); @@ -1558,23 +1874,24 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif // CONFIG_VP9_HIGHBITDEPTH x->source_variance = vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && mi->segment_id > 0 && + x->zero_temp_sad_source && x->source_variance == 0) { + mi->segment_id = 0; + vp9_init_plane_quantizers(cpi, x); + } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { - if (cpi->use_svc) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; - } + if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi); if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) vp9_denoiser_reset_frame_stats(ctx); } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref && !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { @@ -1601,14 +1918,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For svc mode, on spatial_layer_id > 0: if the reference has a different scale, // constrain the inter mode to only test zero motion.
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && - cpi->svc.spatial_layer_id > 0) { + svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[LAST_FRAME - 1] = 1; + if (vp9_is_scaled(sf)) { + svc_force_zero_mode[LAST_FRAME - 1] = 1; + inter_layer_ref = LAST_FRAME; + } } if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + if (vp9_is_scaled(sf)) { + svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + inter_layer_ref = GOLDEN_FRAME; + } } } @@ -1624,6 +1947,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1638,7 +1965,21 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) comp_modes = 2; + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN; this is to + // prevent the possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). + skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, @@ -1652,16 +1993,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used // an averaging filter for downsampling (phase = 8). If so, we will test - // a nonzero motion mode on the spatial (goldeen) reference. + // a nonzero motion mode on the spatial reference. // The nonzero motion is half pixel shifted to left and top (-4, -4). - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - svc_force_zero_mode[GOLDEN_FRAME - 1] && - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && + !gf_temporal_ref) { svc_mv_col = -4; svc_mv_row = -4; flag_svc_subpel = 1; } + // For SVC with quality layers, when the QP of the lower layer is lower + // than that of the current layer: force check of GF-ZEROMV before early exit + // due to skip flag. + if (svc->spatial_layer_id > 0 && no_scaling && + (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + cm->base_qindex > svc->lower_layer_qindex + 10) + force_test_gf_zeromv = 1; + + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block.
+ large_block = (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; + use_model_yrd_large = + cpi->oxcf.rc_mode == VPX_CBR && large_block && + !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && + cm->base_qindex; + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; @@ -1675,7 +2037,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int inter_mv_mode = 0; int skip_this_mv = 0; int comp_pred = 0; - int force_gf_mv = 0; + int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; second_ref_frame = NONE; @@ -1699,8 +2061,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; - if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { - force_gf_mv = 1; + if (svc->previous_frame_is_intra_only) { + if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), // otherwise set NEWMV to (svc_mv_col, svc_mv_row). if (this_mode == NEWMV) { @@ -1713,7 +2086,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (comp_pred) { - const struct segmentation *const seg = &cm->seg; if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -1722,15 +2094,33 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; } - // For SVC, skip the golden (spatial) reference search if sse of zeromv_last - // is below threshold. - if (cpi->use_svc && ref_frame == GOLDEN_FRAME && - sse_zeromv_normalized < thresh_svc_skip_golden) + // For CBR mode: skip the golden reference search if sse of zeromv_last is + // below threshold. + if (ref_frame == GOLDEN_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + ((cpi->use_svc && sse_zeromv_normalized < thresh_svc_skip_golden) || + (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { - continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + + // For screen content: if zero_temp_sad_source is computed, skip the + // non-zero motion check for stationary blocks. If the superblock is + // non-stationary then for flat blocks skip the zero last check (keep golden + // as it may be inter-layer reference). Otherwise (if zero_temp_sad_source + // is not computed) skip non-zero motion check for flat blocks. + // TODO(marpan): Compute zero_temp_sad_source per coding block.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + if ((frame_mv[this_mode][ref_frame].as_int != 0 && + x->zero_temp_sad_source) || + (frame_mv[this_mode][ref_frame].as_int == 0 && + x->source_variance == 0 && ref_frame == LAST_FRAME && + !x->zero_temp_sad_source)) + continue; + } else if (frame_mv[this_mode][ref_frame].as_int != 0 && + x->source_variance == 0) { + continue; + } } if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; @@ -1759,14 +2149,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (const_motion[ref_frame] && this_mode == NEARMV) continue; // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1780,34 +2169,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (sf->reference_masking && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { - if (usable_ref_frame < ALTREF_FRAME) { - if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { - i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) - ref_frame_skip_mask |= (1 << ref_frame); + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); } - } else if (!cpi->rc.is_src_frame_alt_ref && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == ALTREF_FRAME)) { - int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; - int ref2 = (ref_frame == ALTREF_FRAME) ? 
LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) - ref_frame_skip_mask |= (1 << ref_frame); } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; } - if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { @@ -1820,8 +2214,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; - mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 - : rd_threshes[mode_index]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding // speed with little/no subjective quality loss. @@ -1835,92 +2230,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (this_mode == NEWMV && !force_gf_mv) { - if (ref_frame > LAST_FRAME && !cpi->use_svc && - cpi->oxcf.rc_mode == VPX_CBR) { - int tmp_sad; - uint32_t dis; - int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; - - if (bsize < BLOCK_16X16) continue; - - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); - - if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; - if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) - continue; - - frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; - rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; - frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; - - cpi->find_fractional_mv_step( - x, &frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, - 0); - } else if (svc->use_base_mv && svc->spatial_layer_id) { - if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { - const int pre_stride = xd->plane[0].pre[0].stride; - unsigned int base_mv_sse = UINT_MAX; - int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; - const uint8_t *const pre_buf = - xd->plane[0].pre[0].buf + - (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + - (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, - pre_buf, pre_stride, &base_mv_sse); - - // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, - // for SVC encoding. - if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && - frame_mv[NEWMV][ref_frame].as_mv.row == 0 && - frame_mv[NEWMV][ref_frame].as_mv.col == 0) - continue; - - // Exit NEWMV search if base_mv_sse is large. - if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) - continue; - if (base_mv_sse < (best_sse_sofar << 1)) { - // Base layer mv is good. 
- // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since - // (0, 0) mode is already tested. - unsigned int base_mv_sse_normalized = - base_mv_sse >> - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && - base_mv_sse_normalized < 400 && - frame_mv[NEWMV][ref_frame].as_mv.row == 0 && - frame_mv[NEWMV][ref_frame].as_mv.col == 0) - continue; - if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 1)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 0)) { + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar, + &best_rdc)) continue; - } } // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector @@ -1978,70 +2294,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && !force_gf_mv && + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { - int pf_rate[3]; - int64_t pf_dist[3]; - int curr_rate[3]; - unsigned int pf_var[3]; - unsigned int pf_sse[3]; - TX_SIZE pf_tx_size[3]; - int64_t best_cost = INT64_MAX; - INTERP_FILTER best_filter = SWITCHABLE, filter; - PRED_BUFFER *current_pred = this_mode_pred; rd_computed = 1; - - for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { - int64_t cost; - mi->interp_filter = filter; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], - &pf_var[filter], &pf_sse[filter]); - curr_rate[filter] = pf_rate[filter]; - pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); - cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); - pf_tx_size[filter] = mi->tx_size; - if (cost < best_cost) { - best_filter = filter; - best_cost = cost; - skip_txfm = x->skip_txfm[0]; - - if (reuse_inter_pred) { - if (this_mode_pred != current_pred) { - free_pred_buffer(this_mode_pred); - this_mode_pred = current_pred; - } - current_pred = &tmp[get_pred_buffer(tmp, 3)]; - pd->dst.buf = current_pred->data; - pd->dst.stride = bw; - } - } - } - - if (reuse_inter_pred && this_mode_pred != current_pred) - free_pred_buffer(current_pred); - - mi->interp_filter = best_filter; - mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = curr_rate[best_filter]; - this_rdc.dist = pf_dist[best_filter]; - var_y = pf_var[best_filter]; - sse_y = pf_sse[best_filter]; - x->skip_txfm[0] = skip_txfm; - if (reuse_inter_pred) { - pd->dst.buf = this_mode_pred->data; - pd->dst.stride = this_mode_pred->stride; - } + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y, + force_smooth_filter, &this_early_term, + flag_preduv_computed, use_model_yrd_large); } else { - // For low motion 
content use x->sb_is_skin in addition to VeryHighSad - // for setting large_block. - const int large_block = - (x->content_state_sb == kVeryHighSad || - (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || - cpi->oxcf.speed < 7) - ? bsize > BLOCK_32X32 - : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; if (cpi->use_svc && ref_frame == GOLDEN_FRAME && @@ -2051,19 +2312,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. - if (cpi->oxcf.rc_mode == VPX_CBR && large_block && - !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && - cm->base_qindex) { + if (use_model_yrd_large) { + rd_computed = 1; model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, &this_early_term, flag_preduv_computed); } else { rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + &var_y, &sse_y, 0); } // Save normalized sse (between current and last frame) for (0, 0) motion. - if (cpi->use_svc && ref_frame == LAST_FRAME && + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) { sse_zeromv_normalized = sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -2074,8 +2334,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!this_early_term) { this_sse = (int64_t)sse_y; block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, - VPXMIN(mi->tx_size, TX_16X16), rd_computed); - + VPXMIN(mi->tx_size, TX_16X16), rd_computed, 0); x->skip_txfm[0] = is_skippable; if (is_skippable) { this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); @@ -2095,9 +2354,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_get_switchable_rate(cpi, xd); } } else { - this_rdc.rate += cm->interp_filter == SWITCHABLE - ? vp9_get_switchable_rate(cpi, xd) - : 0; + if (cm->interp_filter == SWITCHABLE) { + if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) + this_rdc.rate += vp9_get_switchable_rate(cpi, xd); + } this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } @@ -2138,7 +2398,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skipping checking: test to see if this block can be reconstructed by // prediction only. - if (cpi->allow_encode_breakout) { + if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected && + !svc->high_num_blocks_with_motion) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, &this_rdc.dist, flag_preduv_computed); @@ -2149,6 +2410,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // On spatially flat blocks for screen content: bias against zero-last + // if the sse_y is non-zero. Only on scene change or high motion frames.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + (scene_change_detected || svc->high_num_blocks_with_motion) && + ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0 && + svc->spatial_layer_id == 0 && x->source_variance == 0 && sse_y > 0) { + this_rdc.rdcost = this_rdc.rdcost << 2; + } + #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow) { @@ -2165,71 +2435,86 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; - best_mode = this_mode; - best_pred_filter = mi->interp_filter; - best_tx_size = mi->tx_size; - best_ref_frame = ref_frame; - best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; - best_second_ref_frame = second_ref_frame; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filter; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { - free_pred_buffer(best_pred); - best_pred = this_mode_pred; + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; } } else { if (reuse_inter_pred) free_pred_buffer(this_mode_pred); } - if (x->skip) break; + if (x->skip && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) + break; // If early termination flag is 1 and at least 2 modes are checked, // the mode search is terminated. - if (best_early_term && idx > 0) { + if (best_early_term && idx > 0 && !scene_change_detected && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) { x->skip = 1; break; } } - mi->mode = best_mode; - mi->interp_filter = best_pred_filter; - mi->tx_size = best_tx_size; - mi->ref_frame[0] = best_ref_frame; - mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + mi->mode = best_pickmode.best_mode; + mi->interp_filter = best_pickmode.best_pred_filter; + mi->tx_size = best_pickmode.best_tx_size; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->mv[0].as_int = + frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; - x->skip_txfm[0] = best_mode_skip_txfm; - mi->ref_frame[1] = best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; // For spatial enhancement layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference or is_key_frame is set. - if (cpi->svc.spatial_layer_id) { + // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer.
+ if (svc->spatial_layer_id && !gf_temporal_ref) { perform_intra_pred = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || - (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && - svc_force_zero_mode[best_ref_frame - 1]); + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } - if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && - cpi->rc.is_src_frame_alt_ref) + if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) || + svc->previous_frame_is_intra_only) perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0) || + (scene_change_detected && perform_intra_pred) || ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || x->content_state_sb == kVeryHighSad) && perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; + int64_t this_sse = INT64_MAX; int i; - TX_SIZE best_intra_tx_size = TX_SIZES; + PRED_BUFFER *const best_pred = best_pickmode.best_pred; TX_SIZE intra_tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16) - intra_tx_size = TX_16X16; if (reuse_inter_pred && best_pred != NULL) { if (best_pred->data == orig_dst.buf) { @@ -2249,7 +2534,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred->data, this_mode_pred->stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH - best_pred = this_mode_pred; + best_pickmode.best_pred = this_mode_pred; } } pd->dst = orig_dst; @@ -2258,8 +2543,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, const PREDICTION_MODE this_mode = intra_mode_list[i]; THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; int mode_rd_thresh = rd_threshes[mode_index]; + // For spatially flat blocks, under short_circuit_flat_blocks flag: + // only check DC mode for stationary blocks, otherwise also check + // H and V mode. if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != DC_PRED) { + ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) { continue; } @@ -2271,8 +2559,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, &rd_thresh_freq_fact[mode_index])) || (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - &rd_thresh_freq_fact[mode_index]))) - continue; + &rd_thresh_freq_fact[mode_index]))) { + // Avoid this early exit for screen on base layer, for scene + // changes or high motion frames. 
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN || + svc->spatial_layer_id > 0 || + (!scene_change_detected && !svc->high_num_blocks_with_motion)) + continue; + } mi->mode = this_mode; mi->ref_frame[0] = INTRA_FRAME; @@ -2281,8 +2575,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, args.skippable = 1; args.rdc = &this_rdc; mi->tx_size = intra_tx_size; - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra, - &args); + + compute_intra_yprediction(this_mode, bsize, x, xd); + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y, 1); + block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize, + VPXMIN(mi->tx_size, TX_16X16), 1, 1); + // Check skip cost here since skippable is not set for uv; this // mirrors the behavior used by inter if (args.skippable) { @@ -2309,36 +2608,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; - best_mode = this_mode; - best_intra_tx_size = mi->tx_size; - best_ref_frame = INTRA_FRAME; - best_second_ref_frame = NONE; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; - best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; } } // Reset mb_mode_info to the best inter mode. - if (best_ref_frame != INTRA_FRAME) { - mi->tx_size = best_tx_size; + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; } else { - mi->tx_size = best_intra_tx_size; + mi->tx_size = best_pickmode.best_intra_tx_size; } } pd->dst = orig_dst; - mi->mode = best_mode; - mi->ref_frame[0] = best_ref_frame; - mi->ref_frame[1] = best_second_ref_frame; - x->skip_txfm[0] = best_mode_skip_txfm; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; if (!is_inter_block(mi)) { mi->interp_filter = SWITCHABLE_FILTERS; } - if (reuse_inter_pred && best_pred != NULL) { + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) @@ -2367,25 +2667,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Remove this condition when the issue is resolved.
if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, - frame_mv, reuse_inter_pred, best_tx_size, - best_mode, best_ref_frame, best_pred_filter, - best_mode_skip_txfm); - vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision); - recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb, - &best_rdc, bsize, mi_row, mi_col); - best_ref_frame = ctx_den.best_ref_frame; + frame_mv, reuse_inter_pred, &best_pickmode); + vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, + gf_temporal_ref); + if (denoise_recheck_zeromv) + recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, + yv12_mb, &best_rdc, bsize, mi_row, mi_col); + best_pickmode.best_ref_frame = ctx_den.best_ref_frame; } #endif - if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + if (best_pickmode.best_ref_frame == ALTREF_FRAME || + best_pickmode.best_second_ref_frame == ALTREF_FRAME) x->arf_frame_usage++; - else if (best_ref_frame != INTRA_FRAME) + else if (best_pickmode.best_ref_frame != INTRA_FRAME) x->lastgolden_frame_usage++; if (cpi->sf.adaptive_rd_thresh) { - THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; - if (best_ref_frame == INTRA_FRAME) { + if (best_pickmode.best_ref_frame == INTRA_FRAME) { // Only consider the modes that are included in the intra_mode_list. int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); int i; @@ -2405,7 +2707,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; - if (best_ref_frame != ref_frame) continue; + if (best_pickmode.best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { if (cpi->sf.adaptive_rd_thresh_row_mt) update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, @@ -2585,9 +2887,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &dummy_dist, &x->pred_sse[ref_frame], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dummy_dist, + &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv; } else { @@ -2620,7 +2923,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #endif model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + &var_y, &sse_y, 0); this_rdc.rate += b_rate; this_rdc.rdcost = diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.h b/libs/libvpx/vp9/encoder/vp9_pickmode.h index 9aa00c4fab..15207e6cf4 100644 --- a/libs/libvpx/vp9/encoder/vp9_pickmode.h +++ b/libs/libvpx/vp9/encoder/vp9_pickmode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_PICKMODE_H_ -#define VP9_ENCODER_VP9_PICKMODE_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_ +#define VPX_VP9_ENCODER_VP9_PICKMODE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -32,4 +32,4 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKMODE_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKMODE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.c b/libs/libvpx/vp9/encoder/vp9_quantize.c index 09f61ead26..26d1434c34 100644 --- a/libs/libvpx/vp9/encoder/vp9_quantize.c +++ b/libs/libvpx/vp9/encoder/vp9_quantize.c @@ -204,10 +204,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); - case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); } #else (void)bit_depth; @@ -221,13 +220,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) { int i, q, quant; for (q = 0; q < QINDEX_RANGE; q++) { - const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); - const int qrounding_factor = q == 0 ? 64 : 48; + int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + int qrounding_factor = q == 0 ? 64 : 48; + const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7; + + if (cpi->oxcf.sharpness > 0 && q > 0) { + qzbin_factor = 64 + sharpness_adjustment; + qrounding_factor = 64 - sharpness_adjustment; + } for (i = 0; i < 2; ++i) { int qrounding_factor_fp = i == 0 ? 48 : 42; if (q == 0) qrounding_factor_fp = 64; - + if (cpi->oxcf.sharpness > 0) + qrounding_factor_fp = 64 - sharpness_adjustment; // y quant = i == 0 ? 
vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
: vp9_ac_quant(q, 0, cm->bit_depth);
@@ -282,12 +288,12 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
// Y
x->plane[0].quant = quants->y_quant[qindex];
x->plane[0].quant_fp = quants->y_quant_fp[qindex];
- x->plane[0].round_fp = quants->y_round_fp[qindex];
+ memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex],
+ 8 * sizeof(*(x->plane[0].round_fp)));
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
xd->plane[0].dequant = cpi->y_dequant[qindex];
-
x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];

@@ -295,12 +301,12 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
for (i = 1; i < 3; i++) {
x->plane[i].quant = quants->uv_quant[qindex];
x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
- x->plane[i].round_fp = quants->uv_round_fp[qindex];
+ memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex],
+ 8 * sizeof(*(x->plane[i].round_fp)));
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
xd->plane[i].dequant = cpi->uv_dequant[qindex];
-
x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
}
diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.h b/libs/libvpx/vp9/encoder/vp9_quantize.h
index 61320361b6..ed9b849584 100644
--- a/libs/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libs/libvpx/vp9/encoder/vp9_quantize.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/

-#ifndef VP9_ENCODER_VP9_QUANTIZE_H_
-#define VP9_ENCODER_VP9_QUANTIZE_H_
+#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_
+#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_

#include "./vpx_config.h"
#include "vp9/encoder/vp9_block.h"
@@ -59,4 +59,4 @@ int vp9_qindex_to_quantizer(int qindex);
} // extern "C"
#endif

-#endif // VP9_ENCODER_VP9_QUANTIZE_H_
+#endif // VPX_VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.c b/libs/libvpx/vp9/encoder/vp9_ratectrl.c
index b7f3a0e897..6745b0adfc 100644
--- a/libs/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -31,10 +31,13 @@
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_ratectrl.h"

-// Max rate target for 1080P and below encodes under normal circumstances
-// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+// Max rate per frame for 1080P and below encodes if no level requirement given.
+// For larger formats limit to MAX_MB_RATE bits per MB.
+// 4Mbits is derived from the level requirement for level 4 (1080P 30) which
+// requires that HW can sustain a rate of 16Mbits over a 4 frame group.
+// If a lower level requirement is specified then this may override this value.
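Two of the constants above are easy to sanity-check. The new MAXRATE_1080P of 4000000 below is just the level 4 budget from the comment: 16 Mbits sustained over a 4 frame group gives 16000000 / 4 = 4000000 bits per frame. And the sharpness handling added to vp9_init_quantizer() maps sharpness 1..7 onto a shrinking zero bin and a growing rounding factor; a throwaway sketch (not part of the patch) that prints the mapping:

#include <stdio.h>

int main(void) {
  int sharpness;
  for (sharpness = 1; sharpness <= 7; ++sharpness) {
    /* Same arithmetic as the patch: 16 * (7 - sharpness) / 7, applied
     * only when sharpness > 0 and q > 0. */
    const int adj = 16 * (7 - sharpness) / 7;
    printf("sharpness=%d qzbin_factor=%d qrounding_factor=%d\n", sharpness,
           64 + adj, 64 - adj);
  }
  return 0;
}

At sharpness 1 the pair is 77/51; by sharpness 7 both factors converge on 64, i.e. the smallest dead zone and the most generous rounding.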
#define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 +#define MAXRATE_1080P 4000000 #define DEFAULT_KF_BOOST 2000 #define DEFAULT_GF_BOOST 2000 @@ -45,18 +48,16 @@ #define MAX_BPB_FACTOR 50 #if CONFIG_VP9_HIGHBITDEPTH -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - switch (bit_depth) { \ - case VPX_BITS_8: name = name##_8; break; \ - case VPX_BITS_10: name = name##_10; break; \ - case VPX_BITS_12: name = name##_12; break; \ - default: \ - assert(0 && \ - "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ - " or VPX_BITS_12"); \ - name = NULL; \ - } \ +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ } while (0) #else #define ASSIGN_MINQ_TABLE(bit_depth, name) \ @@ -97,8 +98,8 @@ static int kf_low = 400; #else static int gf_high = 2000; static int gf_low = 400; -static int kf_high = 5000; -static int kf_low = 400; +static int kf_high = 4800; +static int kf_low = 300; #endif // Functions to compute the active minq lookup table entries based on a @@ -128,7 +129,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = vp9_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); - kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); #ifdef AGGRESSIVE_VBR arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); @@ -164,10 +165,9 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; - case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1.0; + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; } #else return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; @@ -211,17 +211,15 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - if (cpi->oxcf.pass != 2) { - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - target = min_frame_target; - } + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. 
+ target = min_frame_target;
}

// Clip the frame target to the maximum allowed value.
@@ -247,20 +245,68 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
return target;
}

+// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same
+// way for CBR mode, for the buffering updates below. Look into removing one
+// of these (i.e., bits_off_target).
+// Update the buffer level before encoding with the per-frame-bandwidth.
+static void update_buffer_level_preencode(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->bits_off_target += rc->avg_frame_bandwidth;
+ // Clip the buffer level to the maximum specified buffer size.
+ rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = rc->bits_off_target;
+}
+
+// Update the buffer level before encoding with the per-frame-bandwidth
+// for SVC. The current and all upper temporal layers are updated, needed
+// for the layered rate control which involves cumulative buffer levels for
+// the temporal layers. Allow for using the timestamp(pts) delta for the
+// framerate when the set_ref_frame_config is used.
+static void update_buffer_level_svc_preencode(VP9_COMP *cpi) {
+ SVC *const svc = &cpi->svc;
+ int i;
+ // Set this to 1 to use timestamp delta for "framerate" under
+ // ref_frame_config usage.
+ int use_timestamp = 1;
+ const int64_t ts_delta =
+ svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id];
+ for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) {
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ if (use_timestamp && cpi->svc.use_set_ref_frame_config &&
+ svc->number_temporal_layers == 1 && ts_delta > 0 &&
+ svc->current_superframe > 0) {
+ // TODO(marpan): This may need to be modified for temporal layers.
+ const double framerate_pts = 10000000.0 / ts_delta;
+ lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts);
+ } else {
+ lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
+ }
+ // Clip buffer level to maximum buffer size for the layer.
+ lrc->bits_off_target =
+ VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+ lrc->buffer_level = lrc->bits_off_target;
+ if (i == svc->temporal_layer_id) {
+ cpi->rc.bits_off_target = lrc->bits_off_target;
+ cpi->rc.buffer_level = lrc->buffer_level;
+ }
+ }
+}
+
// Update the buffer level for higher temporal layers, given the encoded current
// temporal layer.
-static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
+static void update_layer_buffer_level_postencode(SVC *svc,
+ int encoded_frame_size) {
int i = 0;
- int current_temporal_layer = svc->temporal_layer_id;
+ const int current_temporal_layer = svc->temporal_layer_id;
for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) {
const int layer =
LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
LAYER_CONTEXT *lc = &svc->layer_context[layer];
RATE_CONTROL *lrc = &lc->rc;
- int bits_off_for_this_layer =
- (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size);
- lrc->bits_off_target += bits_off_for_this_layer;
-
+ lrc->bits_off_target -= encoded_frame_size;
// Clip buffer level to maximum buffer size for the layer.
lrc->bits_off_target =
VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
@@ -268,21 +314,13 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
}
}

-// Update the buffer level: leaky bucket model.
-static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
- const VP9_COMMON *const cm = &cpi->common;
+// Update the buffer level after encoding with encoded frame size.
+static void update_buffer_level_postencode(VP9_COMP *cpi,
+ int encoded_frame_size) {
RATE_CONTROL *const rc = &cpi->rc;
-
- // Non-viewable frames are a special case and are treated as pure overhead.
- if (!cm->show_frame) {
- rc->bits_off_target -= encoded_frame_size;
- } else {
- rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
- }
-
+ rc->bits_off_target -= encoded_frame_size;
// Clip the buffer level to the maximum specified buffer size.
rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
-
// For screen-content mode, and if frame-dropper is off, don't let buffer
// level go below threshold, given here as -rc->maximum_ buffer_size.
if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
@@ -292,7 +330,7 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
rc->buffer_level = rc->bits_off_target;

if (is_one_pass_cbr_svc(cpi)) {
- update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+ update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size);
}
}

@@ -355,6 +393,9 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
rc->high_source_sad = 0;
rc->reset_high_source_sad = 0;
rc->high_source_sad_lagindex = -1;
+ rc->high_num_blocks_with_motion = 0;
+ rc->hybrid_intra_scene_change = 0;
+ rc->re_encode_maxq_scene_change = 0;
rc->alt_ref_gf_group = 0;
rc->last_frame_is_src_altref = 0;
rc->fac_active_worst_inter = 150;
@@ -377,6 +418,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {

for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
rc->rate_correction_factors[i] = 1.0;
+ rc->damped_adjustment[i] = 0;
}

rc->min_gf_interval = oxcf->min_gf_interval;
@@ -388,27 +430,115 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
oxcf->init_framerate, rc->min_gf_interval);
rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+
+ rc->force_max_q = 0;
+ rc->last_post_encode_dropped_scene_change = 0;
+ rc->use_post_encode_drop = 0;
+ rc->ext_use_post_encode_drop = 0;
+ rc->arf_active_best_quality_adjustment_factor = 1.0;
+
+ rc->preserve_arf_as_gld = 0;
+ rc->preserve_next_arf_as_gld = 0;
+ rc->show_arf_as_gld = 0;
}

-int vp9_rc_drop_frame(VP9_COMP *cpi) {
+static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) {
+ SVC *svc = &cpi->svc;
+ if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ return (rc->buffer_level > drop_mark);
+ } else {
+ int i;
+ // For SVC in the FULL_SUPERFRAME_DROP mode: the condition on
+ // buffer (if it is above threshold, so no drop) is checked on current and
+ // upper spatial layers. If any spatial layer is not above threshold then
+ // we return 0.
+ for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ // Exclude check for layer whose bitrate is 0.
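The pre-encode/post-encode split above replaces the old single update_buffer_level() with a plain leaky bucket: fill by the per-frame bandwidth before encoding, drain by the actual encoded size afterwards, clipping to the configured buffer size on both sides. A minimal self-contained sketch (assumed field names, not the patch's structs):

#include <stdio.h>

typedef struct {
  long long bits_off_target; /* current bucket fullness */
  long long maximum_buffer_size;
  long long avg_frame_bandwidth;
} leaky_bucket;

static long long min_ll(long long a, long long b) { return a < b ? a : b; }

/* Mirrors update_buffer_level_preencode(): credit one frame's budget. */
static void bucket_fill(leaky_bucket *b) {
  b->bits_off_target = min_ll(b->bits_off_target + b->avg_frame_bandwidth,
                              b->maximum_buffer_size);
}

/* Mirrors update_buffer_level_postencode(): debit the coded size. */
static void bucket_drain(leaky_bucket *b, long long encoded_frame_size) {
  b->bits_off_target =
      min_ll(b->bits_off_target - encoded_frame_size, b->maximum_buffer_size);
}

int main(void) {
  leaky_bucket b = { 0, 800000, 100000 }; /* tight 8-frame buffer */
  bucket_fill(&b);
  bucket_drain(&b, 250000); /* frame came in 2.5x over budget */
  printf("bits_off_target=%lld\n", b.bits_off_target); /* -150000 */
  return 0;
}

A bucket that stays negative is the underflow condition the framedrop logic below keys off.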
+ if (lc->target_bandwidth > 0) {
+ const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] *
+ lrc->optimal_buffer_level / 100);
+ if (!(lrc->buffer_level > drop_mark_layer)) return 0;
+ }
+ }
+ return 1;
+ }
+}
+
+static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) {
+ SVC *svc = &cpi->svc;
+ if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ return (rc->buffer_level <= drop_mark);
+ } else {
+ int i;
+ // For SVC in the constrained framedrop mode (svc->framedrop_mode =
+ // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on
+ // buffer (if it is below threshold, so drop frame) is checked on current
+ // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any
+ // spatial layer is <= threshold, then we return 1 (drop).
+ for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ // Exclude check for layer whose bitrate is 0.
+ if (lc->target_bandwidth > 0) {
+ const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] *
+ lrc->optimal_buffer_level / 100);
+ if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) {
+ if (lrc->buffer_level <= drop_mark_layer) return 1;
+ } else {
+ if (!(lrc->buffer_level <= drop_mark_layer)) return 0;
+ }
+ }
+ }
+ if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP)
+ return 0;
+ else
+ return 1;
+ }
+}
+
+static int drop_frame(VP9_COMP *cpi) {
const VP9EncoderConfig *oxcf = &cpi->oxcf;
RATE_CONTROL *const rc = &cpi->rc;
- if (!oxcf->drop_frames_water_mark ||
- (is_one_pass_cbr_svc(cpi) &&
- cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)) {
+ SVC *svc = &cpi->svc;
+ int drop_frames_water_mark = oxcf->drop_frames_water_mark;
+ if (cpi->use_svc) {
+ // If we have dropped max_consec_drop frames, then we don't
+ // drop this spatial layer, and reset counter to 0.
+ if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) {
+ svc->drop_count[svc->spatial_layer_id] = 0;
+ return 0;
+ } else {
+ drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id];
+ }
+ }
+ if (!drop_frames_water_mark ||
+ (svc->spatial_layer_id > 0 &&
+ svc->framedrop_mode == FULL_SUPERFRAME_DROP)) {
return 0;
} else {
- if (rc->buffer_level < 0) {
+ if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) ||
+ (check_buffer_below_thresh(cpi, -1) &&
+ svc->framedrop_mode == FULL_SUPERFRAME_DROP)) {
// Always drop if buffer is below 0.
return 1;
} else {
// If buffer is below drop_mark, for now just drop every other frame
// (starting with the next frame) until it increases back over drop_mark.
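The two helpers above encode slightly different quantifiers, which is easy to miss in the diff: check_buffer_above_thresh() requires every spatial layer from the current one upward to clear its mark before it reports "no drop", while check_buffer_below_thresh() in FULL_SUPERFRAME_DROP mode reports "drop" as soon as any one layer is at or below its mark. A condensed sketch of just that logic (made-up arrays, not the patch's types):

#include <stdio.h>

/* Returns 1 only if all layers clear their drop mark. */
static int all_layers_above(const long long *level, const int *mark, int n) {
  int i;
  for (i = 0; i < n; ++i)
    if (!(level[i] > mark[i])) return 0;
  return 1;
}

/* FULL_SUPERFRAME_DROP: returns 1 if any layer hits its drop mark. */
static int any_layer_below(const long long *level, const int *mark, int n) {
  int i;
  for (i = 0; i < n; ++i)
    if (level[i] <= mark[i]) return 1;
  return 0;
}

int main(void) {
  const long long level[2] = { 90000, 40000 };
  const int mark[2] = { 50000, 50000 };
  printf("%d %d\n", all_layers_above(level, mark, 2),
         any_layer_below(level, mark, 2)); /* prints "0 1" */
  return 0;
}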
int drop_mark =
- (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
- if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+ (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100);
+ if (check_buffer_above_thresh(cpi, drop_mark) &&
+ (rc->decimation_factor > 0)) {
--rc->decimation_factor;
- } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+ } else if (check_buffer_below_thresh(cpi, drop_mark) &&
+ rc->decimation_factor == 0) {
rc->decimation_factor = 1;
}
if (rc->decimation_factor > 0) {
@@ -427,11 +557,129 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) {
}
}

+int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) {
+ size_t frame_size = *size << 3;
+ int64_t new_buffer_level =
+ cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size;
+
+ // For now we drop if new buffer level (given the encoded frame size) goes
+ // below 0.
+ if (new_buffer_level < 0) {
+ *size = 0;
+ vp9_rc_postencode_update_drop_frame(cpi);
+ // Update flag to use for next frame.
+ if (cpi->rc.high_source_sad ||
+ (cpi->use_svc && cpi->svc.high_source_sad_superframe))
+ cpi->rc.last_post_encode_dropped_scene_change = 1;
+ // Force max_q on next frame.
+ cpi->rc.force_max_q = 1;
+ cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+ cpi->last_frame_dropped = 1;
+ cpi->ext_refresh_frame_flags_pending = 0;
+ if (cpi->use_svc) {
+ SVC *svc = &cpi->svc;
+ int sl = 0;
+ int tl = 0;
+ svc->last_layer_dropped[svc->spatial_layer_id] = 1;
+ svc->drop_spatial_layer[svc->spatial_layer_id] = 1;
+ svc->drop_count[svc->spatial_layer_id]++;
+ svc->skip_enhancement_layer = 1;
+ // Postencode drop is only checked on base spatial layer,
+ // for now if max-q is set on base we force it on all layers.
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ const int layer =
+ LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->force_max_q = 1;
+ lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+ }
+ }
+ }
+ return 1;
+ }
+
+ cpi->rc.force_max_q = 0;
+ cpi->rc.last_post_encode_dropped_scene_change = 0;
+ return 0;
+}
+
+int vp9_rc_drop_frame(VP9_COMP *cpi) {
+ SVC *svc = &cpi->svc;
+ int svc_prev_layer_dropped = 0;
+ // In the constrained or full_superframe framedrop mode for svc
+ // (framedrop_mode != LAYER_DROP), if the previous spatial layer was
+ // dropped, drop the current spatial layer.
+ if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+ svc->drop_spatial_layer[svc->spatial_layer_id - 1])
+ svc_prev_layer_dropped = 1;
+ if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP) ||
+ drop_frame(cpi)) {
+ vp9_rc_postencode_update_drop_frame(cpi);
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->last_frame_dropped = 1;
+ if (cpi->use_svc) {
+ svc->last_layer_dropped[svc->spatial_layer_id] = 1;
+ svc->drop_spatial_layer[svc->spatial_layer_id] = 1;
+ svc->drop_count[svc->spatial_layer_id]++;
+ svc->skip_enhancement_layer = 1;
+ if (svc->framedrop_mode == LAYER_DROP ||
+ svc->drop_spatial_layer[0] == 0) {
+ // For the case of constrained drop mode where the base is dropped
+ // (drop_spatial_layer[0] == 1), which means full superframe dropped,
+ // we don't increment the svc frame counters.
In particular temporal
+ // layer counter (which is incremented in vp9_inc_frame_in_layer())
+ // won't be incremented, so on a dropped frame we try the same
+ // temporal_layer_id on next incoming frame. This is to avoid an
+ // issue with temporal alignment with full superframe dropping.
+ vp9_inc_frame_in_layer(cpi);
+ }
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ int i;
+ int all_layers_drop = 1;
+ for (i = 0; i < svc->spatial_layer_id; i++) {
+ if (svc->drop_spatial_layer[i] == 0) {
+ all_layers_drop = 0;
+ break;
+ }
+ }
+ if (all_layers_drop == 1) svc->skip_enhancement_layer = 0;
+ }
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int adjust_q_cbr(const VP9_COMP *cpi, int q) {
+ // This makes sure q is between oscillating Qs to prevent resonance.
+ if (!cpi->rc.reset_high_source_sad &&
+ (!cpi->oxcf.gf_cbr_boost_pct ||
+ !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
+ (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+ cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+ int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+ VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+ // If the previous frame had overshoot and the current q needs to increase
+ // above the clamped value, reduce the clamp for faster reaction to
+ // overshoot.
+ if (cpi->rc.rc_1_frame == -1 && q > qclamp)
+ q = (q + qclamp) >> 1;
+ else
+ q = qclamp;
+ }
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+ vp9_cyclic_refresh_limit_q(cpi, &q);
+ return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
+
static double get_rate_correction_factor(const VP9_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_COMMON *const cm = &cpi->common;
double rcf;

- if (cpi->common.frame_type == KEY_FRAME) {
+ if (frame_is_intra_only(cm)) {
rcf = rc->rate_correction_factors[KF_STD];
} else if (cpi->oxcf.pass == 2) {
RATE_FACTOR_LEVEL rf_lvl =
@@ -451,13 +699,14 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) {

static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_COMMON *const cm = &cpi->common;

// Normalize RCF to account for the size-dependent scaling factor.
factor /= rcf_mult[cpi->rc.frame_size_selector];

factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);

- if (cpi->common.frame_type == KEY_FRAME) {
+ if (frame_is_intra_only(cm)) {
rc->rate_correction_factors[KF_STD] = factor;
} else if (cpi->oxcf.pass == 2) {
RATE_FACTOR_LEVEL rf_lvl =
@@ -478,6 +727,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
int correction_factor = 100;
double rate_correction_factor = get_rate_correction_factor(cpi);
double adjustment_limit;
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];

int projected_size_based_on_q = 0;

@@ -494,8 +745,9 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
projected_size_based_on_q =
vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
} else {
+ FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
projected_size_based_on_q =
- vp9_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs,
+ vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs,
rate_correction_factor, cm->bit_depth);
}
// Work out a size correction factor.
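The adjust_q_cbr() helper factored out above keeps CBR q from ringing: when the last two frames corrected rate in opposite directions (rc_1_frame * rc_2_frame == -1), the new q is clamped between the last two frame q values, with a half step allowed upward after overshoot. A worked example with made-up numbers (not the patch's data):

#include <stdio.h>

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

int main(void) {
  const int q_1_frame = 40, q_2_frame = 48; /* last two frame q values */
  const int rc_1_frame = -1, rc_2_frame = 1; /* opposite corrections */
  int q = 60; /* candidate from the regulate-q loop */
  if (rc_1_frame * rc_2_frame == -1 && q_1_frame != q_2_frame) {
    const int qclamp = clamp_int(q, 40, 48);
    /* Previous frame overshot and q wants to rise past the clamp, so
     * only go halfway instead of clamping all the way down. */
    q = (rc_1_frame == -1 && q > qclamp) ? (q + qclamp) >> 1 : qclamp;
  }
  printf("q=%d\n", q); /* prints q=54 instead of a hard clamp to 48 */
  return 0;
}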
@@ -503,10 +755,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / projected_size_based_on_q); - // More heavily damped adjustment used if we have been oscillating either side - // of target. - adjustment_limit = - 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + // Do not use damped adjustment for the first frame of each frame type + if (!cpi->rc.damped_adjustment[rf_lvl]) { + adjustment_limit = 1.0; + cpi->rc.damped_adjustment[rf_lvl] = 1; + } else { + // More heavily damped adjustment used if we have been oscillating either + // side of target. + adjustment_limit = + 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + } cpi->rc.q_2_frame = cpi->rc.q_1_frame; cpi->rc.q_1_frame = cm->base_qindex; @@ -569,8 +827,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb( - cm->frame_type, i, correction_factor, cm->bit_depth); + frame_type, i, correction_factor, cm->bit_depth); } if (bits_per_mb_at_this_q <= target_bits_per_mb) { @@ -585,16 +844,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, } } while (++i <= active_worst_quality); - // In CBR mode, this makes sure q is between oscillating Qs to prevent - // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && - (!cpi->oxcf.gf_cbr_boost_pct || - !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && - (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && - cpi->rc.q_1_frame != cpi->rc.q_2_frame) { - q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), - VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); - } + // Adjustment to q for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q); + return q; } @@ -623,13 +875,19 @@ static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, +static int get_gf_active_quality(const VP9_COMP *const cpi, int q, vpx_bit_depth_t bit_depth) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_CONTROL *const rc = &cpi->rc; + int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : rc->gfu_boost; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); - return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } @@ -674,7 +932,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad) + if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q) return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized @@ -685,6 +943,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { ? 
VPXMIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2); // For SVC if the current base spatial layer was key frame, use the QP from // that base layer for ambient_qp. if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { @@ -694,13 +953,15 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { if (lc->is_key_frame) { const RATE_CONTROL *lrc = &lc->rc; ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]); + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3); } } - active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. - // Maximum limit for down adjustment, ~30%. + // Maximum limit for down adjustment ~30%; make it lower for screen content. int max_adjustment_down = active_worst_quality / 3; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + max_adjustment_down = active_worst_quality >> 3; if (max_adjustment_down) { buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / max_adjustment_down); @@ -769,6 +1030,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); } } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + cpi->oxcf.gf_cbr_boost_pct && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was @@ -779,7 +1041,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, } else { q = active_worst_quality; } - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) { @@ -804,21 +1066,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, *top_index = active_worst_quality; *bottom_index = active_best_quality; -#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY - // Limit Q range for the adaptive loop. - if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && - !(cm->current_video_frame == 0)) { - int qdelta = 0; - vpx_clear_system_state(); - qdelta = vp9_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); - *top_index = active_worst_quality + qdelta; - *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; - } -#endif - // Special case code to try and match quality with forced key frames - if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -831,6 +1080,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, q = *top_index; } } + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && *bottom_index >= rc->best_quality); @@ -939,7 +1189,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16;

@@ -954,7 +1204,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
}
} else {
if (oxcf->rc_mode == VPX_Q) {
@@ -1045,19 +1295,122 @@ int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
1.75, // GF_ARF_STD
2.00, // KF_STD
};
- static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
- INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
- };
const VP9_COMMON *const cm = &cpi->common;
- int qdelta =
- vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
- rate_factor_deltas[rf_level], cm->bit_depth);
+
+ int qdelta = vp9_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth);
return qdelta;
}

#define STATIC_MOTION_THRESH 95
+
+static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = cpi->twopass.active_worst_quality;
+
+ if (rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, cm->bit_depth);
+ active_worst_quality =
+ VPXMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = rc->last_boosted_qindex;
+ last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.75, cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+ // Baseline value derived from cpi->active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+ if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 4;
+ }
+
+ // Don't allow the active min to be lossless (q0) unless the max q
+ // already indicates lossless.
+ active_best_quality =
+ VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality));
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
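For the damped-adjustment change in vp9_rc_update_rate_correction_factors() above: the first frame at each rate-factor level now gets an undamped limit of 1.0, and every later frame falls back to 0.25 + 0.5 * min(1, |log10(correction_factor / 100)|). A worked example (values invented): a frame that comes in at twice its projected size has correction_factor == 200, so the step is capped near 0.40.

#include <math.h>
#include <stdio.h>

int main(void) {
  const int correction_factor = 200; /* projected size was 2x target */
  const double raw = fabs(log10(0.01 * correction_factor));
  const double adjustment_limit = 0.25 + 0.5 * (raw < 1.0 ? raw : 1.0);
  printf("adjustment_limit=%.3f\n", adjustment_limit); /* ~0.401 */
  return 0;
}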
+ q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + *top_index = active_worst_quality; + *bottom_index = active_best_quality; +} + +static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index, + int gf_group_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int is_intra_frame = frame_is_intra_only(cm); + + const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); + + int q = cq_level; + int active_best_quality = cq_level; + int active_worst_quality = cq_level; + + // Key frame qp decision + if (is_intra_frame && rc->frames_to_key > 1) + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + + // ARF / GF qp decision + if (!is_intra_frame && !rc->is_src_frame_alt_ref && + cpi->refresh_alt_ref_frame) { + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = ((layer_depth - 1) * cq_level + + active_best_quality + layer_depth / 2) / + layer_depth; + } + } + + q = active_best_quality; + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + return q; +} + static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, - int *top_index) { + int *top_index, int gf_group_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -1067,56 +1420,20 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, int active_worst_quality = cpi->twopass.active_worst_quality; int q; int *inter_minq; + int arf_active_best_quality_adjustment, arf_active_best_quality_max; + int *arfgf_high_motion_minq; + const int boost_frame = + !rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); - if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) { - // Handle the special case for key frames forced when we have reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping. 
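The layer-depth interpolation introduced in rc_constant_q() above (and reused for GF_ARF_LOW frames further down) is a rounded weighted average that pulls the second-level ARF q toward cq_level as the layer gets deeper. A worked example with invented numbers:

#include <stdio.h>

/* ((depth - 1) * cq + abq + depth / 2) / depth, as in the patch. */
static int layer_fit(int layer_depth, int cq_level, int active_best) {
  return ((layer_depth - 1) * cq_level + active_best + layer_depth / 2) /
         layer_depth;
}

int main(void) {
  /* cq_level 40, base-ARF active_best_quality 20: depth 2 lands halfway
   * (30), depth 4 lands three quarters of the way toward cq_level (35). */
  printf("%d %d\n", layer_fit(2, 40, 20), layer_fit(4, 40, 20));
  return 0;
}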
- if (rc->this_key_frame_forced) { - double last_boosted_q; - int delta_qindex; - int qindex; + if (oxcf->rc_mode == VPX_Q) + return rc_constant_q(cpi, bottom_index, top_index, gf_group_index); - if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); - active_best_quality = qindex; - last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 1.25, cm->bit_depth); - active_worst_quality = - VPXMIN(qindex + delta_qindex, active_worst_quality); - } else { - qindex = rc->last_boosted_qindex; - last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75, cm->bit_depth); - active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); - } - } else { - // Not forced keyframe. - double q_adj_factor = 1.0; - double q_val; - // Baseline value derived from cpi->active_worst_quality and kf boost. - active_best_quality = - get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); - - // Allow somewhat lower kf minq with small image formats. - if ((cm->width * cm->height) <= (352 * 288)) { - q_adj_factor -= 0.25; - } - - // Make a further adjustment based on the kf zero motion measure. - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); - - // Convert the adjustment factor to a qindex delta - // on active_best_quality. - q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); - active_best_quality += - vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); - } - } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + if (frame_is_intra_only(cm)) { + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + } else if (boost_frame) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -1129,63 +1446,59 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, // For constrained quality dont allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; + } + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_high_motion_minq); + arf_active_best_quality_max = arfgf_high_motion_minq[q]; + arf_active_best_quality_adjustment = + arf_active_best_quality_max - active_best_quality; + active_best_quality = arf_active_best_quality_max - + (int)(arf_active_best_quality_adjustment * + rc->arf_active_best_quality_adjustment_factor); - // Constrained quality use slightly lower active best. - active_best_quality = active_best_quality * 15 / 16; - - } else if (oxcf->rc_mode == VPX_Q) { - if (!cpi->refresh_alt_ref_frame) { - active_best_quality = cq_level; - } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); - - // Modify best quality for second level arfs. For mode VPX_Q this - // becomes the baseline frame q. - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) - active_best_quality = (active_best_quality + cq_level + 1) / 2; - } - } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. 
+ if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = + ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) / + layer_depth; } } else { - if (oxcf->rc_mode == VPX_Q) { - active_best_quality = cq_level; - } else { - active_best_quality = inter_minq[active_worst_quality]; + active_best_quality = inter_minq[active_worst_quality]; - // For the constrained quality mode we don't want - // q to fall below the cq level. - if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { - active_best_quality = cq_level; - } + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; } } // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. - if (cpi->oxcf.rc_mode != VPX_Q) { - if (frame_is_intra_only(cm) || - (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); - active_worst_quality += (cpi->twopass.extend_maxq / 2); - } else { - active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; - active_worst_quality += cpi->twopass.extend_maxq; - } + if (frame_is_intra_only(cm) || boost_frame) { + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); + active_worst_quality += (cpi->twopass.extend_maxq / 2); + } else { + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; + active_worst_quality += cpi->twopass.extend_maxq; + + // For normal frames do not allow an active minq lower than the q used for + // the last boosted frame. + active_best_quality = VPXMAX(active_best_quality, rc->last_boosted_qindex); } #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY vpx_clear_system_state(); // Static forced key frames Q restrictions dealt with elsewhere. - if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) || - !rc->this_key_frame_forced || - (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { - int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index], + if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced || + cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) { + int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index], active_worst_quality); active_worst_quality = VPXMAX(active_worst_quality + qdelta, active_best_quality); @@ -1205,17 +1518,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, active_worst_quality = clamp(active_worst_quality, active_best_quality, rc->worst_quality); - if (oxcf->rc_mode == VPX_Q) { - q = active_best_quality; - // Special case code to try and match quality with forced key frames. - } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) && - rc->this_key_frame_forced) { + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { // If static since last kf use better of last boosted and last kf q. 
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); } else { q = rc->last_boosted_qindex; } + } else if (frame_is_intra_only(cm) && !rc->this_key_frame_forced) { + q = active_best_quality; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality); @@ -1242,13 +1553,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, int *top_index) { int q; + const int gf_group_index = cpi->twopass.gf_group.index; if (cpi->oxcf.pass == 0) { if (cpi->oxcf.rc_mode == VPX_CBR) q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); else q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); } else { - q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, + gf_group_index); } if (cpi->sf.use_nonrd_pick_mode) { if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; @@ -1261,6 +1574,89 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, return q; } +void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { + VP9_COMMON *cm = &cpi->common; + TWO_PASS *const twopass = &cpi->twopass; + + cpi->rc.is_src_frame_alt_ref = 0; + cm->show_existing_frame = 0; + cpi->rc.show_arf_as_gld = 0; + switch (twopass->gf_group.update_type[gf_group_index]) { + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + case GF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + if (cpi->rc.preserve_arf_as_gld) { + cpi->rc.show_arf_as_gld = 1; + cpi->refresh_golden_frame = 0; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + } + break; + case MID_OVERLAY_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case USE_BUF_FRAME: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + break; + default: + assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE); + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + } +} + +void vp9_estimate_qp_gop(VP9_COMP *cpi) { + int gop_length = cpi->twopass.gf_group.gf_group_size; + int bottom_index, top_index; + int idx; + const int gf_index = cpi->twopass.gf_group.index; + const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; + const int refresh_frame_context = cpi->common.refresh_frame_context; + + for (idx = 1; idx <= gop_length; ++idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; + int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; + cpi->twopass.gf_group.index = idx; + vp9_rc_set_frame_target(cpi, target_rate); + vp9_configure_buffer_updates(cpi, idx); + tpl_frame->base_qindex = + rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx); + tpl_frame->base_qindex = 
VPXMAX(tpl_frame->base_qindex, 1); + } + // Reset the actual index and frame update + cpi->twopass.gf_group.index = gf_index; + cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; + cpi->common.refresh_frame_context = refresh_frame_context; + vp9_configure_buffer_updates(cpi, gf_index); +} + void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { @@ -1333,6 +1729,15 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; rc->frames_since_golden++; + + if (rc->show_arf_as_gld) { + rc->frames_since_golden = 0; + // If we are not using alt ref in the up and coming group clear the arf + // active flag. In multi arf group case, if the index is not 0 then + // we are overlaying a mid group arf so should not reset the flag. + if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0)) + rc->source_alt_ref_active = 0; + } } } @@ -1367,7 +1772,8 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) { int cnt_zeromv = 0; for (mi_row = 0; mi_row < rows; mi_row++) { for (mi_col = 0; mi_col < cols; mi_col++) { - if (abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) + if (mi[0]->ref_frame[0] == LAST_FRAME && + abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) cnt_zeromv++; mi++; } @@ -1381,6 +1787,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; const int qindex = cm->base_qindex; // Update rate control heuristics @@ -1390,7 +1797,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { vp9_rc_update_rate_correction_factors(cpi); // Keep a record of last Q and ambient average Q. - if (cm->frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rc->last_q[KEY_FRAME] = qindex; rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); @@ -1423,6 +1830,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } + if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi); + // Keep record of last boosted (KF/KF/ARF) Q value. // If the current frame is coded at a lower Q then we also update it. // If all mbs in this group are skipped only update if the Q value is @@ -1434,13 +1843,13 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } - if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex; + if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex; - update_buffer_level(cpi, rc->projected_frame_size); + update_buffer_level_postencode(cpi, rc->projected_frame_size); // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. 
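One unit note on the compute_frame_low_motion() tweak above: VP9 motion vectors are stored in eighth-pel units, so the existing |mv| < 16 test counts blocks that moved less than two integer pixels, and the patch additionally requires that they predict from LAST_FRAME. A trivial check of the unit conversion (not from the patch):

#include <assert.h>

int main(void) {
  const int mv_eighth_pel = 15; /* just inside the zero-motion bucket */
  assert(mv_eighth_pel / 8 < 2); /* under two full pixels */
  return 0;
}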
- if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { rc->rolling_target_bits = ROUND_POWER_OF_TWO( rc->rolling_target_bits * 3 + rc->this_frame_target, 2); rc->rolling_actual_bits = ROUND_POWER_OF_TWO( @@ -1457,9 +1866,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; - if (!cpi->use_svc || is_two_pass_svc(cpi)) { + if (!cpi->use_svc) { if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && - (cm->frame_type != KEY_FRAME)) + (!frame_is_intra_only(cm))) // Update the alternate reference frame stats as appropriate. update_alt_ref_frame_stats(cpi); else @@ -1467,7 +1876,28 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { update_golden_frame_stats(cpi); } - if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; + // If second (long term) temporal reference is used for SVC, + // update the golden frame counter, only for base temporal layer. + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id == 0) { + int i = 0; + if (cpi->refresh_golden_frame) + rc->frames_since_golden = 0; + else + rc->frames_since_golden++; + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + // Update the frames_since_golden for all upper temporal layers. + for (i = 1; i < svc->number_temporal_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->frames_since_golden = rc->frames_since_golden; + } + } + + if (frame_is_intra_only(cm)) rc->frames_since_key = 0; if (cm->show_frame) { rc->frames_since_key++; rc->frames_to_key--; @@ -1481,24 +1911,53 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm) && + (!cpi->use_svc || + (cpi->use_svc && + !svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc->spatial_layer_id == svc->number_spatial_layers - 1))) { compute_frame_low_motion(cpi); if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); } + // For SVC: set avg_frame_low_motion (only computed on top spatial layer) + // to all lower spatial layers. + if (cpi->use_svc && + svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + for (i = 0; i < svc->number_spatial_layers - 1; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_low_motion = rc->avg_frame_low_motion; + } + } cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } - if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; + if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0; rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; + if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) + svc->lower_layer_qindex = cm->base_qindex; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { - // Update buffer level with zero size, update frame counters, and return. 
- update_buffer_level(cpi, 0); + cpi->common.current_video_frame++; cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; + cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; + // For SVC on dropped frame when framedrop_mode != LAYER_DROP: + // in this mode the whole superframe may be dropped if only a single layer + // has buffer underflow (below threshold). Since this can then lead to + // increasing buffer levels/overflow for certain layers even though the + // whole superframe is dropped, we cap buffer level if it's already stable. + if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP && + cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) { + cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; + cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; + } } static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { @@ -1544,10 +2003,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; - // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + rc->frames_to_key == 0)) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; @@ -1582,9 +2040,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). rc->gfu_boost = - VPXMAX(500, - DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -1684,30 +2141,80 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { return vp9_rc_clamp_iframe_target_size(cpi, target); } +static void set_intra_only_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + // Don't allow intra_only frame for bypass/flexible SVC mode, or if number + // of spatial layers is 1 or if number of spatial or temporal layers > 3. + // Also if intra-only is inserted on very first frame, don't allow if + // number of temporal layers > 1. This is because on intra-only frame + // only 3 reference buffers can be updated, but for temporal layers > 1 + // we generally need to use buffer slots 4 and 5. + if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) || + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS || + svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 || + svc->number_spatial_layers == 1) + return; + cm->show_frame = 0; + cm->intra_only = 1; + cm->frame_type = INTER_FRAME; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (cm->current_video_frame == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; + } else { + int i; + int count = 0; + cpi->lst_fb_idx = -1; + cpi->gld_fb_idx = -1; + cpi->alt_fb_idx = -1; + // For intra-only frame we need to refresh all slots that were + // being used for the base layer (fb_idx_base[i] == 1).
+ // Start with assigning last first, then golden and then alt. + for (i = 0; i < REF_FRAMES; ++i) { + if (svc->fb_idx_base[i] == 1) count++; + if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i; + if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i; + if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i; + } + // If golden or alt is not being used for base layer, then set them + // to the lst_fb_idx. + if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx; + if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx; + } +} + void vp9_rc_get_svc_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; int target = rc->avg_frame_bandwidth; - int layer = - LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); + int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + if (svc->first_spatial_layer_to_encode) + svc->layer_context[svc->temporal_layer_id].is_key_frame = 0; // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. - if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + // Key frame is set for any of the following: very first frame, frame flags + // indicate key, superframe counter hits key frequency, or (non-intra) sync + // flag is set for spatial layer 0. + if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && - (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) && - cpi->svc.spatial_layer_id == 0)) { + (svc->current_superframe % cpi->oxcf.key_freq == 0) && + !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[layer].is_key_frame = 1; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - } else if (is_one_pass_cbr_svc(cpi)) { - if (cm->current_video_frame > 0) vp9_svc_reset_key_frame(cpi); - layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - cpi->svc.layer_context[layer].is_key_frame = 1; + if (is_one_pass_cbr_svc(cpi)) { + if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); + layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + svc->layer_context[layer].is_key_frame = 1; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); // Assumption here is that LAST_FRAME is being updated for a keyframe. // Thus no change in update flags.
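/* Sketch of the layer-context indexing used throughout
 * vp9_rc_get_svc_params(). The definition below is an assumption (the real
 * macro lives in vp9_svc_layercontext.h): the (spatial, temporal) layer
 * pair is flattened spatial-major into svc->layer_context[]. */
#define LAYER_IDS_TO_IDX_SKETCH(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
/* Example: with 3 temporal layers, (sl = 1, tl = 2) maps to index 5. */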
@@ -1715,48 +2222,127 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { } } else { cm->frame_type = INTER_FRAME; - if (is_two_pass_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - } - cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else if (is_one_pass_cbr_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; - } + if (is_one_pass_cbr_svc(cpi)) { + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + // Add condition current_video_frame > 0 for the case where first frame + // is intra only followed by overlay/copy frame. In this case we don't + // want to reset is_key_frame to 0 on overlay/copy frame. + lc->is_key_frame = + (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) + ? 0 + : svc->layer_context[svc->temporal_layer_id].is_key_frame; target = calc_pframe_target_size_one_pass_cbr(cpi); } } + if (svc->simulcast_mode) { + if (svc->spatial_layer_id > 0 && + svc->layer_context[layer].is_key_frame == 1) { + cm->frame_type = KEY_FRAME; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + target = calc_iframe_target_size_one_pass_cbr(cpi); + } + // Set the buffer idx and refresh flags for key frames in simulcast mode. + // Note the buffer slot for long-term reference is set below (line 2255), + // and alt_ref is used for that on key frame. So use last and golden for + // the other two normal slots. + if (cm->frame_type == KEY_FRAME) { + if (svc->number_spatial_layers == 2) { + if (svc->spatial_layer_id == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 2; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 1) { + cpi->lst_fb_idx = 1; + cpi->gld_fb_idx = 3; + cpi->alt_fb_idx = 6; + } + } else if (svc->number_spatial_layers == 3) { + if (svc->spatial_layer_id == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 3; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 1) { + cpi->lst_fb_idx = 1; + cpi->gld_fb_idx = 4; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 2) { + cpi->lst_fb_idx = 2; + cpi->gld_fb_idx = 5; + cpi->alt_fb_idx = 7; + } + } + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + + // Check if superframe contains a sync layer request. + vp9_svc_check_spatial_layer_sync(cpi); + + // If the long term temporal feature is enabled, set the period of the update. + // The update/refresh of this reference frame is always on base temporal + // layer frame. + if (svc->use_gf_temporal_ref_current_layer) { + // Only use gf long-term prediction on non-key superframes. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // Use golden for this reference, which will be used for prediction. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + // Enable prediction off LAST (last reference) and golden (which will + // generally be further behind/long-term reference).
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + // Check for update/refresh of reference: only refresh on base temporal + // layer. + if (svc->temporal_layer_id == 0) { + if (svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On key frame we update the buffer index used for long term reference. + // Use the alt_ref since it is not used or updated on key frames. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } else if (rc->frames_till_gf_update_due == 0) { + // Set period of next update. Make it a multiple of 10, as the cyclic + // refresh is typically ~10%, and we'd like the update to happen after + // a few cycles of the refresh (so it is a better quality frame). Note + // the cyclic refresh for SVC only operates on base temporal layer + // frames. Choose 20 as period for now (2 cycles). + rc->baseline_gf_interval = 20; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + cpi->ext_refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + } + } else if (!svc->use_gf_temporal_ref) { + rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; + } + if (svc->set_intra_only_frame) { + set_intra_only_frame(cpi); + target = calc_iframe_target_size_one_pass_cbr(cpi); + } // Any update/change of global cyclic refresh parameters (amount/delta-qp) // should be done here, before the frame qp is selected. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - rc->frames_till_gf_update_due = INT_MAX; - rc->baseline_gf_interval = INT_MAX; + if (cm->show_frame) update_buffer_level_svc_preencode(cpi); } void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; - // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
- if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && rc->frames_to_key == 0)) { cm->frame_type = KEY_FRAME; - rc->this_key_frame_forced = - cm->current_video_frame != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; @@ -1782,12 +2368,15 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_parameters(cpi); - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) target = calc_iframe_target_size_one_pass_cbr(cpi); else target = calc_pframe_target_size_one_pass_cbr(cpi); vp9_rc_set_frame_target(cpi, target); + + if (cm->show_frame) update_buffer_level_preencode(cpi); + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); else @@ -1859,13 +2448,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( cpi->framerate, rc->min_gf_interval); - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; - - if (is_altref_enabled(cpi)) { - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } + // Extended max interval for genuinely static scenes like slide shows. + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; @@ -1909,12 +2493,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. - // The baseline for this aligns with HW implementations that - // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits - // per 16x16 MB (averaged over a frame). However this limit is extended if - // a very high rate is given on the command line or the the rate cannnot - // be acheived because of a user specificed max q (e.g. when the user - // specifies lossless encode. + // However this limit is extended if a very high rate is given on the command + // line or the rate cannot be achieved because of a user specified max q + // (e.g. when the user specifies lossless encode). + // + // If a level is specified that requires a lower maximum rate then the level + // value takes precedence.
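/* Worked example (illustrative numbers only) for the VBR per-frame cap
 * computed just below from two_pass_vbrmax_section: */
{
  const int64_t avg_frame_bandwidth = 80000; /* example: bits per frame */
  const int two_pass_vbrmax_section = 150;   /* example: percent of average */
  const int vbr_max_bits_example =
      (int)((avg_frame_bandwidth * two_pass_vbrmax_section) / 100);
  /* vbr_max_bits_example == 120000, i.e. 1.5x the average frame budget */
}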
vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); @@ -2271,30 +2855,56 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, void vp9_scene_detection_onepass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source; + YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source; + uint8_t *src_y; + int src_ystride; + int src_width; + int src_height; + uint8_t *last_src_y; + int last_src_ystride; + int last_src_width; + int last_src_height; + if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL || + (cpi->use_svc && cpi->svc.current_superframe == 0)) + return; + src_y = unscaled_src->y_buffer; + src_ystride = unscaled_src->y_stride; + src_width = unscaled_src->y_width; + src_height = unscaled_src->y_height; + last_src_y = unscaled_last_src->y_buffer; + last_src_ystride = unscaled_last_src->y_stride; + last_src_width = unscaled_last_src->y_width; + last_src_height = unscaled_last_src->y_height; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) return; #endif rc->high_source_sad = 0; - if (cpi->Last_Source != NULL && - cpi->Last_Source->y_width == cpi->Source->y_width && - cpi->Last_Source->y_height == cpi->Source->y_height) { + rc->high_num_blocks_with_motion = 0; + // For SVC: scene detection is only checked on first spatial layer of + // the superframe using the original/unscaled resolutions. + if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + src_width == last_src_width && src_height == last_src_height) { YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; - uint8_t *src_y = cpi->Source->y_buffer; - int src_ystride = cpi->Source->y_stride; - uint8_t *last_src_y = cpi->Last_Source->y_buffer; - int last_src_ystride = cpi->Last_Source->y_stride; + int num_mi_cols = cm->mi_cols; + int num_mi_rows = cm->mi_rows; int start_frame = 0; int frames_to_buffer = 1; int frame = 0; int scene_cut_force_key_frame = 0; + int num_zero_temp_sad = 0; uint64_t avg_sad_current = 0; - uint32_t min_thresh = 4000; + uint32_t min_thresh = 10000; float thresh = 8.0f; uint32_t thresh_key = 140000; if (cpi->oxcf.speed <= 5) thresh_key = 240000; - if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 65000; - thresh = 2.1f; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000; + if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f; + if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) { + const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2); + num_mi_cols = aligned_width >> MI_SIZE_LOG2; + num_mi_rows = aligned_height >> MI_SIZE_LOG2; } if (cpi->oxcf.lag_in_frames > 0) { frames_to_buffer = (cm->current_video_frame == 1) @@ -2342,14 +2952,15 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { uint64_t avg_sad = 0; uint64_t tmp_sad = 0; int num_samples = 0; - int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; - int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; if (cpi->oxcf.lag_in_frames > 0) { src_y = frames[frame]->y_buffer; src_ystride = frames[frame]->y_stride; last_src_y = frames[frame + 1]->y_buffer; last_src_ystride = frames[frame + 1]->y_stride; } + num_zero_temp_sad = 0; for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; 
sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. @@ -2361,6 +2972,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { last_src_ystride); avg_sad += tmp_sad; num_samples++; + if (tmp_sad == 0) num_zero_temp_sad++; } src_y += 64; last_src_y += 64; @@ -2377,7 +2989,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { if (avg_sad > VPXMAX(min_thresh, (unsigned int)(rc->avg_source_sad[0] * thresh)) && - rc->frames_since_key > 1) + rc->frames_since_key > 1 + cpi->svc.number_spatial_layers && + num_zero_temp_sad < 3 * (num_samples >> 2)) rc->high_source_sad = 1; else rc->high_source_sad = 0; @@ -2388,6 +3001,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { } else { rc->avg_source_sad[lagframe_idx] = avg_sad; } + if (num_zero_temp_sad < (3 * num_samples >> 2)) + rc->high_num_blocks_with_motion = 1; } } // For CBR non-screen content mode, check if we should reset the rate @@ -2407,6 +3022,19 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) rc->this_frame_target = rc->avg_frame_bandwidth; } + // For SVC the new (updated) avg_source_sad[0] for the current superframe + // updates the setting for all layers. + if (cpi->use_svc) { + int sl, tl; + SVC *const svc = &cpi->svc; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_source_sad[0] = rc->avg_source_sad[0]; + } + } // For VBR, under scene change/high content change, force golden refresh. if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2437,12 +3065,26 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { // Test if encoded frame will significantly overshoot the target bitrate, and // if so, set the QP, reset/adjust some rate control parameters, and return 1. +// frame_size = -1 means frame has not been encoded. int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int thresh_qp = 3 * (rc->worst_quality >> 2); - int thresh_rate = rc->avg_frame_bandwidth * 10; - if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) { + SPEED_FEATURES *const sf = &cpi->sf; + int thresh_qp = 7 * (rc->worst_quality >> 3); + int thresh_rate = rc->avg_frame_bandwidth << 3; + // Lower thresh_qp for video (more overshoot at lower Q) to be + // more conservative for video. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + thresh_qp = 3 * (rc->worst_quality >> 2); + // If this decision is not based on an encoded frame size but just on + // scene/slide change detection (i.e., re_encode_overshoot_cbr_rt == + // FAST_DETECTION_MAXQ), for now skip the (frame_size > thresh_rate) + // condition in this case. + // TODO(marpan): Use a better size/rate condition for this case and + // adjust thresholds. + if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ || + frame_size > thresh_rate) && + cm->base_qindex < thresh_qp) { double rate_correction_factor = cpi->rc.rate_correction_factors[INTER_NORMAL]; const int target_size = cpi->rc.avg_frame_bandwidth; @@ -2452,6 +3094,29 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { int enumerator; // Force a re-encode, and for now use max-QP. 
*q = cpi->rc.worst_quality; + cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 1; + // If the frame_size is much larger than the threshold (big content change) + // and the encoded frame used a lot of Intra modes, then force hybrid_intra + // encoding for the re-encode on this scene change. hybrid_intra will + // use rd-based intra mode selection for small blocks. + if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) { + MODE_INFO **mi = cm->mi_grid_visible; + int sum_intra_usage = 0; + int mi_row, mi_col; + int tot = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; + tot++; + mi++; + } + mi += 8; + } + sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols); + if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1; + } // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as // these parameters will affect QP selection for subsequent frames. If they // have settled down to a very different (low QP) state, then not adjusting @@ -2479,21 +3144,27 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor; } // For temporal layers, reset the rate control parameters across all - // temporal layers. + // temporal layers. If the first_spatial_layer_to_encode > 0, then this + // superframe has skipped lower base layers. So in this case we should also + // reset and force max-q for spatial layers < first_spatial_layer_to_encode. if (cpi->use_svc) { - int i = 0; + int tl = 0; + int sl = 0; SVC *svc = &cpi->svc; - for (i = 0; i < svc->number_temporal_layers; ++i) { - const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; - lrc->avg_frame_qindex[INTER_FRAME] = *q; - lrc->buffer_level = rc->optimal_buffer_level; - lrc->bits_off_target = rc->optimal_buffer_level; - lrc->rc_1_frame = 0; - lrc->rc_2_frame = 0; - lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + for (sl = 0; sl < svc->first_spatial_layer_to_encode; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = *q; + lrc->buffer_level = lrc->optimal_buffer_level; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + lrc->force_max_q = 1; + } } } return 1; diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.h b/libs/libvpx/vp9/encoder/vp9_ratectrl.h index c1b210677e..09d69e4d4e 100644 --- a/libs/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RATECTRL_H_ -#define VP9_ENCODER_VP9_RATECTRL_H_ +#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_RATECTRL_H_ #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" @@ -34,6 +34,14 @@ extern "C" { #define FRAME_OVERHEAD_BITS 200 +// Threshold used to define a KF group as static (e.g. a slide show).
+// Essentially this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 + +// The maximum duration of a GF group that is static (for example a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -167,15 +175,34 @@ typedef struct { uint64_t avg_source_sad[MAX_LAG_BUFFERS]; uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; + int high_num_blocks_with_motion; int alt_ref_gf_group; int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; + int hybrid_intra_scene_change; + int re_encode_maxq_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; int reset_high_source_sad; double perc_arf_usage; + int force_max_q; + // Last frame was dropped post encode on scene change. + int last_post_encode_dropped_scene_change; + // Enable post encode frame dropping for screen content. Only enabled when + // ext_use_post_encode_drop is enabled by user. + int use_post_encode_drop; + // External flag to enable post encode frame dropping, controlled by user. + int ext_use_post_encode_drop; + + int damped_adjustment[RATE_FACTOR_LEVELS]; + double arf_active_best_quality_adjustment_factor; + int arf_active_best_quality_adjustment_window; + + int preserve_arf_as_gld; + int preserve_next_arf_as_gld; + int show_arf_as_gld; } RATE_CONTROL; struct VP9_COMP; @@ -184,7 +211,7 @@ struct VP9EncoderConfig; void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc); -int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, +int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, double correction_factor, vpx_bit_depth_t bit_depth); double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); @@ -195,9 +222,9 @@ void vp9_rc_init_minq_luts(void); int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to -// be passed in to ensure that the max_gf_interval returned is at least as bis +// be passed in to ensure that the max_gf_interval returned is at least as big // as that. -int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); +int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); // Generally at the high level, the following flow is expected // to be enforced for rate control: @@ -237,13 +264,16 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); // Changes only the rate correction factors in the rate control structure. void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi); +// Post encode drop for CBR mode. +int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size); + // Decide if we should drop this frame: For 1-pass CBR. // Changes only the decimation count in the rate control structure int vp9_rc_drop_frame(struct VP9_COMP *cpi); // Computes frame size bounds. 
void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, - int this_frame_target, + int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); @@ -294,8 +324,12 @@ void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); +void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); + +void vp9_estimate_qp_gop(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RATECTRL_H_ +#endif // VPX_VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_rd.c b/libs/libvpx/vp9/encoder/vp9_rd.c index 6b2306ce9b..34c74424ce 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.c +++ b/libs/libvpx/vp9/encoder/vp9_rd.c @@ -57,6 +57,30 @@ void vp9_rd_cost_init(RD_COST *rd_cost) { rd_cost->rdcost = 0; } +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) { + assert(mult >= 0); + assert(div > 0); + if (rate >= 0 && dist >= 0) { + return RDCOST(mult, div, rate, dist); + } + if (rate >= 0 && dist < 0) { + return RDCOST_NEG_D(mult, div, rate, -dist); + } + if (rate < 0 && dist >= 0) { + return RDCOST_NEG_R(mult, div, -rate, dist); + } + return -RDCOST(mult, div, -rate, -dist); +} + +void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX) { + rd_cost->rdcost = + vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist); + } else { + vp9_rd_cost_reset(rd_cost); + } +} + // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for block size. @@ -69,10 +93,12 @@ static void fill_mode_costs(VP9_COMP *cpi) { const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], vp9_intra_mode_tree); + } + } vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); for (i = 0; i < INTRA_MODES; ++i) { @@ -82,9 +108,28 @@ static void fill_mode_costs(VP9_COMP *cpi) { fc->uv_mode_prob[i], vp9_intra_mode_tree); } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { vp9_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } } static void fill_token_costs(vp9_coeff_cost *c, @@ -143,40 +188,74 @@ void vp9_init_me_luts(void) { static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; -static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, - 128, 144 }; -int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { - const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - int64_t rdmult = 0; - switch (cpi->common.bit_depth) { - case VPX_BITS_8: rdmult = 88 * q * q / 24; break; - case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 
4); break; - case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; +// Note that the element below for frame type "USE_BUF_FRAME", which indicates +// that the show frame flag is set, should not be used as no real frame +// is encoded so we should not reach here. However, a dummy value +// is inserted here to make sure the data structure has the right number +// of values assigned. +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144 }; + +int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { + // largest dc_quant is 21387, therefore rdmult should always fit in int32_t + const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); + uint32_t rdmult = q * q; + + if (cpi->common.frame_type != KEY_FRAME) { + if (qindex < 128) + rdmult = rdmult * 4; + else if (qindex < 190) + rdmult = rdmult * 4 + rdmult / 2; + else + rdmult = rdmult * 3; + } else { + if (qindex < 64) + rdmult = rdmult * 4; + else if (qindex <= 128) + rdmult = rdmult * 3 + rdmult / 2; + else if (qindex < 190) + rdmult = rdmult * 4 + rdmult / 2; + else + rdmult = rdmult * 7 + rdmult / 2; + } +#if CONFIG_VP9_HIGHBITDEPTH + switch (cpi->common.bit_depth) { + case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: break; } -#else - int64_t rdmult = 88 * q * q / 24; #endif // CONFIG_VP9_HIGHBITDEPTH - return rdmult; + return rdmult > 0 ? rdmult : 1; } -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { - int64_t rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); - +static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { + int64_t rdmult_64 = rdmult; if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; - const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100)); + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : cpi->rc.gfu_boost; + const int boost_index = VPXMIN(15, (gfu_boost / 100)); - rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; - rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7; + rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7); } - if (rdmult < 1) rdmult = 1; - return (int)rdmult; + return (int)rdmult_64; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + return modulate_rdmult(cpi, rdmult); +} + +int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) { + int rdmult = + vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex); + rdmult = (int)((double)rdmult / beta); + rdmult = rdmult > 0 ? 
rdmult : 1; + return modulate_rdmult(cpi, rdmult); } static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { @@ -185,10 +264,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; - case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; } #else (void)bit_depth; @@ -209,12 +288,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { x->sadperbit16 = sad_per_bit16lut_10[qindex]; x->sadperbit4 = sad_per_bit4lut_10[qindex]; break; - case VPX_BITS_12: + default: + assert(cpi->common.bit_depth == VPX_BITS_12); x->sadperbit16 = sad_per_bit16lut_12[qindex]; x->sadperbit4 = sad_per_bit4lut_12[qindex]; break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #else (void)cpi; @@ -255,6 +333,15 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { } } +void vp9_build_inter_mode_cost(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + int i; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], cm->fc->inter_mode_probs[i], + vp9_inter_mode_tree); + } +} + void vp9_initialize_rd_consts(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; @@ -303,10 +390,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc, cm->allow_high_precision_mv); - - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - vp9_cost_tokens((int *)cpi->inter_mode_cost[i], - cm->fc->inter_mode_probs[i], vp9_inter_mode_tree); + vp9_build_inter_mode_cost(cpi); } } } @@ -471,13 +555,13 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; - case TX_32X32: + default: + assert(tx_size == TX_32X32); for (i = 0; i < num_4x4_w; i += 8) t_above[i] = !!*(const uint64_t *)&above[i]; for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; - default: assert(0 && "Invalid transform size."); break; } } @@ -493,8 +577,7 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; const int num_mv_refs = - MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size); + MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size); MV pred_mv[3]; pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; @@ -504,11 +587,12 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == x->mbmi_ext->ref_mvs[ref_frame][1].as_int; + // Get the sad for each candidate reference mv. 
for (i = 0; i < num_mv_refs; ++i) { const MV *this_mv = &pred_mv[i]; int fp_row, fp_col; - + if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue; if (i == 1 && near_same_nearest) continue; fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; @@ -573,6 +657,7 @@ YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, const VP9_COMMON *const cm = &cpi->common; const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ? &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL; diff --git a/libs/libvpx/vp9/encoder/vp9_rd.h b/libs/libvpx/vp9/encoder/vp9_rd.h index 59022c106e..df6ea9094c 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.h +++ b/libs/libvpx/vp9/encoder/vp9_rd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RD_H_ -#define VP9_ENCODER_VP9_RD_H_ +#ifndef VPX_VP9_ENCODER_VP9_RD_H_ +#define VPX_VP9_ENCODER_VP9_RD_H_ #include @@ -27,7 +27,12 @@ extern "C" { #define RD_EPB_SHIFT 6 #define RDCOST(RM, DM, R, D) \ - (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM)) + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM)) +#define RDCOST_NEG_R(RM, DM, R, D) \ + ((D) << (DM)) - ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) +#define RDCOST_NEG_D(RM, DM, R, D) \ + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) - ((D) << (DM)) + #define QIDX_SKIP_THRESH 115 #define MV_COST_WEIGHT 108 @@ -42,6 +47,9 @@ extern "C" { #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 +#define VP9_DIST_SCALE_LOG2 4 +#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2) + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { @@ -98,8 +106,8 @@ typedef enum { typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. A higher value // means that we will accept the best mode so far more often. This number - // is used in combination with the current block size, and thresh_freq_fact - // to pick a threshold. + // is used in combination with the current block size, and thresh_freq_fact to + // pick a threshold. int thresh_mult[MAX_MODES]; int thresh_mult_sub8x8[MAX_REFS]; @@ -108,9 +116,14 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#if CONFIG_CONSISTENT_RECODE + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#endif int RDMULT; int RDDIV; + double r0; } RD_OPT; typedef struct RD_COST { @@ -123,22 +136,27 @@ typedef struct RD_COST { void vp9_rd_cost_reset(RD_COST *rd_cost); // Initialize the rate distortion cost values to zero. void vp9_rd_cost_init(RD_COST *rd_cost); +// It supports negative rate and dist, which is different from RDCOST(). +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist); +// Update the cost value based on its rate and distortion. 
+void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost); struct TileInfo; struct TileDataEnc; struct VP9_COMP; struct macroblock; -int64_t vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, - int qindex); +int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); +int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta); + void vp9_initialize_rd_consts(struct VP9_COMP *cpi); void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], @@ -169,8 +187,8 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); -void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, - int best_mode_index); +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, const int *const thresh_fact) { @@ -208,8 +226,10 @@ unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, BLOCK_SIZE bs, int bd); #endif +void vp9_build_inter_mode_cost(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RD_H_ +#endif // VPX_VP9_ENCODER_VP9_RD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.c b/libs/libvpx/vp9/encoder/vp9_rdopt.c index 2ba6378c5e..d07d91774b 100644 --- a/libs/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libs/libvpx/vp9/encoder/vp9_rdopt.c @@ -31,6 +31,9 @@ #include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" @@ -40,7 +43,6 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_aq_variance.h" #define LAST_FRAME_MODE_MASK \ ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) @@ -59,7 +61,9 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; struct rdcost_block_args { const VP9_COMP *cpi; @@ -75,9 +79,12 @@ struct rdcost_block_args { int use_fast_coef_costing; const scan_order *so; uint8_t skippable; + struct buf_2d *this_recon; }; #define LAST_NEW_MV_INDEX 6 + +#if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { NEARESTMV, { LAST_FRAME, NONE } }, { NEARESTMV, { ALTREF_FRAME, NONE } }, @@ -125,6 +132,7 @@ static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, }; +#endif // !CONFIG_REALTIME_ONLY static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, int min_plane, int max_plane) { @@ -151,6 +159,7 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } } +#if !CONFIG_REALTIME_ONLY static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t 
*out_dist_sum, int *skip_txfm_sb, @@ -271,10 +280,11 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } *skip_txfm_sb = skip_flag; - *skip_sse_sb = total_sse << 4; + *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; - *out_dist_sum = dist_sum << 4; + *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; } +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_HIGHBITDEPTH int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, @@ -457,6 +467,66 @@ static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; } +// Copy all visible 4x4s in the transform block. +static void copy_block_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, + pd->subsampling_x, blk_col); + int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, + pd->subsampling_y, blk_row); + const int is_highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + const int w = tx_4x4_w << 2; + const int h = tx_4x4_h << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride, NULL, 0, 0, + 0, 0, w, h, xd->bd); + } else { +#endif + vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, 0, 0, 0, 0, w, + h); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } else { + int r, c; + int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + const uint8_t *src_ptr = src + r * src_stride * 4 + c * 4; + uint8_t *dst_ptr = dst + r * dst_stride * 4 + c * 4; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + NULL, 0, 0, 0, 0, 4, 4, xd->bd); + } else { +#endif + vpx_convolve_copy(src_ptr, src_stride, dst_ptr, dst_stride, NULL, 0, + 0, 0, 0, 4, 4); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } + } + } + (void)is_highbd; +} + // Compute the pixel domain sum square error on all visible 4x4s in the // transform block. 
static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, @@ -537,12 +607,13 @@ static int64_t sum_squares_visible(const MACROBLOCKD *xd, static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse) { + int64_t *out_sse, struct buf_2d *out_recon) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; - if (x->block_tx_domain) { + if (!out_recon && x->block_tx_domain && eob) { const int ss_txfrm_size = tx_size << 1; int64_t this_sse; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -581,15 +652,23 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const int dst_idx = 4 * (blk_row * dst_stride + blk_col); const uint8_t *src = &p->src.buf[src_idx]; const uint8_t *dst = &pd->dst.buf[dst_idx]; + uint8_t *out_recon_ptr = 0; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t *eob = &p->eobs[block]; unsigned int tmp; tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, blk_col, plane_bsize, tx_bsize); *out_sse = (int64_t)tmp * 16; + if (out_recon) { + const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); + out_recon_ptr = &out_recon->buf[out_recon_idx]; + copy_block_visible(xd, pd, dst, dst_stride, out_recon_ptr, + out_recon->stride, blk_row, blk_col, plane_bsize, + tx_bsize); + } - if (*eob) { + if (eob) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint16_t, recon16[1024]); uint8_t *recon = (uint8_t *)recon16; @@ -602,22 +681,22 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); break; - case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); break; - default: assert(0 && "Invalid transform size"); } } recon = CONVERT_TO_BYTEPTR(recon16); @@ -625,16 +704,16 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { - case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; - case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; - case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break; - case TX_4X4: + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is 
significant (not just an optimization) for // the lossless case. - x->inv_txfm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, eob); break; - default: assert(0 && "Invalid transform size"); break; } #if CONFIG_VP9_HIGHBITDEPTH } @@ -642,6 +721,10 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col, plane_bsize, tx_bsize); + if (out_recon) { + copy_block_visible(xd, pd, recon, 32, out_recon_ptr, out_recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } } *out_dist = (int64_t)tmp * 16; @@ -666,26 +749,38 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t sse; const int coeff_ctx = combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]); + struct buf_2d *recon = args->this_recon; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; if (args->exit_early) return; if (!is_inter_block(mi)) { +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args intra_arg = { + x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0 + }; +#else struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip }; +#endif vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, &intra_arg); + if (recon) { + uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; + copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); + tx_size, &dist, &sse, /*recon =*/0); } else { - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, @@ -699,17 +794,20 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, blk_row, blk_col, plane_bsize, tx_bsize); dist = (int64_t)tmp * 16; } - } else if (max_txsize_lookup[plane_bsize] == tx_size) { - if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_NONE) { + } else { + int skip_txfm_flag = SKIP_TXFM_NONE; + if (max_txsize_lookup[plane_bsize] == tx_size) + skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + + if (skip_txfm_flag == SKIP_TXFM_NONE || + (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); if (x->block_qcoeff_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); - } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_AC_ONLY) { + tx_size, &dist, &sse, recon); + } else if (skip_txfm_flag == 
SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); @@ -735,14 +833,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, x->plane[plane].eobs[block] = 0; sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; + if (recon) { + uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; + copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } } - } else { - // full forward transform and quantization - vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); - dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); } rd = RDCOST(x->rdmult, x->rddiv, 0, dist); @@ -761,7 +857,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, rd = VPXMIN(rd1, rd2); if (plane == 0) { x->zcoeff_blk[tx_size][block] = - !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless); + !x->plane[plane].eobs[block] || + (x->sharpness == 0 && rd1 > rd2 && !xd->lossless); x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; } @@ -781,7 +878,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, - TX_SIZE tx_size, int use_fast_coef_casting) { + TX_SIZE tx_size, int use_fast_coef_costing, + struct buf_2d *recon) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; @@ -789,8 +887,9 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, args.cpi = cpi; args.x = x; args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; + args.use_fast_coef_costing = use_fast_coef_costing; args.skippable = 1; + args.this_recon = recon; if (plane == 0) xd->mi[0]->tx_size = tx_size; @@ -815,7 +914,8 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *sse, - int64_t ref_best_rd, BLOCK_SIZE bs) { + int64_t ref_best_rd, BLOCK_SIZE bs, + struct buf_2d *recon) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -825,13 +925,13 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mi->tx_size = VPXMIN(max_tx_size, largest_tx_size); txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs, - mi->tx_size, cpi->sf.use_fast_coef_costing); + mi->tx_size, cpi->sf.use_fast_coef_costing, recon); } static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, int64_t ref_best_rd, - BLOCK_SIZE bs) { + BLOCK_SIZE bs, struct buf_2d *recon) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -843,20 +943,34 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX } }; - int n, m; + int n; int s0, 
s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = ref_best_rd; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; + const int tx_size_ctx = get_tx_size_context(xd); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon_buf16[TX_SIZES][64 * 64]); + uint8_t *recon_buf[TX_SIZES]; + for (n = 0; n < TX_SIZES; ++n) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon_buf[n] = CONVERT_TO_BYTEPTR(recon_buf16[n]); + } else { + recon_buf[n] = (uint8_t *)recon_buf16[n]; + } + } +#else + DECLARE_ALIGNED(16, uint8_t, recon_buf[TX_SIZES][64 * 64]); +#endif // CONFIG_VP9_HIGHBITDEPTH - const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); if (cm->tx_mode == TX_MODE_SELECT) { start_tx = max_tx_size; - end_tx = 0; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); } else { TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -865,15 +979,17 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int)max_tx_size); m++) { - if (m == n) - r_tx_size += vp9_cost_zero(tx_probs[m]); - else - r_tx_size += vp9_cost_one(tx_probs[m]); + const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; + if (recon) { + struct buf_2d this_recon; + this_recon.buf = recon_buf[n]; + this_recon.stride = recon->stride; + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, &this_recon); + } else { + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, 0); } - txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, - bs, n, cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -915,11 +1031,25 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, *rate = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mi->tx_size]; *psse = sse[mi->tx_size]; + if (recon) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + memcpy(CONVERT_TO_SHORTPTR(recon->buf), + CONVERT_TO_SHORTPTR(recon_buf[mi->tx_size]), + 64 * 64 * sizeof(uint16_t)); + } else { +#endif + memcpy(recon->buf, recon_buf[mi->tx_size], 64 * 64); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } } static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, - BLOCK_SIZE bs, int64_t ref_best_rd) { + BLOCK_SIZE bs, int64_t ref_best_rd, + struct buf_2d *recon) { MACROBLOCKD *xd = &x->e_mbd; int64_t sse; int64_t *ret_sse = psse ? 
psse : &sse; @@ -928,10 +1058,10 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, - bs); + bs, recon); } else { choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, - bs); + bs, recon); } } @@ -1273,7 +1403,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd); + bsize, best_rd, /*recon = */ 0); if (this_rate_tokenonly == INT_MAX) continue; @@ -1325,7 +1455,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, - plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing); + plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, + /*recon = */ 0); if (pnrate == INT_MAX) { is_cost_valid = 0; break; @@ -1393,6 +1524,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } +#if !CONFIG_REALTIME_ONLY static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { @@ -1466,11 +1598,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, if (is_compound) this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; break; - case ZEROMV: + default: + assert(mode == ZEROMV); this_mv[0].as_int = 0; if (is_compound) this_mv[1].as_int = 0; break; - default: break; } mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; @@ -1604,6 +1736,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } +#endif // !CONFIG_REALTIME_ONLY typedef struct { int eobs; @@ -1631,6 +1764,7 @@ typedef struct { int mvthresh; } BEST_SEG_INFO; +#if !CONFIG_REALTIME_ONLY static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { return (mv->row >> 3) < mv_limits->row_min || (mv->row >> 3) > mv_limits->row_max || @@ -1829,8 +1963,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, bestsme = cpi->find_fractional_mv_step( x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph); + cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search); } // Restore the pointer to the first (possibly scaled) prediction buffer. 
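An aside for readers tracing the rate-distortion changes above: block_rd_txfm() prices each transform block twice, once as coded (rate, dist) and once as skipped (rate 0, distortion sse), and keeps the cheaper option; the new x->sharpness test merely stops the skip outcome from marking blocks as zero-coefficient when sharpness is requested. Below is a minimal standalone sketch of that idiom, using a simplified stand-in for the RDCOST macro (the real fixed-point arithmetic lives in vp9/encoder/vp9_rd.h and differs in its rounding):

#include <stdint.h>

/* Illustrative sketch only -- not part of the patch. The scaling in
 * rd_cost() is a simplified stand-in for libvpx's RDCOST macro. */
static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
  /* lambda * rate in fixed point, plus distortion scaled by rddiv */
  return (((int64_t)rate * rdmult) >> 8) + (dist << rddiv);
}

static int64_t pick_coded_or_skip(int rdmult, int rddiv, int rate,
                                  int64_t dist, int64_t sse, int *skip) {
  const int64_t rd_coded = rd_cost(rdmult, rddiv, rate, dist);
  const int64_t rd_skip = rd_cost(rdmult, rddiv, 0, sse);
  *skip = rd_skip < rd_coded; /* mirrors rd = VPXMIN(rd1, rd2) above */
  return *skip ? rd_skip : rd_coded;
}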
@@ -1884,6 +2018,8 @@ static int64_t rd_pick_best_sub8x8_mode( const BLOCK_SIZE bsize = mi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int pw = num_4x4_blocks_wide << 2; + const int ph = num_4x4_blocks_high << 2; ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; SPEED_FEATURES *const sf = &cpi->sf; @@ -1992,8 +2128,11 @@ static int64_t rd_pick_best_sub8x8_mode( mvp_full.col = bsi->mvp.as_mv.col >> 3; if (sf->adaptive_motion_search) { - mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; - mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX && + x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) { + mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + } step_param = VPXMAX(step_param, 8); } @@ -2015,16 +2154,16 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->find_fractional_mv_step( x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop, - sf->mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &distortion, - &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0); + &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; } - if (sf->adaptive_motion_search) - x->pred_mv[mi->ref_frame[0]] = *new_mv; + x->pred_mv[mi->ref_frame[0]] = *new_mv; // restore src pointers mi_buf_restore(x, orig_src, orig_pre); @@ -2319,6 +2458,22 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, block_size); } +#if CONFIG_NON_GREEDY_MV +static int ref_frame_to_gf_rf_idx(int ref_frame) { + if (ref_frame == GOLDEN_FRAME) { + return 0; + } + if (ref_frame == LAST_FRAME) { + return 1; + } + if (ref_frame == ALTREF_FRAME) { + return 2; + } + assert(0); + return -1; +} +#endif + static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { @@ -2326,19 +2481,35 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const VP9_COMMON *cm = &cpi->common; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; - int bestsme = INT_MAX; int step_param; - int sadpb = x->sadperbit16; MV mvp_full; int ref = mi->ref_frame[0]; MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; const MvLimits tmp_mv_limits = x->mv_limits; int cost_list[5]; - + const int best_predmv_idx = x->mv_best_ref_index[ref]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; MV pred_mv[3]; + +#if CONFIG_NON_GREEDY_MV + double bestsme; + int_mv nb_full_mvs[NB_MVS_NUM]; + const int nb_full_mv_num = NB_MVS_NUM; + int gf_group_idx = cpi->twopass.gf_group.index; + int gf_rf_idx = ref_frame_to_gf_rf_idx(ref); + BLOCK_SIZE square_bsize = get_square_block_size(bsize); + const int lambda = (pw * ph) / 4; + assert(pw * ph == lambda << 2); + vp9_prepare_nb_full_mvs(&cpi->tpl_stats[gf_group_idx], mi_row, mi_col, + gf_rf_idx, square_bsize, nb_full_mvs); +#else // CONFIG_NON_GREEDY_MV + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; +#endif // CONFIG_NON_GREEDY_MV + 
pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; pred_mv[2] = x->pred_mv[ref]; @@ -2367,7 +2538,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { - int boffset = + const int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] - VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); step_param = VPXMAX(step_param, boffset); @@ -2385,8 +2556,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int i; for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - x->pred_mv[ref].row = 0; - x->pred_mv[ref].col = 0; + x->pred_mv[ref].row = INT16_MAX; + x->pred_mv[ref].col = INT16_MAX; tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { @@ -2404,14 +2575,69 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // after full-pixel motion search. vp9_set_mv_search_range(&x->mv_limits, &ref_mv); - mvp_full = pred_mv[x->mv_best_ref_index[ref]]; - + mvp_full = pred_mv[best_predmv_idx]; mvp_full.col >>= 3; mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_diamond_new(cpi, x, &mvp_full, step_param, lambda, 1, + &cpi->fn_ptr[bsize], nb_full_mvs, + nb_full_mv_num, &tmp_mv->as_mv); +#else // CONFIG_NON_GREEDY_MV bestsme = vp9_full_pixel_search( cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + + if (cpi->sf.enhanced_full_pixel_motion_search) { + int i; + for (i = 0; i < 3; ++i) { +#if CONFIG_NON_GREEDY_MV + double this_me; +#else // CONFIG_NON_GREEDY_MV + int this_me; +#endif // CONFIG_NON_GREEDY_MV + MV this_mv; + int diff_row; + int diff_col; + int step; + + if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue; + if (i == best_predmv_idx) continue; + + diff_row = ((int)pred_mv[i].row - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >> + 3; + diff_col = ((int)pred_mv[i].col - + pred_mv[i > 0 ? 
(i - 1) : best_predmv_idx].col) >> + 3; + if (diff_row == 0 && diff_col == 0) continue; + if (diff_row < 0) diff_row = -diff_row; + if (diff_col < 0) diff_col = -diff_col; + step = get_msb((diff_row + diff_col + 1) >> 1); + if (step <= 0) continue; + + mvp_full = pred_mv[i]; + mvp_full.col >>= 3; + mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_diamond_new( + cpi, x, &mvp_full, VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + lambda, 1, &cpi->fn_ptr[bsize], nb_full_mvs, nb_full_mv_num, + &this_mv); +#else // CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &this_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + if (this_me < bestsme) { + tmp_mv->as_mv = this_mv; + bestsme = this_me; + } + } + } x->mv_limits = tmp_mv_limits; @@ -2420,13 +2646,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = tmp_mv->as_mv; + x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { int i; @@ -2453,21 +2680,56 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, // visual quality. 
static int discount_newmv_test(const VP9_COMP *cpi, int this_mode, int_mv this_mv, - int_mv (*mode_mv)[MAX_REF_FRAMES], - int ref_frame) { + int_mv (*mode_mv)[MAX_REF_FRAMES], int ref_frame, + int mi_row, int mi_col, BLOCK_SIZE bsize) { +#if CONFIG_NON_GREEDY_MV + (void)mode_mv; + (void)this_mv; + if (this_mode == NEWMV && bsize >= BLOCK_8X8 && cpi->tpl_ready) { + const int gf_group_idx = cpi->twopass.gf_group.index; + const int gf_rf_idx = ref_frame_to_gf_rf_idx(ref_frame); + const TplDepFrame tpl_frame = cpi->tpl_stats[gf_group_idx]; + const int tpl_block_mi_h = num_8x8_blocks_high_lookup[cpi->tpl_bsize]; + const int tpl_block_mi_w = num_8x8_blocks_wide_lookup[cpi->tpl_bsize]; + const int tpl_mi_row = mi_row - (mi_row % tpl_block_mi_h); + const int tpl_mi_col = mi_col - (mi_col % tpl_block_mi_w); + const int mv_mode = + tpl_frame + .mv_mode_arr[gf_rf_idx][tpl_mi_row * tpl_frame.stride + tpl_mi_col]; + if (mv_mode == NEW_MV_MODE) { + int_mv tpl_new_mv = *get_pyramid_mv(&tpl_frame, gf_rf_idx, cpi->tpl_bsize, + tpl_mi_row, tpl_mi_col); + int row_diff = abs(tpl_new_mv.as_mv.row - this_mv.as_mv.row); + int col_diff = abs(tpl_new_mv.as_mv.col - this_mv.as_mv.col); + if (VPXMAX(row_diff, col_diff) <= 8) { + return 1; + } else { + return 0; + } + } else { + return 0; + } + } else { + return 0; + } +#else + (void)mi_row; + (void)mi_col; + (void)bsize; return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) && (this_mv.as_int != 0) && ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && ((mode_mv[NEARMV][ref_frame].as_int == 0) || (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +#endif } static int64_t handle_inter_mode( VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2, int64_t *distortion, int *skippable, int *rate_y, int *rate_uv, - int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, - int mi_col, int_mv single_newmv[MAX_REF_FRAMES], + struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], + int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { @@ -2573,7 +2835,8 @@ static int64_t handle_inter_mode( // under certain circumstances where we want to help initiate a weak // motion field, where the distortion gain for a single block may not // be enough to overcome the cost of a new mv. - if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) { + if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0], mi_row, + mi_col, bsize)) { *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } else { *rate2 += rate_mv; @@ -2606,8 +2869,8 @@ static int64_t handle_inter_mode( // // Under some circumstances we discount the cost of new mv mode to encourage // initiation of a motion field. 
- if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, - refs[0])) { + if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0], + mi_row, mi_col, bsize)) { *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]), cost_mv_ref(cpi, NEARESTMV, mbmi_ext->mode_context[refs[0]])); @@ -2771,7 +3034,7 @@ static int64_t handle_inter_mode( memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); memcpy(x->bsse, bsse, sizeof(bsse)); - if (!skip_txfm_sb) { + if (!skip_txfm_sb || xd->lossless) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; int64_t rdcosty = INT64_MAX; @@ -2779,7 +3042,7 @@ static int64_t handle_inter_mode( // Y cost and distortion vp9_subtract_plane(x, bsize, 0); super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize, - ref_best_rd); + ref_best_rd, recon); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; @@ -2821,6 +3084,7 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); return 0; // The rate-distortion cost will be re-calculated by caller. } +#endif // !CONFIG_REALTIME_ONLY void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, @@ -2874,85 +3138,97 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); } +#if !CONFIG_REALTIME_ONLY // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. -#define VERY_LOW_VAR_THRESH 2 -#define LOW_VAR_THRESH 5 -#define VAR_MULT 100 -static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 }; +#define LOW_VAR_THRESH 250 +#define VAR_MULT 250 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 250 }; static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, + struct buf_2d *recon, MV_REFERENCE_FRAME ref_frame, - unsigned int source_variance) { + MV_REFERENCE_FRAME second_ref_frame, + PREDICTION_MODE this_mode) { MACROBLOCKD *const xd = &x->e_mbd; unsigned int rec_variance; unsigned int src_variance; unsigned int src_rec_min; - unsigned int absvar_diff = 0; + unsigned int var_diff = 0; unsigned int var_factor = 0; unsigned int adj_max; + unsigned int low_var_thresh = LOW_VAR_THRESH; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - if (source_variance > 0) { - rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, - bsize, xd->bd); - src_variance = source_variance; - } else { - rec_variance = - vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); - src_variance = - vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); - } + rec_variance = vp9_high_get_sby_variance(cpi, recon, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); } else { - if (source_variance > 0) { - rec_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = source_variance; - } else { - rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); - } - } -#else - if (source_variance > 0) { - rec_variance = 
vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = source_variance; - } else { - rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); } +#else + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); #endif // CONFIG_VP9_HIGHBITDEPTH + // Scale based on area in 8x8 blocks + rec_variance /= (bw * bh); + src_variance /= (bw * bh); + + if (content_type == VP9E_CONTENT_FILM) { + if (cpi->oxcf.pass == 2) { + // Adjust low variance threshold based on estimated group noise energy. + double noise_factor = + (double)cpi->twopass.gf_group.group_noise_energy / SECTION_NOISE_DEF; + low_var_thresh = (unsigned int)(low_var_thresh * noise_factor); + + if (ref_frame == INTRA_FRAME) { + low_var_thresh *= 2; + if (this_mode == DC_PRED) low_var_thresh *= 5; + } else if (second_ref_frame > INTRA_FRAME) { + low_var_thresh *= 2; + } + } + } else { + low_var_thresh = LOW_VAR_THRESH / 2; + } + // Lower of source (raw per pixel value) and recon variance. Note that // if the source per pixel is 0 then the recon value here will not be per // pixel (see above) so will likely be much larger. - src_rec_min = VPXMIN(source_variance, rec_variance); + src_rec_min = VPXMIN(src_variance, rec_variance); - if (src_rec_min > LOW_VAR_THRESH) return; + if (src_rec_min > low_var_thresh) return; - absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) : (rec_variance - src_variance); + // We care more when the reconstruction has lower variance so give this case + // a stronger weighting. + var_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) * 2 : (rec_variance - src_variance) / 2; adj_max = max_var_adjust[content_type]; var_factor = - (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance); + (unsigned int)((int64_t)VAR_MULT * var_diff) / VPXMAX(1, src_variance); var_factor = VPXMIN(adj_max, var_factor); + if ((content_type == VP9E_CONTENT_FILM) && + ((ref_frame == INTRA_FRAME) || (second_ref_frame > INTRA_FRAME))) { + var_factor *= 2; + } + *this_rd += (*this_rd * var_factor) / 100; - if (content_type == VP9E_CONTENT_FILM) { - if (src_rec_min <= VERY_LOW_VAR_THRESH) { - if (ref_frame == INTRA_FRAME) *this_rd *= 2; - if (bsize > 6) *this_rd *= 2; - } - } + (void)xd; } +#endif // !CONFIG_REALTIME_ONLY // Do we have an internal image edge (e.g. formatting bars). 
int vp9_internal_image_edge(VP9_COMP *cpi) { @@ -3023,6 +3299,7 @@ int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) { vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE); } +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3066,20 +3343,36 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int intra_cost_penalty = vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; - uint8_t ref_frame_skip_mask[2] = { 0 }; + uint8_t ref_frame_skip_mask[2] = { 0, 1 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; int mode_skip_start = sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *tile_mode_map = tile_data->mode_map[bsize]; - int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid - // lock mechanism involved with reads from - // tile_mode_map + int8_t *tile_mode_map = tile_data->mode_map[bsize]; + int8_t mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; + const int is_rect_partition = + num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize]; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; + struct buf_2d *recon; + struct buf_2d recon_buf; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon16[64 * 64]); + recon_buf.buf = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH + ? CONVERT_TO_BYTEPTR(recon16) + : (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]); + recon_buf.buf = recon8; +#endif // CONFIG_VP9_HIGHBITDEPTH + recon_buf.stride = 64; + recon = cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : 0; + vp9_zero(best_mbmode); x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3105,7 +3398,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); @@ -3228,18 +3522,21 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_zero(x->sum_y_eobs); + if (is_rect_partition) { + if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; + if (second_ref_frame > 0 && + (ctx->skip_ref_frame_mask & (1 << second_ref_frame))) + continue; + } + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. 
if (midx == mode_skip_start && best_mode_index >= 0) { switch (best_mbmode.ref_frame[0]) { case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; + case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break; case GOLDEN_FRAME: ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; case NONE: @@ -3313,6 +3610,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -3339,7 +3640,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes - const unsigned int skip_intra_var_thresh = 64; + const unsigned int skip_intra_var_thresh = + (cpi->oxcf.content == VP9E_CONTENT_FILM) ? 0 : 64; if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && x->source_variance < skip_intra_var_thresh) continue; @@ -3385,7 +3687,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[1]; memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, - best_rd); + best_rd, recon); if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] @@ -3408,7 +3710,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } else { this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, - &disable_skip, frame_mv, mi_row, mi_col, single_newmv, + recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, single_inter_filter, single_skippable, &total_sse, best_rd, &mask_filter, filter_cache); if (this_rd == INT64_MAX) continue; @@ -3437,7 +3739,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // Cost the skip mb case rate2 += skip_cost1; - } else if (ref_frame != INTRA_FRAME && !xd->lossless) { + } else if (ref_frame != INTRA_FRAME && !xd->lossless && + !cpi->oxcf.sharpness) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0, distortion2) < RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) { @@ -3461,10 +3764,39 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } - // Apply an adjustment to the rd value based on the similarity of the - // source variance and reconstructed variance. - rd_variance_adjustment(cpi, x, bsize, &this_rd, ref_frame, - x->source_variance); + if (recon) { + // In film mode bias against DC pred and other intra if there is a + // significant difference between the variance of the sub blocks in the + // source. Also apply some bias against compound modes which also + // tend to blur fine texture such as film grain over time. + // + // The sub block test here acts in the case where one or more sub + // blocks have relatively high variance but others relatively low + // variance. 
Here the high variance sub blocks may push the + // total variance for the current block size over the thresholds + // used in rd_variance_adjustment() below. + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + if (max_energy > min_energy) { + if (ref_frame == INTRA_FRAME) { + if (this_mode == DC_PRED) + this_rd += (this_rd * (max_energy - min_energy)); + else + this_rd += (this_rd * (max_energy - min_energy)) / 4; + } else if (second_ref_frame > INTRA_FRAME) { + this_rd += this_rd / 4; + } + } + } + } + // Apply an adjustment to the rd value based on the similarity of the + // source variance and reconstructed variance. + rd_variance_adjustment(cpi, x, bsize, &this_rd, recon, ref_frame, + second_ref_frame, this_mode); + } if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd @@ -3616,9 +3948,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { - // If adaptive interp filter is enabled, then the current leaf node of 8x8 - // data is needed for sub8x8. Hence preserve the context. +// If adaptive interp filter is enabled, then the current leaf node of 8x8 +// data is needed for sub8x8. Hence preserve the context. +#if CONFIG_CONSISTENT_RECODE + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#else if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3894,7 +4230,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_BETTER_HW_COMPATIBILITY // forbid 8X4 and 4X8 partitions if any reference frame is scaled. if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { - int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); + int ref_scaled = ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); if (second_ref_frame > INTRA_FRAME) ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf); if (ref_scaled) continue; @@ -3940,6 +4277,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4418,3 +4760,4 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_filter_diff, 0); } +#endif // !CONFIG_REALTIME_ONLY diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.h b/libs/libvpx/vp9/encoder/vp9_rdopt.h index 795c91aef7..e1147ff943 100644 --- a/libs/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libs/libvpx/vp9/encoder/vp9_rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_RDOPT_H_ -#define VP9_ENCODER_VP9_RDOPT_H_ +#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_ +#define VPX_VP9_ENCODER_VP9_RDOPT_H_ #include "vp9/common/vp9_blockd.h" @@ -29,6 +29,7 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, @@ -39,21 +40,24 @@ void vp9_rd_pick_inter_mode_sb_seg_skip( struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +#endif int vp9_internal_image_edge(struct VP9_COMP *cpi); int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step); int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step); int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col); +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +#endif #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RDOPT_H_ +#endif // VPX_VP9_ENCODER_VP9_RDOPT_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_resize.c b/libs/libvpx/vp9/encoder/vp9_resize.c index f6c4aad4d3..7486dee25b 100644 --- a/libs/libvpx/vp9/encoder/vp9_resize.c +++ b/libs/libvpx/vp9/encoder/vp9_resize.c @@ -424,11 +424,11 @@ void vp9_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; - uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height); + uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf)); uint8_t *tmpbuf = - (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width)); - uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height); - uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2); + (uint8_t *)calloc(width < height ? height : width, sizeof(*tmpbuf)); + uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf)); + uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2)); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; assert(width > 0); @@ -506,10 +506,12 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; filter = interp_filters[sub_pel]; sum = 0; - for (k = 0; k < INTERP_TAPS; ++k) + for (k = 0; k < INTERP_TAPS; ++k) { + assert(int_pel - INTERP_TAPS / 2 + 1 + k < inlength); sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ? 0 : int_pel - INTERP_TAPS / 2 + 1 + k)]; + } *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // Middle part. 
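One detail of the vp9_resize_plane() hunk above worth calling out: the scratch buffers are now allocated with calloc(n, sizeof(*buf)) instead of malloc(sizeof(uint8_t) * n), so the element size stays tied to the pointer type and the memory starts zeroed, which keeps any rows a given scaling path never writes deterministic. A minimal sketch of the pattern with a hypothetical helper (not a libvpx function):

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the calloc-based scratch allocation used above; the error
 * handling mirrors the goto Error path in vp9_resize_plane(). */
static int alloc_resize_scratch(int width, int height, int width2,
                                uint8_t **intbuf, uint8_t **tmpbuf) {
  *intbuf = (uint8_t *)calloc((size_t)width2 * height, sizeof(**intbuf));
  *tmpbuf = (uint8_t *)calloc(width < height ? height : width,
                              sizeof(**tmpbuf));
  if (*intbuf == NULL || *tmpbuf == NULL) {
    free(*intbuf);
    free(*tmpbuf);
    *intbuf = NULL;
    *tmpbuf = NULL;
    return -1;
  }
  return 0;
}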
@@ -720,6 +722,10 @@ void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); diff --git a/libs/libvpx/vp9/encoder/vp9_resize.h b/libs/libvpx/vp9/encoder/vp9_resize.h index d3282ee191..5d4ce97eba 100644 --- a/libs/libvpx/vp9/encoder/vp9_resize.h +++ b/libs/libvpx/vp9/encoder/vp9_resize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RESIZE_H_ -#define VP9_ENCODER_VP9_RESIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_ +#define VPX_VP9_ENCODER_VP9_RESIZE_H_ #include <stdio.h> #include "vpx/vpx_integer.h" @@ -65,4 +65,4 @@ void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RESIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_RESIZE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_segmentation.c b/libs/libvpx/vp9/encoder/vp9_segmentation.c index 4a5a68e07a..a163297e6e 100644 --- a/libs/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libs/libvpx/vp9/encoder/vp9_segmentation.c @@ -9,6 +9,7 @@ */ #include <limits.h> +#include <math.h> #include "vpx_mem/vpx_mem.h" @@ -46,6 +47,59 @@ void vp9_clear_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = 0; } +void vp9_psnr_aq_mode_setup(struct segmentation *seg) { + int i; + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < MAX_SEGMENTS; ++i) { + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2))); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg) { + const VP9_COMMON *cm = &cpi->common; + const int seg_counts = cpi->kmeans_ctr_num; + const int base_qindex = cm->base_qindex; + const double base_qstep = vp9_convert_qindex_to_q(base_qindex, cm->bit_depth); + const double mid_ctr = cpi->kmeans_ctr_ls[seg_counts / 2]; + const double var_diff_scale = 4.0; + int i; + + assert(seg_counts <= MAX_SEGMENTS); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < seg_counts / 2; ++i) { + double wiener_var_diff = mid_ctr - cpi->kmeans_ctr_ls[i]; + double target_qstep = base_qstep / (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 0); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + + for (; i < seg_counts; ++i) { + double wiener_var_diff = cpi->kmeans_ctr_ls[i] - mid_ctr; + double target_qstep = base_qstep * (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + // Based on set of segment counts calculate a probability tree static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) { // 
Work out probabilities of each segment diff --git a/libs/libvpx/vp9/encoder/vp9_segmentation.h b/libs/libvpx/vp9/encoder/vp9_segmentation.h index 562805543b..9404c38bc8 100644 --- a/libs/libvpx/vp9/encoder/vp9_segmentation.h +++ b/libs/libvpx/vp9/encoder/vp9_segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SEGMENTATION_H_ -#define VP9_ENCODER_VP9_SEGMENTATION_H_ +#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ +#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_encoder.h" @@ -26,6 +26,11 @@ void vp9_disable_segfeature(struct segmentation *seg, int segment_id, void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); +void vp9_psnr_aq_mode_setup(struct segmentation *seg); + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg); + // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. // @@ -47,4 +52,4 @@ void vp9_reset_segment_features(struct segmentation *seg); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SEGMENTATION_H_ +#endif // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.h b/libs/libvpx/vp9/encoder/vp9_skin_detection.h index 8880bff466..46a722af9b 100644 --- a/libs/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_ -#define VP9_ENCODER_VP9_SKIN_MAP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ +#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ #include "vp9/common/vp9_blockd.h" #include "vpx_dsp/skin_detection.h" @@ -37,4 +37,4 @@ void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SKIN_MAP_H_ +#endif // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.c b/libs/libvpx/vp9/encoder/vp9_speed_features.c index a05db60c65..529dca0406 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.c @@ -20,6 +20,7 @@ static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }; +#if !CONFIG_REALTIME_ONLY // Define 3 mesh density levels to control the number of searches. 
#define MESH_DENSITY_LEVELS 3 static MESH_PATTERN @@ -32,7 +33,7 @@ static MESH_PATTERN // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const VP9_COMP *cpi) { - return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi); + return frame_is_kf_gf_arf(cpi); } // Sets a partition size down to which the auto partition code will always @@ -61,46 +62,92 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { VP9_COMMON *const cm = &cpi->common; + const int min_frame_size = VPXMIN(cm->width, cm->height); + const int is_480p_or_larger = min_frame_size >= 480; + const int is_720p_or_larger = min_frame_size >= 720; + const int is_1080p_or_larger = min_frame_size >= 1080; + const int is_2160p_or_larger = min_frame_size >= 2160; // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); sf->partition_search_breakout_thr.rate = 80; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; - // Currently, the machine-learning based partition search early termination - // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. - if (VPXMIN(cm->width, cm->height) >= 480) { - sf->ml_partition_search_early_termination = 1; + if (is_480p_or_larger) { + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. + sf->rd_ml_partition.search_early_termination = 1; + } else { + sf->use_square_only_thresh_high = BLOCK_32X32; } - if (speed >= 1) { - sf->ml_partition_search_early_termination = 0; - - if (VPXMIN(cm->width, cm->height) >= 720) { - sf->disable_split_mask = - cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_thr.dist = (1 << 23); + if (!is_1080p_or_larger) { + sf->rd_ml_partition.search_breakout = 1; + if (is_720p_or_larger) { + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = 0.0f; } else { - sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - sf->partition_search_breakout_thr.dist = (1 << 21); + sf->rd_ml_partition.search_breakout_thresh[0] = 2.5f; + sf->rd_ml_partition.search_breakout_thresh[1] = 1.5f; + sf->rd_ml_partition.search_breakout_thresh[2] = 1.5f; } } + if (speed >= 1) { + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 1; + if (is_480p_or_larger) + sf->use_square_only_thresh_high = BLOCK_64X64; + else + sf->use_square_only_thresh_high = BLOCK_32X32; + sf->use_square_only_thresh_low = BLOCK_16X16; + if (is_720p_or_larger) { + sf->disable_split_mask = + cm->show_frame ? 
DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->rd_ml_partition.search_breakout_thresh[0] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -9.0f; + } else { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + sf->partition_search_breakout_thr.dist = (1 << 21); + sf->rd_ml_partition.search_breakout_thresh[0] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -1.0f; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) { + sf->rd_ml_partition.search_breakout_thresh[0] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] -= 1.0f; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + if (speed >= 2) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->use_square_only_thresh_high = BLOCK_4X4; + sf->use_square_only_thresh_low = BLOCK_SIZES; + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; sf->partition_search_breakout_thr.dist = (1 << 24); sf->partition_search_breakout_thr.rate = 120; + sf->rd_ml_partition.search_breakout = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->partition_search_breakout_thr.dist = (1 << 22); sf->partition_search_breakout_thr.rate = 100; + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -4.0f; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); // Use a set of speed features for 4k videos. - if (VPXMIN(cm->width, cm->height) >= 2160) { + if (is_2160p_or_larger) { sf->use_square_partition_only = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; @@ -112,7 +159,8 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 3) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->rd_ml_partition.search_breakout = 0; + if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 
1 : 0; sf->partition_search_breakout_thr.dist = (1 << 25); @@ -137,7 +185,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, if (speed >= 4) { sf->partition_search_breakout_thr.rate = 300; - if (VPXMIN(cm->width, cm->height) >= 720) { + if (is_720p_or_larger) { sf->partition_search_breakout_thr.dist = (1 << 26); } else { sf->partition_search_breakout_thr.dist = (1 << 24); @@ -166,28 +214,41 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !frame_is_boosted(cpi); - sf->use_square_only_threshold = BLOCK_16X16; + sf->use_square_partition_only = !boosted; + sf->prune_ref_frame_for_rect_partitions = 1; + sf->rd_ml_partition.var_pruning = 1; + + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = 350; + sf->rd_ml_partition.prune_rect_thresh[2] = 325; + sf->rd_ml_partition.prune_rect_thresh[3] = 250; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { sf->exhaustive_searches_thresh = (1 << 22); - for (i = 0; i < MAX_MESH_STEP; ++i) { - int mesh_density_level = 0; - sf->mesh_patterns[i].range = - good_quality_mesh_patterns[mesh_density_level][i].range; - sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[mesh_density_level][i].interval; - } } else { sf->exhaustive_searches_thresh = INT_MAX; } + for (i = 0; i < MAX_MESH_STEP; ++i) { + const int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + if (speed >= 1) { + sf->temporal_filter_search_method = NSTEP; + sf->rd_ml_partition.var_pruning = !boosted; + sf->rd_ml_partition.prune_rect_thresh[1] = 225; + sf->rd_ml_partition.prune_rect_thresh[2] = 225; + sf->rd_ml_partition.prune_rect_thresh[3] = 225; + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); + sf->use_square_partition_only = !boosted; } else { sf->use_square_partition_only = !frame_is_intra_only(cm); } @@ -199,23 +260,22 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? 
speed : 5]; - - sf->use_square_only_threshold = BLOCK_4X4; sf->less_rectangular_check = 1; - sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->mv.subpel_iters_per_step = 1; - sf->mode_skip_start = 10; + sf->mv.subpel_search_level = 1; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) { + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 30; @@ -223,9 +283,11 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) : INT_MAX; + sf->use_accurate_subpel_search = USE_4_TAPS; } if (speed >= 2) { + sf->rd_ml_partition.var_pruning = 0; if (oxcf->vbr_corpus_complexity) sf->recode_loop = ALLOW_RECODE_FIRST; else @@ -247,6 +309,12 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + sf->enhanced_full_pixel_motion_search = 0; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->mv.subpel_search_level = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -257,6 +325,8 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, good_quality_mesh_patterns[mesh_density_level][i].interval; } } + + sf->use_accurate_subpel_search = USE_2_TAPS; } if (speed >= 3) { @@ -316,6 +386,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->simple_model_rd_from_var = 1; } } +#endif // !CONFIG_REALTIME_ONLY static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, @@ -358,6 +429,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, static void set_rt_speed_feature_framesize_independent( VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; const int is_keyframe = cm->frame_type == KEY_FRAME; const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; @@ -374,6 +446,16 @@ static void set_rt_speed_feature_framesize_independent( sf->use_compound_nonrd_pickmode = 0; sf->nonrd_keyframe = 0; sf->svc_use_lowres_part = 0; + sf->overshoot_detection_cbr_rt = NO_DETECTION; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; + sf->enable_tpl_model = 0; + sf->enhanced_full_pixel_motion_search = 0; + sf->use_accurate_subpel_search = USE_2_TAPS; + sf->nonrd_use_ml_partition = 0; + sf->variance_part_thresh_mult = 1; + sf->cb_pred_filter_search = 0; + sf->force_smooth_interpol = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -407,7 +489,7 @@ static void set_rt_speed_feature_framesize_independent( // Reference masking only enabled for 1 spatial layer, and if none of the // references have been scaled. 
The latter condition needs to be checked // for external or internal dynamic resize. - sf->reference_masking = (cpi->svc.number_spatial_layers == 1); + sf->reference_masking = (svc->number_spatial_layers == 1); if (sf->reference_masking == 1 && (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { @@ -440,7 +522,7 @@ static void set_rt_speed_feature_framesize_independent( sf->disable_filter_search_var_thresh = 100; sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; - sf->mv.subpel_iters_per_step = 1; + sf->mv.subpel_search_level = 0; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; sf->allow_skip_recode = 0; @@ -460,7 +542,7 @@ static void set_rt_speed_feature_framesize_independent( sf->adjust_partitioning_from_last_frame = cm->last_frame_type != cm->frame_type || (0 == (frames_since_key + 1) % sf->last_partitioning_redo_frequency); - sf->mv.subpel_force_stop = 1; + sf->mv.subpel_force_stop = QUARTER_PEL; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; sf->intra_uv_mode_mask[i] = INTRA_DC; @@ -513,7 +595,10 @@ static void set_rt_speed_feature_framesize_independent( int i; if (content == VP9E_CONTENT_SCREEN) { for (i = 0; i < BLOCK_SIZES; ++i) - sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; + if (i >= BLOCK_32X32) + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V; + else + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; } else { for (i = 0; i < BLOCK_SIZES; ++i) if (i > BLOCK_16X16) @@ -531,6 +616,23 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 1; if (!cpi->use_svc) sf->bias_golden = 1; } + // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent + // increase in encoding time. + if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1; + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR) { + if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) + sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ; + else + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; + } + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1; } if (speed >= 6) { @@ -539,8 +641,6 @@ static void set_rt_speed_feature_framesize_independent( sf->use_compound_nonrd_pickmode = 1; } sf->partition_search_type = VAR_BASED_PARTITION; - // Turn on this to use non-RD key frame coding mode. - sf->use_nonrd_pick_mode = 1; sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; @@ -553,7 +653,7 @@ static void set_rt_speed_feature_framesize_independent( (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + svc->spatial_layer_id == svc->number_spatial_layers - 1)) { cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); } @@ -562,11 +662,14 @@ static void set_rt_speed_feature_framesize_independent( // Enable short circuit for low temporal variance. 
sf->short_circuit_low_temp_var = 1; } - if (cpi->svc.temporal_layer_id > 0) { + if (svc->temporal_layer_id > 0) { sf->adaptive_rd_thresh = 4; sf->limit_newmv_early_exit = 0; sf->base_mv_aggressive = 1; } + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR) + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; } if (speed >= 7) { @@ -576,16 +679,15 @@ static void set_rt_speed_feature_framesize_independent( sf->mv.fullpel_search_step_param = 10; // For SVC: use better mv search on base temporal layer, and only // on base spatial layer if highest resolution is above 640x360. - if (cpi->svc.number_temporal_layers > 2 && - cpi->svc.temporal_layer_id == 0 && - (cpi->svc.spatial_layer_id == 0 || + if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 && + (svc->spatial_layer_id == 0 || cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { sf->mv.search_method = NSTEP; sf->mv.fullpel_search_step_param = 6; } - if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) { + if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) { sf->use_simple_block_yrd = 1; - if (cpi->svc.non_reference_frame) + if (svc->non_reference_frame) sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; } if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) @@ -596,22 +698,30 @@ static void set_rt_speed_feature_framesize_independent( if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && !cpi->external_resize && (!cpi->use_svc || - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + (svc->spatial_layer_id == svc->number_spatial_layers - 1 && + !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) { sf->copy_partition_flag = 1; cpi->max_copied_frame = 2; // The top temporal enhancement layer frames (for number of temporal layers > 1) // are non-reference frames, so use large/max value for max_copied_frame. - if (cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + if (svc->number_temporal_layers > 1 && + svc->temporal_layer_id == svc->number_temporal_layers - 1) cpi->max_copied_frame = 255; } // For SVC: enable use of lower resolution partition for higher resolution, // only for 3 spatial layers and when config/top resolution is above VGA. // Enable only for non-base temporal layer frames. - if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && - cpi->svc.temporal_layer_id > 0 && + if (cpi->use_svc && svc->use_partition_reuse && + svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 && cpi->oxcf.width * cpi->oxcf.height > 640 * 480) sf->svc_use_lowres_part = 1; + // For SVC when golden is used as second temporal reference: to avoid + // an encode time increase, only use this feature on the base temporal layer. + // (i.e. remove the golden flag from frame_flags for temporal_layer_id > 0). + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id > 0) + cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1; } if (speed >= 8) { @@ -621,9 +731,15 @@ static void set_rt_speed_feature_framesize_independent( if (!cpi->use_svc) cpi->max_copied_frame = 4; if (cpi->row_mt && cpi->oxcf.max_threads > 1) sf->adaptive_rd_thresh_row_mt = 1; - - if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; - if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + // Enable ML based partition for low res. 
+ if (!frame_is_intra_only(cm) && cm->width * cm->height <= 352 * 288) { + sf->nonrd_use_ml_partition = 1; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->nonrd_use_ml_partition = 0; +#endif + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = HALF_PEL; // Only keep INTRA_DC mode for speed 8. if (!is_keyframe) { int i = 0; @@ -651,7 +767,27 @@ static void set_rt_speed_feature_framesize_independent( } sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; + if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1; } + + if (speed >= 9) { + sf->cb_pred_filter_search = 1; + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; + sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + if (cpi->rc.avg_frame_low_motion < 70) sf->default_interp_filter = BILINEAR; + if (cm->width * cm->height >= 640 * 360) sf->variance_part_thresh_mult = 2; + } + + if (sf->nonrd_use_ml_partition) + sf->partition_search_type = ML_BASED_PARTITION; + if (sf->use_altref_onepass) { if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { sf->partition_search_type = FIXED_PARTITION; @@ -666,9 +802,26 @@ static void set_rt_speed_feature_framesize_independent( (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage)); } + if (svc->previous_frame_is_intra_only) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + // Special case for screen content: increase motion search on base spatial + // layer when high motion is detected or previous SL0 frame was dropped. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 && + (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) { + sf->mv.search_method = NSTEP; + // TODO(marpan/jianj): Tune this setting for screensharing. For now use + // small step_param for all spatial layers. + sf->mv.fullpel_search_step_param = 2; + } + // TODO(marpan): There is regression for aq-mode=3 speed <= 4, force it + // off for now. + if (speed <= 4 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + cpi->oxcf.aq_mode = 0; } -void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { +void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RD_OPT *const rd = &cpi->rd; @@ -678,13 +831,15 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { // Some speed-up features even for best quality as minimal impact on quality. 
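// The speed >= 9 block above selects different stopping precisions for
// small and large full-pel motion. A sketch of how the adapt fields could
// be consulted once the full-pel MV is known (assumed helper, not part of
// the patch):
static SUBPEL_FORCE_STOP adaptive_force_stop(const MV_SPEED_FEATURES *mv_sf,
                                             const MV *full_mv) {
  const int mag = VPXMAX(abs(full_mv->row), abs(full_mv->col));
  if (!mv_sf->enable_adaptive_subpel_force_stop)
    return mv_sf->subpel_force_stop;
  // Small motion keeps refining (force_stop_below = QUARTER_PEL at
  // speed 9); large motion stops earlier (force_stop_above = HALF_PEL).
  return mag < mv_sf->adapt_subpel_force_stop.mv_thresh
             ? mv_sf->adapt_subpel_force_stop.force_stop_below
             : mv_sf->adapt_subpel_force_stop.force_stop_above;
}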
sf->partition_search_breakout_thr.dist = (1 << 19); sf->partition_search_breakout_thr.rate = 80; - sf->ml_partition_search_early_termination = 0; + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 0; - if (oxcf->mode == REALTIME) { - set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } else if (oxcf->mode == GOOD) { - set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); +#if !CONFIG_REALTIME_ONLY + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_dependent(cpi, sf, speed); +#endif if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { sf->adaptive_pred_interp_filter = 0; @@ -710,17 +865,13 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; - - // This is only used in motion vector unit test. - if (cpi->oxcf.motion_vector_unit_test == 1) - cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; - else if (cpi->oxcf.motion_vector_unit_test == 2) - cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } -void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { +void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; +#if !CONFIG_REALTIME_ONLY VP9_COMMON *const cm = &cpi->common; +#endif MACROBLOCK *const x = &cpi->td.mb; const VP9EncoderConfig *const oxcf = &cpi->oxcf; int i; @@ -730,8 +881,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->mv.search_method = NSTEP; sf->recode_loop = ALLOW_RECODE_FIRST; sf->mv.subpel_search_method = SUBPEL_TREE; - sf->mv.subpel_iters_per_step = 2; - sf->mv.subpel_force_stop = 0; + sf->mv.subpel_search_level = 2; + sf->mv.subpel_force_stop = EIGHTH_PEL; sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); sf->mv.reduce_first_step_size = 0; sf->coeff_prob_appx_step = 1; @@ -741,6 +892,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; + sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; @@ -752,7 +904,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; - sf->use_square_only_threshold = BLOCK_SIZES; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_64X64; @@ -771,6 +924,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; sf->allow_acl = 1; + sf->enable_tpl_model = oxcf->enable_tpl_model; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->temporal_filter_search_method = MESH; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -804,10 +960,17 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->limit_newmv_early_exit = 0; sf->bias_golden = 0; sf->base_mv_aggressive = 0; + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + 
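// Both setters now receive the speed explicitly instead of reading
// oxcf->speed themselves, so a caller can evaluate the feature set for an
// arbitrary speed (sketch; wrapper name illustrative):
static void set_all_speed_features(VP9_COMP *cpi, int speed) {
  vp9_set_speed_features_framesize_independent(cpi, speed);
  vp9_set_speed_features_framesize_dependent(cpi, speed);
}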
sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->rd_ml_partition.var_pruning = 0; + sf->use_accurate_subpel_search = USE_8_TAPS; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; + sf->tx_size_search_depth = 2; sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) @@ -820,10 +983,11 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { } if (oxcf->mode == REALTIME) - set_rt_speed_feature_framesize_independent(cpi, sf, oxcf->speed, - oxcf->content); + set_rt_speed_feature_framesize_independent(cpi, sf, speed, oxcf->content); +#if !CONFIG_REALTIME_ONLY else if (oxcf->mode == GOOD) - set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed); + set_good_speed_feature_framesize_independent(cpi, cm, sf, speed); +#endif cpi->diamond_search_sad = vp9_diamond_search_sad; @@ -837,7 +1001,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - if (sf->mv.subpel_force_stop == 3) { + if (sf->mv.subpel_force_stop == FULL_PEL) { // Whole pel only cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree; } else if (sf->mv.subpel_search_method == SUBPEL_TREE) { @@ -850,6 +1014,12 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore; } + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; + x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; x->min_partition_size = sf->default_min_partition_size; @@ -867,10 +1037,4 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; - - // This is only used in motion vector unit test. - if (cpi->oxcf.motion_vector_unit_test == 1) - cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; - else if (cpi->oxcf.motion_vector_unit_test == 2) - cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.h b/libs/libvpx/vp9/encoder/vp9_speed_features.h index 50d52bc23a..eb06281990 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_ -#define VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ #include "vp9/common/vp9_enums.h" @@ -57,7 +57,8 @@ typedef enum { BIGDIA = 3, SQUARE = 4, FAST_HEX = 5, - FAST_DIAMOND = 6 + FAST_DIAMOND = 6, + MESH = 7 } SEARCH_METHODS; typedef enum { @@ -135,20 +136,23 @@ typedef enum { } INTERP_FILTER_MASK; typedef enum { - // Search partitions using RD/NONRD criterion + // Search partitions using RD/NONRD criterion. SEARCH_PARTITION, - // Always use a fixed size partition + // Always use a fixed size partition. FIXED_PARTITION, REFERENCE_PARTITION, // Use an arbitrary partitioning scheme based on source variance within - // a 64X64 SB + // a 64X64 SB. 
VAR_BASED_PARTITION, - // Use non-fixed partitions based on source variance - SOURCE_VAR_BASED_PARTITION + // Use non-fixed partitions based on source variance. + SOURCE_VAR_BASED_PARTITION, + + // Make partition decisions with machine learning models. + ML_BASED_PARTITION } PARTITION_SEARCH_TYPE; typedef enum { @@ -161,6 +165,19 @@ typedef enum { ONE_LOOP_REDUCED = 1 } FAST_COEFF_UPDATE; +typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP; + +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for the full pixel motion vector. + int mv_thresh; + + // subpel_force_stop if the full pixel MV is below the threshold. + SUBPEL_FORCE_STOP force_stop_below; + + // subpel_force_stop if the full pixel MV is equal to or above the threshold. + SUBPEL_FORCE_STOP force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -179,15 +196,17 @@ typedef struct MV_SPEED_FEATURES { // the same process. Along the way it skips many diagonals. SUBPEL_SEARCH_METHODS subpel_search_method; - // Maximum number of steps in logarithmic subpel search before giving up. - int subpel_iters_per_step; + // Subpel MV search level. Can take values 0 - 2. Higher values mean more + // extensive subpel search. + int subpel_search_level; - // Control when to stop subpel search: - // 0: Full subpel search. - // 1: Stop at quarter pixel. - // 2: Stop at half pixel. - // 3: Stop at full pixel. - int subpel_force_stop; + // When to stop the subpel motion search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // If enabled, a different subpel_force_stop is used depending on the MV. + int enable_adaptive_subpel_force_stop; + + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; @@ -205,6 +224,28 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; +typedef enum { + // No reaction to rate control on a detected slide/scene change. + NO_DETECTION = 0, + + // Set to larger Q (max_q set by user) based only on the + // detected slide/scene change and current/past Q. + FAST_DETECTION_MAXQ = 1, + + // Based on the (first pass) encoded frame: if a large frame size is detected + // then set a higher Q for the second re-encode. This involves 2-pass + // encoding on a slide change, so it is slower than mode 1, but more accurate + // for detecting overshoot. + RE_ENCODE_MAXQ = 2 +} OVERSHOOT_DETECTION_CBR_RT; + +typedef enum { + USE_2_TAPS = 0, + USE_4_TAPS, + USE_8_TAPS, + USE_8_TAPS_SHARP, +} SUBPEL_SEARCH_TYPE; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -258,6 +299,9 @@ typedef struct SPEED_FEATURES { // alternate reference frames. int allow_acl; + // Temporal dependency model based encoding mode optimization. + int enable_tpl_model; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -272,6 +316,9 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less // precise but significantly faster than the non lp version. int use_lp32x32fdct; @@ -293,9 +340,14 @@ typedef struct SPEED_FEATURES { // rd than partition type split.
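// The SUBPEL_FORCE_STOP values introduced above are ordered from the
// finest precision (EIGHTH_PEL) to the coarsest (FULL_PEL), so the number
// of precision-halving refinement rounds still permitted can be derived
// directly from the enum (sketch, assuming that ordering):
static int subpel_rounds_allowed(SUBPEL_FORCE_STOP stop) {
  // EIGHTH_PEL -> 3 rounds (1/2, 1/4, 1/8 pel); FULL_PEL -> none.
  return 3 - (int)stop;
}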
int less_rectangular_check; - // Disable testing non square partitions. (eg 16x32) + // Disable testing non square partitions (eg 16x32) for block sizes larger than + // use_square_only_thresh_high or smaller than use_square_only_thresh_low. int use_square_partition_only; - BLOCK_SIZE use_square_only_threshold; + BLOCK_SIZE use_square_only_thresh_high; + BLOCK_SIZE use_square_only_thresh_low; + + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. @@ -327,6 +379,9 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; + // Do an extra full pixel motion search to obtain a better motion vector. + int enhanced_full_pixel_motion_search; + // Threshold for allowing exhaustive motion search. int exhaustive_searches_thresh; @@ -448,8 +503,27 @@ typedef struct SPEED_FEATURES { // Partition search early breakout thresholds. PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; - // Machine-learning based partition search early termination - int ml_partition_search_early_termination; + struct { + // Use ML-based partition search early breakout. + int search_breakout; + // Higher values mean more aggressive partition search breakout, which + // results in better encoding speed but worse compression performance. + float search_breakout_thresh[3]; + + // Machine-learning based partition search early termination + int search_early_termination; + + // Machine-learning based partition search pruning using prediction residue + // variance. + int var_pruning; + + // Threshold values used for ML based rectangular partition search pruning. + // If < 0, the feature is turned off. + // Higher values mean more aggressive skipping of the rectangular partition + // search, which results in better encoding speed but worse coding + // performance. + int prune_rect_thresh[4]; + } rd_ml_partition; // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -508,15 +582,43 @@ typedef struct SPEED_FEATURES { // For SVC: enables use of partition from lower spatial resolution. int svc_use_lowres_part; + + // Flag to indicate the process for handling overshoot on a slide/scene + // change, for real-time CBR mode. + OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling the golden reference. + int disable_golden_ref; + + // Allow sub-pixel search to use interpolation filters with different taps in + // order to achieve an accurate motion search result. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Search method used by temporal filtering in full_pixel_motion_search. + SEARCH_METHODS temporal_filter_search_method; + + // Use machine learning based partition search. + int nonrd_use_ml_partition; + + // Multiplier for the base threshold for variance partitioning. + int variance_part_thresh_mult; + + // Force subpel motion filter to always use SMOOTH_FILTER.
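// prune_rect_thresh[] above is indexed by a block-size group, and a
// negative value disables the pruning. A sketch of the intended query
// (the grouping index and the model score are illustrative inputs
// produced elsewhere by the ML partition code):
static int prune_rect_partition(const SPEED_FEATURES *sf, int bsize_group,
                                int ml_score) {
  const int thresh = sf->rd_ml_partition.prune_rect_thresh[bsize_group];
  return thresh >= 0 && ml_score < thresh;
}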
+ int force_smooth_interpol; } SPEED_FEATURES; struct VP9_COMP; -void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi); -void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi); +void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi, + int speed); +void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi, + int speed); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#endif // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_subexp.c b/libs/libvpx/vp9/encoder/vp9_subexp.c index e8212ce05e..19bbd5373f 100644 --- a/libs/libvpx/vp9/encoder/vp9_subexp.c +++ b/libs/libvpx/vp9/encoder/vp9_subexp.c @@ -71,6 +71,7 @@ static int remap_prob(int v, int m) { else i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + assert(i >= 0 && (size_t)i < sizeof(map_table)); i = map_table[i]; return i; } diff --git a/libs/libvpx/vp9/encoder/vp9_subexp.h b/libs/libvpx/vp9/encoder/vp9_subexp.h index 26c89e2ea7..f0d544b527 100644 --- a/libs/libvpx/vp9/encoder/vp9_subexp.h +++ b/libs/libvpx/vp9/encoder/vp9_subexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SUBEXP_H_ -#define VP9_ENCODER_VP9_SUBEXP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_ +#define VPX_VP9_ENCODER_VP9_SUBEXP_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SUBEXP_H_ +#endif // VPX_VP9_ENCODER_VP9_SUBEXP_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2636bd9a58..8ba113bf3e 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -19,6 +19,14 @@ #define SMALL_FRAME_WIDTH 32 #define SMALL_FRAME_HEIGHT 16 +static void swap_ptr(void *a, void *b) { + void **a_p = (void **)a; + void **b_p = (void **)b; + void *c = *a_p; + *a_p = *b_p; + *b_p = c; +} + void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -29,24 +37,50 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; - svc->first_spatial_layer_to_encode = 0; - svc->rc_drop_superframe = 0; svc->force_zero_mode_spatial_ref = 0; svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + svc->use_gf_temporal_ref = 1; + svc->use_gf_temporal_ref_current_layer = 0; svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; svc->non_reference_frame = 0; + svc->skip_enhancement_layer = 0; + svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; + svc->framedrop_mode = CONSTRAINED_LAYER_DROP; + svc->set_intra_only_frame = 0; + svc->previous_frame_is_intra_only = 0; + svc->superframe_has_layer_sync = 0; + svc->use_set_ref_frame_config = 0; + svc->num_encoded_top_layer = 0; + svc->simulcast_mode = 0; - for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; - for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { - svc->ext_frame_flags[sl] = 0; - svc->ext_lst_fb_idx[sl] = 0; - svc->ext_gld_fb_idx[sl] = 1; - svc->ext_alt_fb_idx[sl] = 2; - svc->downsample_filter_type[sl] = EIGHTTAP; - svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. 
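// swap_ptr() above deliberately takes void * rather than void ** so a
// single helper can swap pointers of any object type without per-type
// casts at the call site; vp9_restore_layer_context() below uses it to
// replace the previous hand-written three-temporary swaps:
//   swap_ptr(&cr->map, &lc->map);
//   swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
//   swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv);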
+ for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = -1; + svc->fb_idx_temporal_layer_id[i] = -1; + svc->fb_idx_base[i] = 0; } + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + svc->last_layer_dropped[sl] = 0; + svc->drop_spatial_layer[sl] = 0; + svc->ext_frame_flags[sl] = 0; + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. + svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; + svc->spatial_layer_sync[sl] = 0; + } + svc->max_consec_drop = INT_MAX; + + svc->buffer_gf_temporal_ref[1].idx = 7; + svc->buffer_gf_temporal_ref[0].idx = 6; + svc->buffer_gf_temporal_ref[1].is_used = 0; + svc->buffer_gf_temporal_ref[0].is_used = 0; if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, @@ -84,6 +118,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->ni_frames = 0; lrc->decimation_count = 0; lrc->decimation_factor = 0; + lrc->worst_quality = oxcf->worst_allowed_q; + lrc->best_quality = oxcf->best_allowed_q; for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { lrc->rate_correction_factors[i] = 1.0; @@ -122,6 +158,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { size_t consec_zero_mv_size; VP9_COMMON *const cm = &cpi->common; lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; CHECK_MEM_ERROR(cm, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); @@ -154,6 +193,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { @@ -290,6 +331,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = get_layer_context(cpi); const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; + const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop; cpi->rc = lc->rc; cpi->twopass = lc->twopass; @@ -303,26 +345,23 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || - (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) { + cpi->svc.number_spatial_layers > 1) { cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; } - + cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. 
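// Buffer slots 7 and 6 are reserved above for the optional long-term
// (golden) temporal reference. Before enabling that feature per frame,
// vp9_one_pass_cbr_svc_start_layer() checks that neither slot is already
// referenced; as a predicate (helper name illustrative):
static int gf_temporal_ref_buffers_free(const SVC *svc) {
  return !svc->buffer_gf_temporal_ref[0].is_used &&
         !svc->buffer_gf_temporal_ref[1].is_used;
}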
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - signed char *temp = cr->map; - uint8_t *temp2 = cr->last_coded_q_map; - uint8_t *temp3 = cpi->consec_zero_mv; - cr->map = lc->map; - lc->map = temp; - cr->last_coded_q_map = lc->last_coded_q_map; - lc->last_coded_q_map = temp2; - cpi->consec_zero_mv = lc->consec_zero_mv; - lc->consec_zero_mv = temp3; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv); cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; } } @@ -350,6 +389,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->consec_zero_mv = cpi->consec_zero_mv; cpi->consec_zero_mv = temp3; lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; } } @@ -381,15 +423,6 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { ++cpi->svc.current_superframe; } -int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { - return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame; -} - void get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out) { @@ -408,6 +441,51 @@ void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +static void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + +// Never refresh any reference frame buffers on top temporal layers in +// simulcast mode, which has interlayer prediction disabled. 
+static void non_reference_frame_simulcast(VP9_COMP *const cpi) { + if (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 && + cpi->svc.temporal_layer_id > 0) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + } +} + // The function sets proper ref_frame_flags, buffer indices, and buffer update // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering // scheme. @@ -511,6 +589,10 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -546,6 +628,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { if (!spatial_id) { cpi->ref_frame_flags = VP9_LAST_FLAG; } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; } } @@ -568,6 +652,10 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -600,54 +688,174 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( } else { cpi->gld_fb_idx = 0; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); +} + +static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( + VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; + cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG; + if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG; + if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG; +} + +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to the + // update_buffer_slot, this is needed for the GET_SVC_REF_FRAME_CONFIG api. 
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + int ref; + for (ref = 0; ref < REF_FRAMES; ++ref) { + svc->update_buffer_slot[sl] &= ~(1 << ref); + if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) || + (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) || + (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame)) + svc->update_buffer_slot[sl] |= (1 << ref); + } + } + + // TODO(jianj): Remove these 3, deprecated. + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; + + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; + SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; - if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; - cpi->svc.force_zero_mode_spatial_ref = 1; - cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; + svc->skip_enhancement_layer = 0; - if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + if (svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF && + svc->number_spatial_layers > 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers <= 3 && + !(svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config)) + svc->simulcast_mode = 1; + else + svc->simulcast_mode = 0; + + if (svc->number_spatial_layers > 1) { + svc->use_base_mv = 1; + svc->use_partition_reuse = 1; + } + svc->force_zero_mode_spatial_ref = 1; + svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride; + svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows; + svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols; + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); - } else if (cpi->svc.temporal_layering_mode == + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_0101) { + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) { set_flags_and_fb_idx_for_temporal_mode2(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - // In the BYPASS/flexible mode, the encoder is relying on the application - // to specify, for each spatial layer, the flags and buffer indices for the - // layering. - // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is - // needed to support the case where the frame flags may be passed in via - // vpx_codec_encode(), which can be used for the temporal-only svc case. - // TODO(marpan): Consider adding an enc_config parameter to better handle - // this case. 
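// update_buffer_slot[sl] maintained above is a REF_FRAMES-wide bitmask:
// bit i set means spatial layer sl refreshes frame-buffer slot i. A
// sketch of the corresponding query (the patch manipulates the bits
// inline rather than through a helper):
static int buffer_slot_refreshed(const SVC *svc, int sl, int slot) {
  return (svc->update_buffer_slot[sl] >> slot) & 1;
}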
- if (cpi->ext_refresh_frame_flags_pending == 0) { - int sl; - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - sl = cpi->svc.spatial_layer_id; - vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]); - cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl]; - cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl]; - cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl]; + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi); + } + + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx) + svc->buffer_gf_temporal_ref[0].is_used = 1; + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx) + svc->buffer_gf_temporal_ref[1].is_used = 1; + + // For the fixed (non-flexible/bypass) SVC mode: + // If long term temporal reference is enabled at the sequence level + // (use_gf_temporal_ref == 1), and inter_layer is disabled (on inter-frames), + // we can use golden as a second temporal reference + // (since the spatial/inter-layer reference is disabled). + // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx) is + // unused (slot 7 and 6 should be available for 3-3 layer system). + // For now usage of this second temporal reference will only be used for + // highest and next to highest spatial layer (i.e., top and middle layer for + // 3 spatial layers). + svc->use_gf_temporal_ref_current_layer = 0; + if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used && + !svc->buffer_gf_temporal_ref[1].is_used && + svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON && + svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 && + svc->spatial_layer_id >= svc->number_spatial_layers - 2) { + // Enable the second (long-term) temporal reference at the frame-level. + svc->use_gf_temporal_ref_current_layer = 1; + } + + // Check if current superframe has any layer sync, only check once on + // base layer. + if (svc->spatial_layer_id == 0) { + int sl = 0; + // Default is no sync. + svc->superframe_has_layer_sync = 0; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1; } } - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) - cpi->svc.rc_drop_superframe = 0; + // Reset the drop flags for all spatial layers, on the base layer. + if (svc->spatial_layer_id == 0) { + vp9_zero(svc->drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); + // These are set by API before the superframe is encoded and they are + // passed to encoder layer by layer. Don't reset them on layer 0 in bypass + // mode. 
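// Layer contexts are stored in a flat array, one entry per
// (spatial, temporal) pair; the lookups below use the index
// sl * number_temporal_layers + tl, the same formula expressed by the
// LAYER_IDS_TO_IDX macro. As a helper (sketch; the code indexes inline):
static LAYER_CONTEXT *layer_ctx(SVC *svc, int sl, int tl) {
  return &svc->layer_context[sl * svc->number_temporal_layers + tl];
}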
+ vp9_zero(svc->update_buffer_slot); + vp9_zero(svc->reference_last); + vp9_zero(svc->reference_golden); + vp9_zero(svc->reference_altref); + // TODO(jianj): Remove these 3, deprecated. + vp9_zero(svc->update_last); + vp9_zero(svc->update_golden); + vp9_zero(svc->update_altref); + } + } - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS, // only for non-BYPASS mode for now. - if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS || + svc->use_set_ref_frame_config) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); @@ -657,35 +865,68 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); - // For resolutions <= VGA: set phase of the filter = 8 (for symmetric - // averaging filter), use bilinear for now. - if (width * height <= 640 * 480) { - cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; - } + // Use Eightap_smooth for low resolutions. + if (width * height <= 320 * 240) + svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; + // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel + // to source pixel). + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + if (lc->scaling_factor_num > (3 * lc->scaling_factor_den) >> 2) + svc->downsample_filter_phase[svc->spatial_layer_id] = 0; - // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use - // of base motion vectors if spatial scale factors for any layers are not 2, + // The usage of use_base_mv or partition_reuse assumes down-scale of 2x2. + // For now, turn off use of base motion vectors and partition reuse if the + // spatial scale factors for any layers are not 2, // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. - if (cpi->svc.number_spatial_layers > 1) { + if (svc->number_spatial_layers > 1) { int sl; - for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { - lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; + for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) { + lc = &svc->layer_context[sl * svc->number_temporal_layers + + svc->temporal_layer_id]; if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && - cpi->svc.number_spatial_layers == 3)) { - cpi->svc.use_base_mv = 0; + svc->number_spatial_layers == 3)) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; break; } } + // For non-zero spatial layers: if the previous spatial layer was dropped + // disable the base_mv and partition_reuse features. 
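// The scale-factor test in the loop below accepts only an exact 2:1
// ratio between consecutive spatial layers (plus the special 4:1 base
// layer case for 3 layers). The 2:1 predicate, as a sketch:
static int is_half_scale(const LAYER_CONTEXT *lc) {
  return lc->scaling_factor_num == (lc->scaling_factor_den >> 1);
}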
+ if (svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + } } - cpi->svc.non_reference_frame = 0; + svc->non_reference_frame = 0; if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && - !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { - cpi->svc.non_reference_frame = 1; + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) + svc->non_reference_frame = 1; + // For non-flexible mode, where update_buffer_slot is used, need to check if + // all buffer slots are not refreshed. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) + svc->non_reference_frame = 0; + } + + if (svc->spatial_layer_id == 0) { + svc->high_source_sad_superframe = 0; + svc->high_num_blocks_with_motion = 0; + } + + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->last_layer_dropped[svc->spatial_layer_id] && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). + cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id]; } if (vp9_set_size_literal(cpi, width, height) != 0) @@ -694,120 +935,6 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { return 0; } -#if CONFIG_SPATIAL_SVC -#define SMALL_FRAME_FB_IDX 7 - -int vp9_svc_start_frame(VP9_COMP *const cpi) { - int width = 0, height = 0; - LAYER_CONTEXT *lc; - struct lookahead_entry *buf; - int count = 1 << (cpi->svc.number_temporal_layers - 1); - - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - - cpi->svc.temporal_layer_id = 0; - while ((lc->current_video_frame_in_layer % count) != 0) { - ++cpi->svc.temporal_layer_id; - count >>= 1; - } - - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - - if (cpi->svc.spatial_layer_id == 0) - cpi->gld_fb_idx = - (lc->gold_ref_idx >= 0) ? 
lc->gold_ref_idx : cpi->lst_fb_idx; - else - cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; - - if (lc->current_video_frame_in_layer == 0) { - if (cpi->svc.spatial_layer_id >= 2) { - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - } else { - cpi->alt_fb_idx = cpi->lst_fb_idx; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); - } - } else { - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) { - cpi->alt_fb_idx = lc->alt_ref_idx; - if (!lc->has_alt_frame) cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else { - // Find a proper alt_fb_idx for layers that don't have alt ref frame - if (cpi->svc.spatial_layer_id == 0) { - cpi->alt_fb_idx = cpi->lst_fb_idx; - } else { - LAYER_CONTEXT *lc_lower = - &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1]; - - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] && - lc_lower->alt_ref_source != NULL) - cpi->alt_fb_idx = lc_lower->alt_ref_idx; - else if (cpi->svc.spatial_layer_id >= 2) - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - else - cpi->alt_fb_idx = cpi->lst_fb_idx; - } - } - } - - get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, - lc->scaling_factor_num, lc->scaling_factor_den, &width, - &height); - - // Workaround for multiple frame contexts. In some frames we can't use prev_mi - // since its previous frame could be changed during decoding time. The idea is - // we put a empty invisible frame in front of them, then we will not use - // prev_mi when encoding these frames. - - buf = vp9_lookahead_peek(cpi->lookahead, 0); - if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && - cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE && - lc->rc.frames_to_key != 0 && - !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) { - if ((cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || - (cpi->svc.number_spatial_layers > 1 && - cpi->svc.spatial_layer_id == 0)) { - struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0); - - if (buf != NULL) { - cpi->svc.empty_frame.ts_start = buf->ts_start; - cpi->svc.empty_frame.ts_end = buf->ts_end; - cpi->svc.encode_empty_frame_state = ENCODING; - cpi->common.show_frame = 0; - cpi->ref_frame_flags = 0; - cpi->common.frame_type = INTER_FRAME; - cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx = - SMALL_FRAME_FB_IDX; - - if (cpi->svc.encode_intra_empty_frame != 0) cpi->common.intra_only = 1; - - width = SMALL_FRAME_WIDTH; - height = SMALL_FRAME_HEIGHT; - } - } - } - - cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q); - cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q); - - vp9_change_config(cpi, &cpi->oxcf); - - if (vp9_set_size_literal(cpi, width, height) != 0) - return VPX_CODEC_INVALID_PARAM; - - vp9_set_high_precision_mv(cpi, 1); - - cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; - - return 0; -} - -#undef SMALL_FRAME_FB_IDX -#endif // CONFIG_SPATIAL_SVC - struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, struct lookahead_ctx *ctx, int drain) { @@ -840,7 +967,7 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) { } // Reset on key frame: reset counters, references and buffer updates. 
-void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { +void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) { int sl, tl; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; @@ -848,7 +975,7 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; lc->current_video_frame_in_layer = 0; - lc->frames_from_key_frame = 0; + if (is_key) lc->frames_from_key_frame = 0; } } if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { @@ -887,3 +1014,276 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { } } } + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + const int sl = svc->spatial_layer_id; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. + // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled + // on key frames or if any spatial layer is a sync layer. + if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && + !svc->layer_context[svc->temporal_layer_id].is_key_frame && + !svc->superframe_has_layer_sync) || + svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || + svc->drop_spatial_layer[sl - 1]) { + MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + cpi->ref_frame_flags &= (~flag_list[ref_frame]); + // Point golden/altref frame buffer index to last. + if (!svc->simulcast_mode) { + if (ref_frame == GOLDEN_FRAME) + cpi->gld_fb_idx = cpi->lst_fb_idx; + else if (ref_frame == ALTREF_FRAME) + cpi->alt_fb_idx = cpi->lst_fb_idx; + } + } + } + } + } + // For fixed/non-flexible SVC: check for disabling inter-layer prediction. + // If the reference for inter-layer prediction (the reference that is scaled) + // is not the previous spatial layer from the same superframe, then we disable + // inter-layer prediction. Only need to check when inter_layer prediction is + // not set to OFF mode. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) { + // We only use LAST and GOLDEN for prediction in real-time mode, so we + // check both here. + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) { + struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + // If this reference was updated on the previous spatial layer of the + // current superframe, then we keep this reference (don't disable). + // Otherwise we disable the inter-layer prediction. + // This condition is verified by checking if the current frame buffer + // index is equal to any of the slots for the previous spatial layer, + // and if so, check if that slot was updated/refreshed. If that is the + // case, then this reference is valid for inter-layer prediction under + // the mode INTER_LAYER_PRED_ON_CONSTRAINED. 
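// The constrained-prediction test below reduces to: a scaled reference
// held in buffer slot fb_idx stays enabled only if that slot was
// refreshed by spatial layer sl - 1 of the same superframe. As a
// predicate (helper name illustrative):
static int scaled_ref_valid(const SVC *svc, int sl, int fb_idx) {
  return (fb_idx == svc->lst_fb_idx[sl - 1] ||
          fb_idx == svc->gld_fb_idx[sl - 1] ||
          fb_idx == svc->alt_fb_idx[sl - 1]) &&
         (svc->update_buffer_slot[sl - 1] & (1 << fb_idx)) != 0;
}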
+ int fb_idx = + ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx; + int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG; + int disable = 1; + if (fb_idx < 0) continue; + if ((fb_idx == svc->lst_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->gld_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->alt_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx)))) + disable = 0; + if (disable) cpi->ref_frame_flags &= (~ref_flag); + } + } + } +} + +void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // For fixed/non-flexible mode, the following constraints are expected + // when inter-layer prediction is on (default). + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && + svc->framedrop_mode != LAYER_DROP) { + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On non-key frames: LAST is always the temporal reference, GOLDEN is + // the spatial reference. + if (svc->temporal_layer_id == 0) + // Base temporal only predicts from base temporal. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from a lower temporal layer. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Non-base spatial only predicts from the lower spatial layer with the + // same temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0 && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Only 1 reference for a frame whose base is key; the reference may be + // LAST or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } else if (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For the usage of golden as a second long term reference: the + // temporal_layer_id of that reference must be base temporal layer 0, and + // the spatial_layer_id of that reference must be the same as the current + // spatial_layer_id. If not, disable the feature. + // TODO(marpan): Investigate when this can happen, and maybe put this check + // and reset in a different place.
+ if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] != + svc->spatial_layer_id || + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0) + svc->use_gf_temporal_ref_current_layer = 0; + } +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(VP9_COMP *const cpi) { + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + return denoise_svc(cpi) && !lc->is_key_frame; +} +#endif + +void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Only for superframes whose base is not key, as those are + // already sync frames. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + if (svc->spatial_layer_id == 0) { + // On base spatial layer: if the current superframe has a layer sync then + // reset the pattern counters and reset to base temporal layer. + if (svc->superframe_has_layer_sync) + vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME); + } + // If the layer sync is set for this current spatial layer then + // disable the temporal reference. + if (svc->spatial_layer_id > 0 && + svc->spatial_layer_sync[svc->spatial_layer_id]) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + // If golden is used as second reference: need to remove it from + // prediction, reset refresh period to 0, and update the reference. + svc->use_gf_temporal_ref_current_layer = 0; + cpi->rc.baseline_gf_interval = 0; + cpi->rc.frames_till_gf_update_due = 0; + // On layer sync frame we must update the buffer index used for long + // term reference. Use the alt_ref since it is not used or updated on + // sync frames. + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + } +} + +void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Update the usage of frame buffer index for base spatial layers. + if (svc->spatial_layer_id == 0) { + if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame) + svc->fb_idx_base[cpi->lst_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame) + svc->fb_idx_base[cpi->gld_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame) + svc->fb_idx_base[cpi->alt_fb_idx] = 1; + } +} + +static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) { + // For non-flexible/bypass SVC mode: check for refreshing other buffer + // slots. 
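// In bypass mode every slot flagged in update_buffer_slot is re-pointed
// at the newly coded frame via ref_cnt_fb(), and key frames refresh all
// slots. The per-slot condition used by the loop below, as a sketch:
static int bypass_slot_refreshed(const VP9_COMMON *cm, const SVC *svc,
                                 int slot) {
  return cm->frame_type == KEY_FRAME ||
         (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << slot));
}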
+ SVC *const svc = &cpi->svc; + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + } + } +} + +void vp9_svc_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BufferPool *const pool = cm->buffer_pool; + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + vp9_svc_update_ref_frame_bypass_mode(cpi); + } else if (cm->frame_type == KEY_FRAME && !svc->simulcast_mode) { + // Keep track of frame index for each reference frame. + int i; + // On key frame update all reference frame slots. + for (i = 0; i < REF_FRAMES; i++) { + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + // LAST/GOLDEN/ALTREF is already updated above. + if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx) + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + } + } else { + if (cpi->refresh_last_frame) { + svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_golden_frame) { + svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_alt_ref_frame) { + svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id; + } + } + // Copy flags from encoder to SVC struct. + vp9_copy_flags_ref_update_idx(cpi); + vp9_svc_update_ref_frame_buffer_idx(cpi); +} + +void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) { + int64_t this_duration = + cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id]; + vp9_new_framerate(cpi, 10000000.0 / this_duration); +} + +void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + RATE_CONTROL *const rc = &cpi->rc; + // On key frames in CBR mode: reset the avg_frame_index for base layer + // (to level closer to worst_quality) if the overshoot is significant. + // Reset it for all temporal layers on base spatial layer. + if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + !svc->simulcast_mode && + rc->projected_frame_size > 3 * rc->avg_frame_bandwidth) { + int tl; + rc->avg_frame_qindex[INTER_FRAME] = + VPXMAX(rc->avg_frame_qindex[INTER_FRAME], + (cm->base_qindex + rc->worst_quality) >> 1); + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME]; + } + } +} diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h index b7cdfd9623..77d4382665 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ -#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ #include "vpx/vpx_encoder.h" @@ -19,6 +19,24 @@ extern "C" { #endif +typedef enum { + // Inter-layer prediction is on for all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off for all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off for non-key and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on for all frames, but constrained such + // that any layer S (> 0) can only predict from the previous spatial + // layer S-1, within the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +typedef struct BUFFER_LONGTERM_REF { + int idx; + int is_used; +} BUFFER_LONGTERM_REF; + typedef struct { RATE_CONTROL rc; int target_bandwidth; @@ -42,10 +60,14 @@ typedef struct { size_t layer_size; struct vpx_psnr_pkt psnr_pkt; // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct? int sb_index; signed char *map; uint8_t *last_coded_q_map; uint8_t *consec_zero_mv; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; uint8_t speed; } LAYER_CONTEXT; @@ -56,8 +78,6 @@ typedef struct SVC { int number_temporal_layers; int spatial_layer_to_encode; - int first_spatial_layer_to_encode; - int rc_drop_superframe; // Workaround for multiple frame contexts enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state; @@ -81,14 +101,20 @@ typedef struct SVC { // Frame flags and buffer indexes for each spatial layer, set by the // application (external settings). int ext_frame_flags[VPX_MAX_LAYERS]; - int ext_lst_fb_idx[VPX_MAX_LAYERS]; - int ext_gld_fb_idx[VPX_MAX_LAYERS]; - int ext_alt_fb_idx[VPX_MAX_LAYERS]; - int ref_frame_index[REF_FRAMES]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; int force_zero_mode_spatial_ref; + // Sequence level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref; + // Frame level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref_current_layer; + // Allow a second reference for at most the two highest resolution layers. + BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2]; int current_superframe; int non_reference_frame; int use_base_mv; + int use_partition_reuse; // Used to control the downscaling filter for source scaling, for 1 pass CBR. // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), // = 8 will center the target pixel and get a symmetric averaging filter. @@ -99,8 +125,73 @@ typedef struct SVC { BLOCK_SIZE *prev_partition_svc; int mi_stride[VPX_MAX_LAYERS]; + int mi_rows[VPX_MAX_LAYERS]; + int mi_cols[VPX_MAX_LAYERS]; int first_layer_denoise; + + int skip_enhancement_layer; + + int lower_layer_qindex; + + int last_layer_dropped[VPX_MAX_LAYERS]; + int drop_spatial_layer[VPX_MAX_LAYERS]; + int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int max_consec_drop; + SVC_LAYER_DROP_MODE framedrop_mode; + + INTER_LAYER_PRED disable_inter_layer_pred; + + // Flags to indicate a scene change and a high number of motion blocks in the + // current superframe. Scene detection is currently checked for each + // superframe prior to encoding, on the full-resolution source.
+ int high_source_sad_superframe; + int high_num_blocks_with_motion; + + // Flags used to get SVC pattern info. + int update_buffer_slot[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + // TODO(jianj): Remove these last 3, deprecated. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. + uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; + + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + uint8_t set_intra_only_frame; + uint8_t previous_frame_is_intra_only; + uint8_t superframe_has_layer_sync; + + uint8_t fb_idx_base[REF_FRAMES]; + + int use_set_ref_frame_config; + + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; + + int first_spatial_layer_to_encode; + + // Parameters for allowing framerate per spatial layer, and buffer + // update based on timestamps. + int64_t duration[VPX_SS_MAX_LAYERS]; + int64_t timebase_fac; + int64_t time_stamp_superframe; + int64_t time_stamp_prev[VPX_SS_MAX_LAYERS]; + + int num_encoded_top_layer; + + // Every spatial layer on a superframe whose base is key is key too. + int simulcast_mode; } SVC; struct VP9_COMP; @@ -148,16 +239,37 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); +#endif + +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); -void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key); void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + +void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_key_simulcast(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_ +#endif // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c index 2758c42aeb..701bb89287 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -34,57 +34,155 @@ #include "vpx_scale/vpx_scale.h" static int fixed_divide[512]; +static unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, + 39322, 32768, 28087, 24576, 21846, + 19661, 17874, 0, 15124 }; +#if CONFIG_VP9_HIGHBITDEPTH +static int64_t highbd_index_mult[14] = { 0U, 0U, 0U, + 0U, 3221225472U, 2576980378U, + 2147483648U, 1840700270U, 1610612736U, + 
1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; +#endif // CONFIG_VP9_HIGHBITDEPTH static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, - uint8_t *pred, struct scale_factors *scale, int x, int y) { + uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs, + int use_32x32) { const int which_mv = 0; - const MV mv = { mv_row, mv_col }; const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP]; + int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1); enum mv_precision mv_precision_uv; int uv_stride; - if (uv_block_width == 8) { + if (uv_block_width == (BW >> 1)) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } +#if !CONFIG_VP9_HIGHBITDEPTH + (void)xd; +#endif + if (use_32x32) { + const MV mv = { mv_row, mv_col }; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, - CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv, - scale, 16, 16, which_mv, kernel, - MV_PRECISION_Q3, x, y, xd->bd); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), BW, &mv, + scale, BW, BH, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, - CONVERT_TO_SHORTPTR(&pred[256]), - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, + y, xd->bd); - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, - CONVERT_TO_SHORTPTR(&pred[512]), - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv, + scale, uv_block_width, uv_block_height, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW, + BH, which_mv, kernel, MV_PRECISION_Q3, x, y); + + vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[BLK_PELS], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); return; } + + // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16 + // predictors. 
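The comment above introduces the split path: when use_32x32 is 0, the 32x32 predictor is assembled from four 16x16 sub-block predictions in raster order, so blk_mvs[0..3] correspond to the top-left, top-right, bottom-left, and bottom-right quadrants. A minimal standalone sketch of the offset arithmetic (not part of the patch; BW/BH/SUB_BW/SUB_BH mirror the macros added to vp9_temporal_filter.h, and the stride value is an arbitrary example):

#include <stdio.h>

#define BW 32
#define BH 32
#define SUB_BW 16
#define SUB_BH 16

int main(void) {
  const int stride = 128; /* example source-frame stride */
  int i, j, k = 0;
  for (i = 0; i < BH; i += SUB_BH) {
    for (j = 0; j < BW; j += SUB_BW) {
      /* blk_mvs[k] drives this sub-block; y_offset indexes the source
       * frame, p_offset the BW-wide prediction buffer, matching the
       * loop that follows in the patch. */
      printf("sub-block %d: y_offset %d, p_offset %d\n", k, i * stride + j,
             i * BW + j);
      ++k;
    }
  }
  return 0;
}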
+ // Y predictor + for (i = 0; i < BH; i += ys) { + for (j = 0; j < BW; j += xs) { + const MV mv = blk_mvs[k]; + const int y_offset = i * stride + j; + const int p_offset = i * BW + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride, + CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys, + which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd); + } else { + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); + } +#else + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); #endif // CONFIG_VP9_HIGHBITDEPTH - (void)xd; - vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - which_mv, kernel, MV_PRECISION_Q3, x, y); + k++; + } + } - vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); + // U and V predictors + ys = (uv_block_height >> 1); + xs = (uv_block_width >> 1); + k = 0; - vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); + for (i = 0; i < uv_block_height; i += ys) { + for (j = 0; j < uv_block_width; j += xs) { + const MV mv = blk_mvs[k]; + const int uv_offset = i * uv_stride + j; + const int p_offset = i * uv_block_width + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y, + xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]), + uv_block_width, &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + } else { + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); + } +#else + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, &mv, + scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); +#endif // CONFIG_VP9_HIGHBITDEPTH + k++; + } + } } void vp9_temporal_filter_init(void) { @@ -94,143 +192,372 @@ void vp9_temporal_filter_init(void) { for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; } -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, - const uint8_t *frame2, - unsigned int block_width, - unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, - uint16_t *count) { - unsigned int i, j, k; +static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] 
!= 0); + + mod = + ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int highbd_mod_index(int sum_dist, int index, int rounding, + int strength, int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(highbd_index_mult[index] != 0); + + mod = (int)((clamp(sum_dist, 0, INT32_MAX) * highbd_index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE int get_filter_weight(unsigned int i, unsigned int j, + unsigned int block_height, + unsigned int block_width, + const int *const blk_fw, int use_32x32) { + // blk_fw[0] ~ blk_fw[3] are the same. + if (use_32x32) { + return blk_fw[0]; + } + + if (i < block_height / 2) { + if (j < block_width / 2) { + return blk_fw[0]; + } + + return blk_fw[1]; + } + + if (j < block_width / 2) { + return blk_fw[2]; + } + + return blk_fw[3]; +} + +void vp9_apply_temporal_filter_c( + const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, + int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, + int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, + int uv_buf_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { + unsigned int i, j, k, m; int modifier; - int byte = 0; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + const int rounding = (1 << strength) >> 1; + const unsigned int uv_block_width = block_width >> ss_x; + const unsigned int uv_block_height = block_height >> ss_y; + DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]); + + int idx = 0, idy; assert(strength >= 0); assert(strength <= 6); - assert(filter_weight >= 0); - assert(filter_weight <= 2); + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; + // Calculate diff^2 for each pixel of the 32x32 block. + // TODO(yunqing): the following code needs to be optimized.
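The mod_index() helper defined above replaces the old modifier * 3 / index division with a fixed-point multiply: each nonzero index_mult[index] entry is ceil((3 << 16) / index), so (sum_dist * index_mult[index]) >> 16 matches sum_dist * 3 / index to within rounding, and the clamp to UINT16_MAX keeps the 32-bit product from overflowing (65535 * 49152 < 2^32). A standalone check of that identity (not part of the patch; the table is copied from the code above):

#include <stdio.h>

static const unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
                                             39322, 32768, 28087, 24576, 21846,
                                             19661, 17874, 0,     15124 };

int main(void) {
  const unsigned int sum_dist = 12345; /* example distortion sum <= UINT16_MAX */
  int index;
  for (index = 4; index <= 13; ++index) {
    if (index_mult[index] == 0) continue; /* neighbor counts never produced */
    printf("index %2d: fixed-point %u, exact %u\n", index,
           (sum_dist * index_mult[index]) >> 16, sum_dist * 3 / index);
  }
  return 0;
}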
+ for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int16_t diff = + y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j]; + y_diff_sse[idx++] = diff * diff; + } + } + idx = 0; + for (i = 0; i < uv_block_height; i++) { + for (j = 0; j < uv_block_width; j++) { + const int16_t diffu = + u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j]; + const int16_t diffv = + v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j]; + u_diff_sse[idx] = diffu * diffu; + v_diff_sse[idx] = diffv * diffv; + idx++; + } + } + + for (i = 0, k = 0, m = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = y_pred[i * y_buf_stride + j]; + const int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; + int y_index = 0; + + const int uv_r = i >> ss_y; + const int uv_c = j >> ss_x; + modifier = 0; for (idy = -1; idy <= 1; ++idy) { for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + const int row = (int)i + idy; + const int col = (int)j + idx; if (row >= 0 && row < (int)block_height && col >= 0 && col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; + modifier += y_diff_sse[row * (int)block_width + col]; + ++y_index; } } } - assert(index > 0); + assert(y_index > 0); - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + modifier += u_diff_sse[uv_r * uv_block_width + uv_c]; + modifier += v_diff_sse[uv_r * uv_block_width + uv_c]; - modifier *= 3; - modifier /= index; + y_index += 2; - ++frame2; + modifier = + mod_index(modifier, y_index, rounding, strength, filter_weight); - modifier += rounding; - modifier >>= strength; + y_count[k] += modifier; + y_accumulator[k] += modifier * pixel_value; - if (modifier > 16) modifier = 16; + ++k; - modifier = 16 - modifier; - modifier *= filter_weight; + // Process chroma component + if (!(i & ss_y) && !(j & ss_x)) { + const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c]; + const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c]; - count[k] += modifier; - accumulator[k] += modifier * pixel_value; + // non-local mean approach + int cr_index = 0; + int u_mod = 0, v_mod = 0; + int y_diff = 0; - byte++; + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = uv_r + idy; + const int col = uv_c + idx; + + if (row >= 0 && row < (int)uv_block_height && col >= 0 && + col < (int)uv_block_width) { + u_mod += u_diff_sse[row * uv_block_width + col]; + v_mod += v_diff_sse[row * uv_block_width + col]; + ++cr_index; + } + } + } + + assert(cr_index > 0); + + for (idy = 0; idy < 1 + ss_y; ++idy) { + for (idx = 0; idx < 1 + ss_x; ++idx) { + const int row = (uv_r << ss_y) + idy; + const int col = (uv_c << ss_x) + idx; + y_diff += y_diff_sse[row * (int)block_width + col]; + ++cr_index; + } + } + + u_mod += y_diff; + v_mod += y_diff; + + u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight); + v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight); + + u_count[m] += u_mod; + u_accumulator[m] += u_mod * u_pixel_value; + v_count[m] += v_mod; + v_accumulator[m] += v_mod * v_pixel_value; + + ++m; + } // Complete YUV pixel } - - byte += stride - block_width; } } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_temporal_filter_apply_c( - const uint8_t 
*frame1_8, unsigned int stride, const uint8_t *frame2_8, - unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, uint16_t *count) { - const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); - unsigned int i, j, k; - int modifier; - int byte = 0; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; +void vp9_highbd_apply_temporal_filter_c( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const int uv_block_width = block_width >> ss_x; + const int uv_block_height = block_height >> ss_y; + const int y_diff_stride = BW; + const int uv_diff_stride = BW; - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; + DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]); - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + const int rounding = (1 << strength) >> 1; - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; + // Loop variables + int row, col; + int uv_row, uv_col; + int row_step, col_step; + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + + // Get the square diffs + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_diff_sse[row * y_diff_stride + col] = diff * diff; + } + } + + for (row = 0; row < uv_block_height; row++) { + for (col = 0; col < uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = get_filter_weight( + row, col, block_height, block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += 
y_diff_sse[sub_row * y_diff_stride + sub_col]; + y_num_used++; } } } - assert(index > 0); - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; - modifier *= 3; - modifier /= index; + y_num_used += 2; - ++frame2; - modifier += rounding; - modifier >>= strength; + // Set the modifier + y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength, + filter_weight); - if (modifier > 16) modifier = 16; - - modifier = 16 - modifier; - modifier *= filter_weight; - - count[k] += modifier; - accumulator[k] += modifier * pixel_value; - - byte++; + // Accumulate the result + y_count[row * block_width + col] += y_mod; + y_accum[row * block_width + col] += y_mod * y_pixel; } + } - byte += stride - block_width; + // Apply the filter to chroma + for (uv_row = 0; uv_row < uv_block_height; uv_row++) { + for (uv_col = 0; uv_col < uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = get_filter_weight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chroma pixels to the chroma modifier + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current chroma pixel + for (row_step = 0; row_step < 1 + ss_y; row_step++) { + for (col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * uv_block_width + uv_col] += u_mod; + u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel; + v_count[uv_row * uv_block_width + uv_col] += v_mod; + v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel; + } } } #endif // CONFIG_VP9_HIGHBITDEPTH -static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - ThreadData *td, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride, MV *ref_mv) { +static uint32_t temporal_filter_find_matching_mb_c( + VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs, + int *blk_bestsme) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = HEX; + const SEARCH_METHODS search_method = MESH; + const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method; int step_param;
int sadpb = x->sadperbit16; uint32_t bestsme = UINT_MAX; @@ -245,6 +572,7 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; + int i, j, k = 0; best_ref_mv1_full.col = best_ref_mv1.col >> 3; best_ref_mv1_full.row = best_ref_mv1.row >> 3; @@ -260,19 +588,52 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, + vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, search_method, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, ref_mv, 0, 0); /* restore UMV window */ x->mv_limits = tmp_mv_limits; - // Ignore mv costing by sending NULL pointer instead of cost array + // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost + // calculation. The start full mv and the search result are stored in + // ref_mv. bestsme = cpi->find_fractional_mv_step( x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, - mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, 0, 0); + x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW, + BH, USE_8_TAPS_SHARP); + + // Do motion search on 4 16x16 sub-blocks. + best_ref_mv1.row = ref_mv->row; + best_ref_mv1.col = ref_mv->col; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + for (i = 0; i < BH; i += SUB_BH) { + for (j = 0; j < BW; j += SUB_BW) { + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf + i * stride + j; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j; + xd->plane[0].pre[0].stride = stride; + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, + step_param, search_method_16, sadpb, + cond_cost_list(cpi, cost_list), &best_ref_mv1, + &blk_mvs[k], 0, 0); + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + blk_bestsme[k] = cpi->find_fractional_mv_step( + x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0, + mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP); + k++; + } + } // Restore input state x->plane[0].src = src; @@ -293,25 +654,24 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, int byte; int frame; int mb_col; - unsigned int filter_weight; - int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; - int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); + int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2; + int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2; + DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]); MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t,
predictor16[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]); uint8_t *predictor; #else - DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]); #endif - const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; - const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + const int mb_uv_height = BH >> mbd->plane[1].subsampling_y; + const int mb_uv_width = BW >> mbd->plane[1].subsampling_x; // Addition of the tile col level offsets - int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start; + int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start; int mb_uv_offset = mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; @@ -334,21 +694,21 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // 8 - VP9_INTERP_EXTEND. // To keep the mv in play for both Y and UV planes the max that it // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). - td->mb.mv_limits.row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * VP9_INTERP_EXTEND); for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { int i, j, k; int stride; MV ref_mv; - vp9_zero_array(accumulator, 16 * 16 * 3); - vp9_zero_array(count, 16 * 16 * 3); + vp9_zero_array(accumulator, BLK_PELS * 3); + vp9_zero_array(count, BLK_PELS * 3); - td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * VP9_INTERP_EXTEND); if (cpi->oxcf.content == VP9E_CONTENT_FILM) { unsigned int src_variance; @@ -360,92 +720,130 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { src_variance = - vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); + vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd); } else { - src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); } #else - src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); #endif // CONFIG_VP9_HIGHBITDEPTH - if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + if (src_variance <= 2) { + strength = VPXMAX(0, arnr_filter_data->strength - 2); + } } for (frame = 0; frame < frame_count; frame++) { - const uint32_t thresh_low = 10000; - const uint32_t thresh_high = 20000; + // MVs for 4 16x16 sub blocks. + MV blk_mvs[4]; + // Filter weights for 4 16x16 sub blocks. 
+ int blk_fw[4] = { 0, 0, 0, 0 }; + int use_32x32 = 0; if (frames[frame] == NULL) continue; ref_mv.row = 0; ref_mv.col = 0; + blk_mvs[0] = kZeroMv; + blk_mvs[1] = kZeroMv; + blk_mvs[2] = kZeroMv; + blk_mvs[3] = kZeroMv; if (frame == alt_ref_index) { - filter_weight = 2; + blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2; + use_32x32 = 1; } else { + const int thresh_low = 10000; + const int thresh_high = 20000; + int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + // Find best match in this frame by MC - uint32_t err = temporal_filter_find_matching_mb_c( + int err = temporal_filter_find_matching_mb_c( cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, - &ref_mv); + &ref_mv, blk_mvs, blk_bestsme); - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; + int err16 = + blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3]; + int max_err = INT_MIN, min_err = INT_MAX; + for (k = 0; k < 4; k++) { + if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k]; + if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k]; + } + + if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) || + ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) { + use_32x32 = 1; + // Assign higher weight to matching MB if its error + // score is lower. If not applying MC, the default behavior + // is to weight all MBs equally. + blk_fw[0] = err < (thresh_low << THR_SHIFT) + ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; + } else { + use_32x32 = 0; + for (k = 0; k < 4; k++) + blk_fw[k] = blk_bestsme[k] < thresh_low + ? 2 + : blk_bestsme[k] < thresh_high ?
1 : 0; + } + + for (k = 0; k < 4; k++) { + switch (abs(frame - alt_ref_index)) { + case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break; + case 2: + case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break; + default: break; + } + } } - if (filter_weight != 0) { + if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) { // Construct the predictors temporal_filter_predictors_mb_c( mbd, frames[frame]->y_buffer + mb_y_offset, frames[frame]->u_buffer + mb_uv_offset, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); + mb_col * BW, mb_row * BH, blk_mvs, use_32x32); #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); + vp9_highbd_apply_temporal_filter( + CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride, + CONVERT_TO_SHORTPTR(predictor), BW, + CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset), + CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride, + CONVERT_TO_SHORTPTR(predictor + BLK_PELS), + CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW, + BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y, + adj_strength, blk_fw, use_32x32, accumulator, count, + accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 512, - count + 512); + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); } #else // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 512, - count + 512); + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, 
f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); #endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -459,8 +857,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1_16 = CONVERT_TO_SHORTPTR(dst1); stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -471,7 +869,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; @@ -480,9 +878,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst2_16 = CONVERT_TO_SHORTPTR(dst2); stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -507,8 +905,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -518,16 +916,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -552,8 +950,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -563,16 +961,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -592,7 +990,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, byte += stride - mb_uv_width; } #endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; + mb_y_offset 
+= BW; mb_uv_offset += mb_uv_width; } } @@ -603,10 +1001,10 @@ static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, const int tile_cols = 1 << cm->log2_tile_cols; TileInfo *tile_info = &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; - const int mb_row_start = (tile_info->mi_row_start) >> 1; - const int mb_row_end = (tile_info->mi_row_end + 1) >> 1; - const int mb_col_start = (tile_info->mi_col_start) >> 1; - const int mb_col_end = (tile_info->mi_col_end + 1) >> 1; + const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT; + const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT; + const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT; + const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT; int mb_row; for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { @@ -620,13 +1018,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; int tile_row, tile_col; - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; - vp9_init_tile_data(cpi); for (tile_row = 0; tile_row < tile_rows; ++tile_row) { @@ -634,15 +1025,13 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi) { temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); } } - - // Restore input state - for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; } // Apply buffer limits and context specific adjustments to arnr filter. static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, int *arnr_frames, int *arnr_strength) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const int frames_after_arf = vp9_lookahead_depth(cpi->lookahead) - distance - 1; int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; @@ -696,12 +1085,17 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, } // Adjustments for second level arf in multi arf case. - if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { - strength >>= 1; - } - } + // Leave a commented-out placeholder for a possible filtering adjustment + // with the new multi-layer arf code. + // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) + // if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1; + + // TODO(jingning): Skip temporal filtering for intermediate frames that will + // be used as show_existing_frame. Need to further explore the possibility of + // applying a certain filter. + if (gf_group->arf_src_offset[gf_group->index] < + cpi->rc.baseline_gf_interval - 1) + frames = 1; *arnr_frames = frames; *arnr_strength = strength; @@ -800,8 +1194,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { } // Initialize errorperbit and sabperbit.
- rdmult = (int)vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); - if (rdmult < 1) rdmult = 1; + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); set_error_per_bit(&cpi->td.mb, rdmult); vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h index 775e49cc53..553a468280 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -8,14 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -#define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { #endif #define ARNR_FILT_QINDEX 128 +static const MV kZeroMv = { 0, 0 }; + +// Block size used in temporal filtering +#define TF_BLOCK BLOCK_32X32 +#define BH 32 +#define BH_LOG2 5 +#define BW 32 +#define BW_LOG2 5 +#define BLK_PELS ((BH) * (BW)) // Pixels in the block +#define TF_SHIFT 2 +#define TF_ROUND 3 +#define THR_SHIFT 2 +#define TF_SUB_BLOCK BLOCK_16X16 +#define SUB_BH 16 +#define SUB_BW 16 void vp9_temporal_filter_init(void); void vp9_temporal_filter(VP9_COMP *cpi, int distance); @@ -28,4 +43,4 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#endif // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.h b/libs/libvpx/vp9/encoder/vp9_tokenize.h index b2f63ffef5..6407ff9237 100644 --- a/libs/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libs/libvpx/vp9/encoder/vp9_tokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TOKENIZE_H_ -#define VP9_ENCODER_VP9_TOKENIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_ +#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_ #include "vp9/common/vp9_entropy.h" @@ -127,4 +127,4 @@ static INLINE int vp9_get_token_cost(int v, int16_t *token, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TOKENIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_TOKENIZE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_treewriter.h b/libs/libvpx/vp9/encoder/vp9_treewriter.h index a8b9c2cd31..86c5fa2244 100644 --- a/libs/libvpx/vp9/encoder/vp9_treewriter.h +++ b/libs/libvpx/vp9/encoder/vp9_treewriter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TREEWRITER_H_ -#define VP9_ENCODER_VP9_TREEWRITER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_ +#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_ #include "vpx_dsp/bitwriter.h" @@ -48,4 +48,4 @@ static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TREEWRITER_H_ +#endif // VPX_VP9_ENCODER_VP9_TREEWRITER_H_ diff --git a/libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c new file mode 100644 index 0000000000..4fa24512c5 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/x86/temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 16-bit pixels +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu16_epi32(a_reg); + const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero); + const __m128i b_first = _mm_cvtepu16_epi32(b_reg); + const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi32(a_first, b_first); + dist_second = _mm_sub_epi32(a_second, b_second); + dist_first = _mm_mullo_epi32(dist_first, dist_first); + dist_second = _mm_mullo_epi32(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 4), dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)dist); + dist_left = _mm_loadu_si128((const __m128i *)(dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(dist + 1)); + + *sum = _mm_add_epi32(dist_reg, dist_left); + *sum = _mm_add_epi32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first, + __m128i *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from the y/uv planes contribute). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(__m128i *output, const __m128i *sum, + const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi32 uses the lower 64 bit value for the shift.
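On the shift-count convention that comment refers to: the non-immediate SSE2 shift intrinsics (the code below uses _mm_srl_epi32) read their shift amount from the low 64 bits of the count register, which is why the scalar strength is loaded into lane 0 via _mm_set_epi32(0, 0, 0, strength). A tiny standalone illustration (not part of the patch):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  const int strength = 4;
  /* The shift count lives in the low 64 bits of an XMM register. */
  const __m128i count = _mm_set_epi32(0, 0, 0, strength);
  const __m128i v = _mm_set1_epi32(256);
  const __m128i r = _mm_srl_epi32(v, count); /* every lane: 256 >> 4 = 16 */
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}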
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
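A scalar reference for the accumulate-and-store step defined below (a sketch under the same semantics, not part of the patch): for each of the 8 pixels, the per-pixel modifier is added to count[] -- the vector version uses a saturating 16-bit add -- and modifier * predictor is added to accumulator[]:

#include <stdint.h>
#include <stdio.h>

static void accumulate_and_store_scalar(const uint16_t *modifier,
                                        const uint16_t *pred, uint16_t *count,
                                        uint32_t *accumulator) {
  int i;
  for (i = 0; i < 8; ++i) {
    count[i] += modifier[i]; /* _mm_adds_epu16 saturates at UINT16_MAX */
    accumulator[i] += (uint32_t)modifier[i] * pred[i];
  }
}

int main(void) {
  uint16_t modifier[8] = { 32, 32, 32, 32, 32, 32, 32, 32 };
  uint16_t pred[8] = { 100, 101, 102, 103, 104, 105, 106, 107 };
  uint16_t count[8] = { 0 };
  uint32_t accumulator[8] = { 0 };
  accumulate_and_store_scalar(modifier, pred, count, accumulator);
  printf("count[0] = %u, accumulator[0] = %u\n", (unsigned)count[0],
         (unsigned)accumulator[0]);
  return 0;
}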
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+ highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 4 entries + __m128i u_reg, v_reg; + + highbd_read_dist_4(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi32(u_reg, u_reg); + *u_second = _mm_unpackhi_epi32(u_reg, u_reg); + + highbd_read_dist_4(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi32(v_reg, v_reg); + *v_second = _mm_unpackhi_epi32(v_reg, v_reg); + } +} + +static void vp9_highbd_apply_temporal_filter_luma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist, + const uint32_t *v_dist, const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } +
// Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
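+// The block is processed in 8-pixel-wide columns: the left-most column uses
+// the LEFT neighbor tables (its edge pixel has one horizontal neighbor
+// fewer), interior columns use the MIDDLE tables, and the right-most column
+// uses the RIGHT tables. The subblock weights switch from blk_fw[0]/blk_fw[2]
+// to blk_fw[1]/blk_fw[3] at the horizontal midpoint unless use_whole_blk is
+// set, and within each column the weight moves from the top to the bottom
+// value at the vertical midpoint.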
+static void vp9_highbd_apply_temporal_filter_luma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
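+// With ss_x == 1, luma columns (2 * i, 2 * i + 1) both map to chroma column
+// i, so pairs of adjacent 32-bit luma sums are folded together with
+// _mm_hadd_epi32() before being added to the chroma modifiers.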
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_highbd_apply_temporal_filter_chroma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + 
v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + 
highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. 
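+// The neighbor-count tables are picked from the chroma subsampling mode; for
+// example, for the left column:
+//   ss_x && ss_y -> HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS (PLUS_4)
+//   ss_x || ss_y -> HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS (PLUS_2)
+//   otherwise    -> HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS (PLUS_1)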
+static void vp9_highbd_apply_temporal_filter_chroma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += 
uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_sse4_1( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && 
"block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_highbd_apply_temporal_filter_luma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + vp9_highbd_apply_temporal_filter_chroma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h b/libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h new file mode 100644 index 0000000000..7dcedda192 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#include "./vpx_config.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. 
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16,
static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, 
MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, 
HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + 
HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 + }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define DIST_STRIDE ((BW) + 2) + +#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c index 460dab6593..437f49f5a0 100644 --- a/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c +++ b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -14,96 +14,58 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/x86/temporal_filter_constants.h" -// Division using multiplication and shifting. The C implementation does: -// modifier *= 3; -// modifier /= index; -// where 'modifier' is a set of summed values and 'index' is the number of -// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values -// which may be bound by the edges of the block being filtered. -// -// This equation works out to (m * 3) / i which reduces to: -// m * 3/4 -// m * 1/2 -// m * 1/3 -// -// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): -// m * C / 65536 -// we can create a C to replicate the division. -// -// m * 49152 / 65536 = m * 3/4 -// m * 32758 / 65536 = m * 1/2 -// m * 21846 / 65536 = m * 0.3333 -// -// These are loaded using an instruction expecting int16_t values but are used -// with _mm_mulhi_epu16(), which treats them as unsigned. -#define NEIGHBOR_CONSTANT_4 (int16_t)49152 -#define NEIGHBOR_CONSTANT_6 (int16_t)32768 -#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); -// Load values from 'a' and 'b'. Compute the difference squared and sum -// neighboring values such that: -// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 -// Values to the left and right of the row are set to 0. -// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. 
-static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { - const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); - const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); - const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); + __m128i dist_first; - const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); - const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); - // Shift all the values one place to the left/right so we can efficiently sum - // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. - const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); - const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); - - // It becomes necessary to treat the values as unsigned at this point. The - // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point - // forward since the filter is only applied to smooth small pixel changes. - // Once the value has saturated to uint16_t it is well outside the useful - // range. - __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum = sum_u16; + _mm_storeu_si128((__m128i *)dst, dist_first); } -static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, - __m128i *sum_1) { +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { const __m128i zero = _mm_setzero_si128(); - const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); - const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); - const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); - const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); - const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); - const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); - const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); - const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); - const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); + __m128i dist_first, dist_second; - __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); - // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. 
- __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); - __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} - *sum_0 = sum_u16; +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} - shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); - shift_right = _mm_srli_si128(diff_sq_1_u16, 2); - - sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum_1 = sum_u16; +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); } // Average the value based on the number of values summed (9 for pixels away @@ -111,17 +73,17 @@ static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, // // Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply // by weight. -static __m128i average_8(__m128i sum, const __m128i mul_constants, - const int strength, const int rounding, - const int weight) { +static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const __m128i *weight) { // _mm_srl_epi16 uses the lower 64 bit value for the shift. const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i weight_u16 = *weight; const __m128i sixteen = _mm_set1_epi16(16); // modifier * 3 / index; - sum = _mm_mulhi_epu16(sum, mul_constants); + sum = _mm_mulhi_epu16(sum, *mul_constants); sum = _mm_adds_epu16(sum, rounding_u16); sum = _mm_srl_epi16(sum, strength_u128); @@ -136,34 +98,6 @@ static __m128i average_8(__m128i sum, const __m128i mul_constants, return _mm_mullo_epi16(sum, weight_u16); } -static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, - const __m128i mul_constants_0, - const __m128i mul_constants_1, const int strength, - const int rounding, const int weight) { - const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); - const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = _mm_set1_epi16(weight); - const __m128i sixteen = _mm_set1_epi16(16); - __m128i input_0, input_1; - - input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); - input_0 = _mm_adds_epu16(input_0, rounding_u16); - - input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); - input_1 = _mm_adds_epu16(input_1, rounding_u16); - - input_0 = _mm_srl_epi16(input_0, strength_u128); - input_1 = _mm_srl_epi16(input_1, strength_u128); - - input_0 = _mm_min_epu16(input_0, sixteen); - input_1 = _mm_min_epu16(input_1, sixteen); - input_0 = _mm_sub_epi16(sixteen, input_0); - input_1 = _mm_sub_epi16(sixteen, input_1); - - *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); - *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); -} - // Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
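 // Per pixel this computes count[i] += sum_u16[i] with 16-bit saturation, and
 // accumulator[i] += sum_u16[i] * pred[i] with the products widened to 32 bits.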
 static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                    uint16_t *count, uint32_t *accumulator) {
@@ -192,10 +126,10 @@ static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
   _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
 }
 
-static void accumulate_and_store_16(const __m128i sum_0_u16,
-                                    const __m128i sum_1_u16,
-                                    const uint8_t *pred, uint16_t *count,
-                                    uint32_t *accumulator) {
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+                                           const __m128i sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
   const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
   const __m128i zero = _mm_setzero_si128();
   __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
@@ -235,142 +169,768 @@ static void accumulate_and_store_16(const __m128i sum_0_u16,
   _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
 }
 
-void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
-                                      const uint8_t *b, unsigned int width,
-                                      unsigned int height, int strength,
-                                      int weight, uint32_t *accumulator,
-                                      uint16_t *count) {
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
+
+  *sum = _mm_adds_epu16(dist_reg, dist_left);
+  *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for the first 8 pixels in
+// sum_first and the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+                              __m128i *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           __m128i *u_first, __m128i *u_second,
+                                           __m128i *v_first,
+                                           __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+    read_dist_16(u_dist, u_first, u_second);
+    read_dist_16(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    __m128i u_reg, v_reg;
+
+    read_dist_8(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+    read_dist_8(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+  }
+}
+
+// Horizontally add unsigned 16-bit ints in src and store them as signed
+// 32-bit ints in dst.
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift_right = _mm_srli_si128(*src, 2);
+
+  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
+  const __m128i even = _mm_blend_epi16(*src, zero, 170);
+
+  *dst = _mm_add_epi32(even, odd);
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
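+// As in the high-bitdepth path above, ss_x == 1 folds the 16 luma sums down
+// to 8 with pairwise horizontal adds; here the sums are 16-bit, so
+// hadd_epu16() followed by _mm_packus_epi32() stands in for _mm_hadd_epi32().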
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_luma_16( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, + const uint16_t *v_dist, const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + __m128i weight_first, weight_second; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables unsigned int h; - const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; assert(strength >= 0); assert(strength <= 6); - assert(weight >= 0); - assert(weight <= 2); + assert(block_width == 16); - assert(width == 8 || width == 16); + (void)block_width; - if (width == 8) { - __m128i sum_row_a, sum_row_b, sum_row_c; - __m128i mul_constants = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_8(a, b, &sum_row_a); - sum_8(a + stride, b + width, &sum_row_b); - sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_c, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - - for (h = 0; h < height - 2; ++h) { - sum_8(a, b + width, &sum_row_c); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); - sum_row_a = - average_8(sum_row_a, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a = sum_row_b; - sum_row_b = sum_row_c; - } - - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - } else { // width == 16 - __m128i sum_row_a_0, sum_row_a_1; - __m128i sum_row_b_0, sum_row_b_1; - __m128i sum_row_c_0, sum_row_c_1; - __m128i mul_constants_0 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), - mul_constants_1 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_16(a, b, &sum_row_a_0, &sum_row_a_1); - sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); - - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - for (h = 0; h < height - 2; ++h) { - sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); - - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_a_0 = 
_mm_adds_epu16(sum_row_a_0, sum_row_c_0); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); - - average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a_0 = sum_row_b_0; - sum_row_a_1 = sum_row_b_1; - sum_row_b_0 = sum_row_c_0; - sum_row_b_1 = sum_row_c_1; - } - - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + // Initialize the weights + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[0]); + weight_second = _mm_set1_epi16(blk_fw[1]); + } else { + weight_first = _mm_set1_epi16(top_weight); + weight_second = weight_first; } + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[2]); + weight_second = _mm_set1_epi16(blk_fw[3]); + } else { + weight_first = _mm_set1_epi16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + 
sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
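For orientation before the driver below: the 16-wide kernel above keeps three running row sums of squared differences and shifts them down one row per iteration, so each output pixel effectively sees an edge-clamped 3x3 luma window plus the co-located chroma distortions. A scalar model of the per-pixel weight it derives (hypothetical names, illustrative only; the NEIGHBOR_CONSTANT_* tables fold the 3/num normalization into a single 16-bit multiply) might look like this:

#include <stdint.h>

/* Illustrative scalar model: more accumulated squared difference in the
 * window means a smaller blending weight for the predictor. */
static int pixel_modifier_model(uint32_t window_sum_sq_diff, int num_neighbors,
                                int strength, int filter_weight) {
  const int rounding = (1 << strength) >> 1;
  int mod = (int)(window_sum_sq_diff * 3 / num_neighbors);
  mod = (mod + rounding) >> strength;
  if (mod > 16) mod = 16;
  mod = 16 - mod;             /* large distortion => small weight */
  return mod * filter_weight; /* added to y_count; times the predictor into y_accum */
}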
+static void vp9_apply_temporal_filter_luma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist, + const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_chroma_8( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + __m128i weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // Initialize weight + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], + blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); + } else { + weight = _mm_set1_epi16(top_weight); + } + + // First row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows
except the last one + mul = _mm_load_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2], + blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]); + } else { + weight = _mm_set1_epi16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_apply_temporal_filter_chroma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? 
blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, 
v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr =
y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_apply_temporal_filter_luma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + vp9_apply_temporal_filter_chroma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index dbd243ac10..2188903b17 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -14,6 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" @@ -170,452 +171,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, fadst4_sse2(in); write_buffer_4x4(output, in); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: assert(0); break; - } -} - -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - __m128i zero; - int pass; - - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
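As a hedged aside (not part of the patch): pair_set_epi16(a, b) produces lanes a, b, a, b, ..., and feeding interleaved inputs through _mm_madd_epi16 evaluates one a*x + b*y butterfly per 32-bit lane, which is exactly how the constants below are consumed. A minimal sketch of the idiom:

#include <emmintrin.h>
#include <stdint.h>

/* One DCT butterfly stage on the low half of two vectors; the real code
 * repeats this with _mm_unpackhi_epi16 for the high half, then rounds,
 * shifts by DCT_CONST_BITS, and packs back to 16 bits. */
static __m128i butterfly_lo(__m128i x, __m128i y, int16_t a, int16_t b) {
  const __m128i k = _mm_set_epi16(b, a, b, a, b, a, b, a); /* a,b repeated */
  const __m128i lo = _mm_unpacklo_epi16(x, y);             /* x0 y0 x1 y1 ... */
  return _mm_madd_epi16(lo, k); /* a*x0 + b*y0, a*x1 + b*y1, ... */
}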
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - __m128i *in[8]; - int index = 0; - - (void)scan_ptr; - (void)coeff_ptr; - - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - in[0] = &in0; - in[1] = &in1; - in[2] = &in2; - in[3] = &in3; - in[4] = &in4; - in[5] = &in5; - in[6] = &in6; - in[7] = &in7; - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. 
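The pass loop above runs the same 1-D column kernel twice, transposing between passes so the rows can be processed as columns; schematically (a scalar sketch with hypothetical names, not the patch's code):

#include <stdint.h>

/* Separable 2-D transform: transform columns, transpose, transform the
 * original rows (now columns), transpose again to restore row order. */
static void fdct_2d_8x8_sketch(int16_t m[8][8],
                               void (*fdct_cols)(int16_t mm[8][8])) {
  int pass, i, j;
  for (pass = 0; pass < 2; ++pass) {
    fdct_cols(m); /* 1-D transform down each column */
    for (i = 0; i < 8; ++i) { /* in-place transpose */
      for (j = i + 1; j < 8; ++j) {
        const int16_t t = m[i][j];
        m[i][j] = m[j][i];
        m[j][i] = t;
      }
    }
  }
}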
- __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = 
_mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. 
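After the transpose block below, the kernel post-conditions each signed 16-bit output by halving it with shifts rather than a divide. The identity it relies on, n / 2 == (n - (n >> 15)) >> 1 for int16_t, can be checked exhaustively:

#include <assert.h>
#include <stdint.h>

/* n >> 15 is 0 for non-negative n and -1 for negative n, so negative
 * inputs get a +1 bias before the arithmetic shift, turning floor
 * division into C's truncating (round-toward-zero) division by two. */
static int16_t halve_trunc(int16_t n) {
  return (int16_t)((n - (n >> 15)) >> 1);
}

int main(void) {
  int32_t n;
  for (n = -32768; n <= 32767; ++n) assert(halve_trunc((int16_t)n) == n / 2);
  return 0;
}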
- { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - } - - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - 
__m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = *in[0]; - coeff1 = *in[1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - index = 2; - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); - coeff0 = *in[index]; - coeff1 = *in[index + 1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = 
_mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - index += 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; } } @@ -1097,14 +659,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: assert(0); break; } } @@ -1963,13 +1525,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: assert(0); break; } } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c deleted file mode 100644 index bf874a09ec..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> -#include <tmmintrin.h> // SSSE3 - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - int pass; - - // Constants - // When we use them, in one case, they are all the same. In all others - it's a pair of them that we need to repeat four times. This is done - by constructing the 32 bit constant corresponding to that pair. - const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - __m128i *in[8]; - int index = 0; - - (void)scan_ptr; - (void)coeff_ptr; - - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - in[0] = &in0; - in[1] = &in1; - in[2] = &in2; - in[3] = &in3; - in[4] = &in4; - in[5] = &in5; - in[6] = &in6; - in[7] = &in7; - - // We do two passes, first the columns, then the rows. The results of the - first pass are transposed so that the same column code can be reused. The - results of the second pass are also transposed so that the rows (processed - as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose.
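What set this SSSE3 variant apart from the SSE2 version shows up a few lines below: the cos(pi/4) butterfly uses _mm_mulhrs_epi16 with 23170 (approximately sqrt(1/2) * 32768) instead of widening to 32 bits, since _mm_mulhrs_epi16 computes the rounded high product (x * k + (1 << 14)) >> 15 per 16-bit lane. A minimal sketch:

#include <tmmintrin.h> /* SSSE3 */

/* Rounded multiply by sqrt(1/2) entirely in 16-bit lanes, replacing the
 * unpack / madd / round / shift / pack sequence the SSE2 path needs. */
static __m128i mul_inv_sqrt2(__m128i x) {
  return _mm_mulhrs_epi16(x, _mm_set1_epi16(23170));
}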
- __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_sub_epi16(q6, q5); - const __m128i d1 = _mm_add_epi16(q6, q5); - const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); - const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); - - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = 
_mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 
65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - } - - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant, thr; - int16_t nzflag; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = *in[0]; - coeff1 = *in[1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, 
nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - index = 2; - thr = _mm_srai_epi16(dequant, 1); - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); - coeff0 = *in[index]; - coeff1 = *in[index + 1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - // Maybe a more efficient way to store 0? - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - index += 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; - } -} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c 
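
Per coefficient, the fp quantizer above reduces to the following scalar model; this is a sketch only, omitting the saturating adds, the separate DC/AC constants, and the 16-bit wraparound of the SIMD lanes:

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of vp9_quantize_fp: |c| is rounded, scaled by the Q16
 * reciprocal (keeping the high word, as _mm_mulhi_epi16 does),
 * re-signed, and multiplied back by dequant. eob is the largest iscan
 * index of a nonzero output plus one -- the "indices to counts" step --
 * and the thr = dequant >> 1 early-out works because any magnitude at
 * most dequant / 2 quantizes to zero anyway. */
static int quantize_fp_scalar(const int16_t *coeff, int n, int round,
                              int quant, int dequant, const int16_t *iscan,
                              int16_t *qcoeff, int16_t *dqcoeff) {
  int i, eob = 0;
  for (i = 0; i < n; i++) {
    const int abs_c = abs(coeff[i]);
    int q = 0;
    if (abs_c > dequant / 2) q = ((abs_c + round) * quant) >> 16;
    qcoeff[i] = (int16_t)(coeff[i] < 0 ? -q : q);
    dqcoeff[i] = (int16_t)(qcoeff[i] * dequant);
    if (q != 0 && iscan[i] + 1 > eob) eob = iscan[i] + 1;
  }
  return eob;
}
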
b/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 2f3c66c083..aa46c5889d 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -160,7 +160,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, } // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); + v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); // Shift right to keep the sign bit clear, we will use this later // to set the cost to the maximum value. v_outside_d = _mm_srli_epi32(v_outside_d, 1); diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c index 91f627c343..d7aafe7b01 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -11,27 +11,28 @@ #include <emmintrin.h> #include <stdio.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { int i, j, test; uint32_t temp[4]; __m128i max, min, cmp0, cmp1, cmp2, cmp3; int64_t error = 0, sqcoeff = 0; - const int shift = 2 * (bps - 8); + const int shift = 2 * (bd - 8); const int rounding = shift > 0 ? 1 << (shift - 1) : 0; for (i = 0; i < block_size; i += 8) { // Load the data into xmm registers - __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); - __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); - __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); - __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4)); // Check if any values require more than 15 bits max = _mm_set1_epi32(0x3fff); - min = _mm_set1_epi32(0xffffc000); + min = _mm_set1_epi32((int32_t)0xffffc000); cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), _mm_cmplt_epi32(mm_coeff, min)); cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 0000000000..8dfdbd50f6 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_sse2.h"
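
In scalar form, the high-bit-depth block error updated above computes the sum of squared coefficient differences and the coefficient energy, both rescaled to 8-bit units; the 15-bit magnitude check lets the vector path use cheaper multiplies when coefficients are small. A reference sketch matching the C fallback's semantics:

#include <stdint.h>

/* Scalar model of vp9_highbd_block_error: distortion between original
 * and dequantized coefficients plus the coefficient energy (*ssz), both
 * rounded and shifted by 2 * (bd - 8) bits so 10- and 12-bit input is
 * reported on the 8-bit scale. */
static int64_t highbd_block_error_scalar(const int32_t *coeff,
                                         const int32_t *dqcoeff,
                                         intptr_t block_size, int64_t *ssz,
                                         int bd) {
  const int shift = 2 * (bd - 8);
  const int64_t rounding = shift > 0 ? (int64_t)1 << (shift - 1) : 0;
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; i++) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = (sqcoeff + rounding) >> shift;
  return (error + rounding) >> shift;
}

+ +// Zero fill 16 positions in the output buffer.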
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as + // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. + const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); +#else + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); +#endif + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + (void)scan; + (void)skip_block; + assert(!skip_block); + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = _mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = load_tran_low(coeff_ptr + n_coeffs); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + } + + eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + 
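
Per 16-bit lane, the index-to-count conversion inside scan_eob_256() is equivalent to this scalar helper (sketch only); the 0xd8 permute4x64 merely undoes the lane interleaving of _mm256_packs_epi32 so the iscan load lines up with the coefficients:

/* nz is all ones (-1) for a nonzero coefficient. iscan - nz adds one
 * exactly in the nonzero lanes, converting a scan index to a count, and
 * the AND zeroes the rest, so a horizontal max yields the eob count. */
static int16_t eob_lane(int16_t iscan_v, int16_t coeff_v) {
  const int16_t nz = (int16_t)(coeff_v != 0 ? -1 : 0);
  return (int16_t)((iscan_v - nz) & nz);
}
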
store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + eob256 = _mm256_max_epi16( + eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index ca0ad4407e..885220a712 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -21,20 +21,20 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i zero; __m128i thr; int16_t nzflag; __m128i eob; __m128i round, quant, dequant; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -100,8 +100,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -175,8 +175,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/libs/libvpx/vp9/vp9_common.mk b/libs/libvpx/vp9/vp9_common.mk index 5bfc0d3599..c9a55669e1 100644 --- a/libs/libvpx/vp9/vp9_common.mk +++ b/libs/libvpx/vp9/vp9_common.mk @@ -63,30 +63,36 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_COMMON_SRCS-$(HAVE_SSE2) += 
common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h + ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm endif ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c -endif - -# common (msa) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c - -ifeq ($(CONFIG_VP9_POSTPROC),yes) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c -endif - -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c - -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c +else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/libs/libvpx/vp9/vp9_cx_iface.c b/libs/libvpx/vp9/vp9_cx_iface.c index 881caae78b..45e03f2def 100644 --- a/libs/libvpx/vp9/vp9_cx_iface.c +++ b/libs/libvpx/vp9/vp9_cx_iface.c @@ -15,6 +15,7 @@ #include "vpx/vpx_encoder.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_timestamp.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" #include "vp9/encoder/vp9_encoder.h" @@ -30,6 +31,7 @@ struct vp9_extracfg { unsigned int static_thresh; unsigned int tile_columns; unsigned int tile_rows; + unsigned int enable_tpl_model; unsigned int arnr_max_frames; unsigned int arnr_strength; unsigned int min_gf_interval; @@ -63,6 +65,7 @@ static struct vp9_extracfg default_extra_cfg = { 0, // static_thresh 6, // tile_columns 0, // tile_rows + 1, // enable_tpl_model 7, // arnr_max_frames 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision @@ -92,6 +95,9 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp9_extracfg extra_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; VP9EncoderConfig oxcf; VP9_COMP *cpi; unsigned char *cx_data; @@ -128,10 +134,10 @@ static vpx_codec_err_t update_error_state( return VPX_CODEC_INVALID_PARAM; \ } while (0) -#define RANGE_CHECK(p, memb, lo, hi) \ - do { \ - if (!(((p)->memb == lo || (p)->memb > 
(lo)) && (p)->memb <= hi)) \ - ERROR(#memb " out of range [" #lo ".." #hi "]"); \ +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ @@ -149,6 +155,22 @@ static vpx_codec_err_t update_error_state( if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ } while (0) +#if defined(_MSC_VER) +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + char compile_time_assert[(boolexp) ? 1 : -1]; \ + (void)compile_time_assert; \ + } while (0) +#else // !_MSC_VER +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + struct { \ + unsigned int compile_time_assert : (boolexp) ? 1 : -1; \ + } compile_time_assert; \ + (void)compile_time_assert; \ + } while (0) +#endif // _MSC_VER + static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { @@ -237,22 +259,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("ts_rate_decimator factors are not powers of 2"); } -#if CONFIG_SPATIAL_SVC - - if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) && - cfg->g_pass == VPX_RC_LAST_PASS) { - unsigned int i, alt_ref_sum = 0; - for (i = 0; i < cfg->ss_number_layers; ++i) { - if (cfg->ss_enable_auto_alt_ref[i]) ++alt_ref_sum; - } - if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) - ERROR("Not enough ref buffers for svc alt ref frames"); - if (cfg->ss_number_layers * cfg->ts_number_layers > 3 && - cfg->g_error_resilient == 0) - ERROR("Multiple frame context are not supported for more than 3 layers"); - } -#endif - // VP9 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && @@ -263,8 +269,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, row_mt, 0, 1); RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); - RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -8, 8); + RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, MAX_ARF_LAYERS); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -277,10 +283,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, content, VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); - // TODO(yaowu): remove this when ssim tuning is implemented for vp9 - if (extra_cfg->tuning == VP8_TUNE_SSIM) - ERROR("Option --tune=ssim is not currently supported in VP9."); - #if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); @@ -560,6 +562,8 @@ static vpx_codec_err_t set_encoder_config( oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + // TODO(yunqing): The dependencies between row tiles cause error in multi- // threaded encoding. For now, tile_rows is forced to be 0 in this case. 
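
The extra parentheses added to RANGE_CHECK's lo and hi are functional, not cosmetic: an unparenthesized macro parameter can rebind against a low-precedence operator in the argument. A hypothetical illustration:

#include <stdio.h>

#define BAD_LE(v, hi) ((v) <= hi)   /* hi not parenthesized */
#define GOOD_LE(v, hi) ((v) <= (hi))

int main(void) {
  /* 8 | 1 is 9, so 20 is out of range. The bad expansion parses as
   * ((20 <= 8) | 1), which is always nonzero. */
  printf("%d\n", BAD_LE(20, 8 | 1));  /* prints 1: bogus pass */
  printf("%d\n", GOOD_LE(20, 8 | 1)); /* prints 0: correct fail */
  return 0;
}
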
// The further fix can be done by adding synchronizations after a tile row @@ -589,9 +593,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; -#endif for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; @@ -599,9 +600,6 @@ static vpx_codec_err_t set_encoder_config( } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref; -#endif } if (oxcf->ts_number_layers > 1) { for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) { @@ -716,7 +714,10 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; + // Use fastest speed setting (speed 9 or -9) if it's set beyond the range. extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + extra_cfg.cpu_used = VPXMIN(9, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-9, extra_cfg.cpu_used); return update_extra_cfg(ctx, &extra_cfg); } @@ -762,6 +763,13 @@ static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(VP9E_SET_TPL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -809,7 +817,7 @@ static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.rc_max_inter_bitrate_pct = - CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args); + CAST(VP9E_SET_MAX_INTER_BITRATE_PCT, args); return update_extra_cfg(ctx, &extra_cfg); } @@ -926,6 +934,12 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, res = validate_config(priv, &priv->cfg, &priv->extra_cfg); if (res == VPX_CODEC_OK) { + priv->pts_offset_initialized = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num; + priv->timestamp_ratio.num *= TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); #if CONFIG_VP9_HIGHBITDEPTH priv->oxcf.use_highbitdepth = @@ -962,12 +976,14 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { - const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg; - // Convert duration parameter from stream timebase to microseconds. - const uint64_t duration_us = (uint64_t)duration * 1000000 * - (uint64_t)cfg->g_timebase.num / - (uint64_t)cfg->g_timebase.den; + uint64_t duration_us; + + COMPILE_TIME_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / + (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); // If the deadline is more than the duration this frame is to be shown, // use good quality mode. Otherwise use realtime mode.
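
Worked through with libvpx's TICKS_PER_SEC of 10,000,000 (100 ns ticks): a 1/30 s timebase gives timestamp_ratio = 10000000/30, which reduce_ratio() lowers to 1000000/3, and the COMPILE_TIME_ASSERT guarantees TICKS_PER_SEC is a whole multiple of 1,000,000 so the microsecond conversion divides exactly. A sketch with hypothetical helper names:

#include <stdint.h>

#define TICKS_PER_SEC 10000000

/* pts in timebase units -> internal ticks, as in
 * timebase_units_to_ticks() above. */
static int64_t to_ticks(int64_t num, int64_t den, int64_t pts) {
  return pts * num / den;
}

int main(void) {
  /* Reduced ratio for a 1/30 s timebase: one second of video
   * (pts == 30) is exactly 10,000,000 ticks. */
  return to_ticks(1000000, 3, 30) == TICKS_PER_SEC ? 0 : 1;
}
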
@@ -1051,15 +1067,16 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { return index_sz; } -static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase, +static int64_t timebase_units_to_ticks(const vpx_rational64_t *timestamp_ratio, int64_t n) { - return n * TICKS_PER_SEC * timebase->num / timebase->den; + return n * timestamp_ratio->num / timestamp_ratio->den; } -static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase, +static int64_t ticks_to_timebase_units(const vpx_rational64_t *timestamp_ratio, int64_t n) { - const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1; - return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; + int64_t round = timestamp_ratio->num / 2; + if (round > 0) --round; + return (n * timestamp_ratio->den + round) / timestamp_ratio->num; } static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, @@ -1067,12 +1084,11 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, vpx_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame)) + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) flags |= VPX_FRAME_IS_KEY; if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; @@ -1083,37 +1099,26 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, const size_t kMinCompressedSize = 8192; static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, - vpx_codec_pts_t pts, + vpx_codec_pts_t pts_val, unsigned long duration, vpx_enc_frame_flags_t enc_flags, unsigned long deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts = pts_val; VP9_COMP *const cpi = ctx->cpi; - const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; + const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio; size_t data_sz; if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && !cpi->level_constraint.rc_config_updated) { - SVC *const svc = &cpi->svc; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; FIRSTPASS_STATS *stats = &twopass->total_stats; - if (is_two_pass_svc) { - const double frame_rate = 10000000.0 * stats->count / stats->duration; - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); cpi->level_constraint.rc_config_updated = 1; } @@ -1123,7 +1128,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * - (cpi->multi_arf_allowed ? 8 : 2); + (cpi->multi_layer_arf ? 
8 : 2); if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { ctx->cx_data_sz = data_sz; @@ -1136,6 +1141,12 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts; + ctx->pts_offset_initialized = 1; + } + pts -= ctx->pts_offset; + pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -1168,12 +1179,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); + int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); int64_t dst_end_time_stamp = - timebase_units_to_ticks(timebase, pts + duration); + timebase_units_to_ticks(timestamp_ratio, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; + cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; @@ -1213,34 +1227,31 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { - if (size) { + if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) { vpx_codec_cx_pkt_t pkt; -#if CONFIG_SPATIAL_SVC - if (cpi->use_svc) - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers] - .layer_size += size; -#endif - // Pack invisible frames with the next visible frame if (!cpi->common.show_frame || (cpi->use_svc && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_magnitude |= size; cx_data += size; cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - ticks_to_timebase_units(timebase, dst_time_stamp); + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timebase, dst_end_time_stamp - dst_time_stamp); + timestamp_ratio, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.buf = ctx->pending_cx_data; pkt.data.frame.sz = size; @@ -1256,13 +1267,19 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Add the frame packet to the list of returned packets. 
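
The pts_offset fields added above rebase all incoming timestamps so the first frame sits at 0 internally, keeping the tick conversions clear of 64-bit overflow for streams that begin at large timestamps; the offset is added back when packets are emitted. A condensed sketch with a hypothetical state struct:

#include <stdint.h>

typedef struct {
  int64_t pts_offset;
  int initialized;
} pts_rebase;

/* The first pts seen becomes the origin; encode-side math then runs on
 * small offsets, and the output path adds pts_offset back onto
 * pkt.data.frame.pts, as in the hunks above. */
static int64_t rebase(pts_rebase *s, int64_t pts) {
  if (!s->initialized) {
    s->pts_offset = pts;
    s->initialized = 1;
  }
  return pts - s->pts_offset;
}
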
pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); + pkt.data.frame.pts = + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timebase, dst_end_time_stamp - dst_time_stamp); + timestamp_ratio, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; if (ctx->pending_cx_data) { - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_magnitude |= size; ctx->pending_cx_data_sz += size; // write the superframe only for the case when @@ -1288,27 +1305,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { - vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; - int sl; - vp9_zero(pkt_sizes); - vp9_zero(pkt_psnr); - pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; - pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR; - for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - LAYER_CONTEXT *lc = - &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers]; - pkt_sizes.data.layer_sizes[sl] = lc->layer_size; - pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt; - lc->layer_size = 0; - } - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes); - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); - } -#endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. 
@@ -1338,9 +1334,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, @@ -1354,9 +1349,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vp9_copy_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, @@ -1364,14 +1358,13 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + const int fb_idx = ctx->cpi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->cpi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; - yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, @@ -1381,9 +1374,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; #else (void)ctx; (void)args; @@ -1405,17 +1397,24 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; - } else { - return NULL; } + return NULL; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { - (void)ctx; - (void)args; + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); - // TODO(yaowu): Need to re-implement and test for VP9. + if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; + + if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame)) { + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_INVALID_PARAM; } @@ -1427,11 +1426,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, @@ -1442,11 +1440,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, @@ -1458,9 +1455,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, (VPX_SCALING)mode->v_scaling_mode); return (res == 0) ? 
VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { @@ -1491,22 +1487,23 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; + int sl; - svc->first_spatial_layer_to_encode = data->spatial_layer_id; svc->spatial_layer_to_encode = data->spatial_layer_id; + svc->first_spatial_layer_to_encode = data->spatial_layer_id; + // TODO(jianj): Deprecated to be removed. svc->temporal_layer_id = data->temporal_layer_id; + // Allow for setting temporal layer per spatial layer for superframe. + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + svc->temporal_layer_id_per_spatial[sl] = + data->temporal_layer_id_per_spatial[sl]; + } // Checks on valid layer_id input. if (svc->temporal_layer_id < 0 || svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { return VPX_CODEC_INVALID_PARAM; } - if (svc->first_spatial_layer_to_encode < 0 || - svc->first_spatial_layer_to_encode >= (int)ctx->cfg.ss_number_layers) { - return VPX_CODEC_INVALID_PARAM; - } - // First spatial layer to encode not implemented for two-pass. - if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0) - return VPX_CODEC_INVALID_PARAM; + return VPX_CODEC_OK; } @@ -1546,20 +1543,87 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) { + data->update_buffer_slot[sl] = cpi->svc.update_buffer_slot[sl]; + data->reference_last[sl] = cpi->svc.reference_last[sl]; + data->reference_golden[sl] = cpi->svc.reference_golden[sl]; + data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl]; + data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl]; + data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl]; + data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl]; + // TODO(jianj): Remove these 3, deprecated. 
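
A hypothetical call site for the new getter: after encoding a superframe, the application can read back which buffers each spatial layer refreshed and referenced (error checking omitted for brevity):

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static void query_ref_config(vpx_codec_ctx_t *codec, int num_spatial_layers) {
  vpx_svc_ref_frame_config_t ref_cfg;
  int sl;
  vpx_codec_control(codec, VP9E_GET_SVC_REF_FRAME_CONFIG, &ref_cfg);
  for (sl = 0; sl < num_spatial_layers; ++sl) {
    /* update_buffer_slot[sl] is a bitmask of refreshed reference slots;
     * lst/gld/alt_fb_idx[sl] name the buffers layer sl predicted from. */
    (void)ref_cfg.update_buffer_slot[sl];
    (void)ref_cfg.lst_fb_idx[sl];
  }
}
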
+ data->update_last[sl] = cpi->svc.update_last[sl]; + data->update_golden[sl] = cpi->svc.update_golden[sl]; + data->update_alt_ref[sl] = cpi->svc.update_altref[sl]; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); int sl; + cpi->svc.use_set_ref_frame_config = 1; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl]; - cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl]; - cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl]; - cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.update_buffer_slot[sl] = data->update_buffer_slot[sl]; + cpi->svc.reference_last[sl] = data->reference_last[sl]; + cpi->svc.reference_golden[sl] = data->reference_golden[sl]; + cpi->svc.reference_altref[sl] = data->reference_alt_ref[sl]; + cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl]; + cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl]; + cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.duration[sl] = data->duration[sl]; } return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_svc_inter_layer_pred(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int data = va_arg(args, int); + VP9_COMP *const cpi = ctx->cpi; + cpi->svc.disable_inter_layer_pred = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_frame_drop_t *data = va_arg(args, vpx_svc_frame_drop_t *); + int sl; + cpi->svc.framedrop_mode = data->framedrop_mode; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl]; + // Don't allow max_consec_drop values below 1. 
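
A sketch of configuring the per-layer dropper wired up above, assuming the LAYER_DROP mode from vpx/vp8cx.h; the threshold values here are purely illustrative:

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static void set_layer_dropper(vpx_codec_ctx_t *codec, int num_spatial_layers) {
  vpx_svc_frame_drop_t drop;
  int sl;
  drop.framedrop_mode = LAYER_DROP; /* drop each layer independently */
  drop.max_consec_drop = 9;         /* the control clamps this to >= 1 */
  for (sl = 0; sl < num_spatial_layers; ++sl) drop.framedrop_thresh[sl] = 30;
  vpx_codec_control(codec, VP9E_SET_SVC_FRAME_DROP_LAYER, &drop);
}
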
+ cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_gf_temporal_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->svc.use_gf_temporal_ref = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_spatial_layer_sync( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_spatial_layer_sync_t *data = + va_arg(args, vpx_svc_spatial_layer_sync_t *); + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.spatial_layer_sync[sl] = data->spatial_layer_sync[sl]; + cpi->svc.set_intra_only_frame = data->base_layer_intra_only; + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp = @@ -1600,13 +1664,21 @@ static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.ext_use_post_encode_drop = data; + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_previewpp }, - { VP8E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_CPUUSED, ctrl_set_cpuused }, @@ -1615,6 +1687,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows }, + { VP9E_SET_TPL, ctrl_set_tpl_model }, { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type }, @@ -1642,7 +1715,12 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop }, { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred }, + { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer }, + { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref }, + { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -1651,6 +1729,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, { -1, NULL }, }; @@ -1659,7 +1738,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, { // NOLINT - 0, // g_usage + 0, // g_usage (unused) 8, // g_threads 0, // g_profile diff --git a/libs/libvpx/vp9/vp9_dx_iface.c b/libs/libvpx/vp9/vp9_dx_iface.c index 657490f4bd..fa79f7aedc 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.c +++ b/libs/libvpx/vp9/vp9_dx_iface.c @@ -97,7 +97,7 @@ static vpx_codec_err_t decoder_peek_si_internal( 
const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si, int *is_intra_only, vpx_decrypt_cb decrypt_cb, void *decrypt_state) { int intra_only_flag = 0; - uint8_t clear_buffer[10]; + uint8_t clear_buffer[11]; if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; @@ -158,6 +158,9 @@ static vpx_codec_err_t decoder_peek_si_internal( if (profile > PROFILE_0) { if (!parse_bitdepth_colorspace_sampling(profile, &rb)) return VPX_CODEC_UNSUP_BITSTREAM; + // The colorspace info may cause vp9_read_frame_size() to need 11 + // bytes. + if (data_sz < 11) return VPX_CODEC_UNSUP_BITSTREAM; } rb.bit_offset += REF_FRAMES; // refresh_frame_flags vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); @@ -235,6 +238,19 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->last_show_frame = -1; ctx->need_resync = 1; @@ -251,6 +267,12 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi->max_threads = ctx->cfg.threads; ctx->pbi->inv_tile_order = ctx->invert_tile_order; + RANGE_CHECK(ctx, row_mt, 0, 1); + ctx->pbi->row_mt = ctx->row_mt; + + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; + // If postprocessing was enabled by the application and a // configuration has not been provided, default it. if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) @@ -452,8 +474,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); if (data) { - YV12_BUFFER_CONFIG *fb; - fb = get_ref_frame(&ctx->pbi->common, data->idx); + const int fb_idx = ctx->pbi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->pbi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -632,6 +654,20 @@ static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, int); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -643,6 +679,8 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, + { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, // Getters { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/libs/libvpx/vp9/vp9_dx_iface.h b/libs/libvpx/vp9/vp9_dx_iface.h index 18bc7ab0d6..f60688c4db 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.h +++ b/libs/libvpx/vp9/vp9_dx_iface.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
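
The two decoder controls registered above are plain int switches; init_decoder() range-checks both to [0, 1], so they must be issued before the first frame is decoded. A hypothetical setup:

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

/* Enable row-based multithreading and the multithreaded loop-filter
 * optimization on a freshly initialized VP9 decoder. */
static void enable_threaded_decode(vpx_codec_ctx_t *decoder) {
  vpx_codec_control(decoder, VP9D_SET_ROW_MT, 1);
  vpx_codec_control(decoder, VP9D_SET_LOOP_FILTER_OPT, 1);
}
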
*/ -#ifndef VP9_VP9_DX_IFACE_H_ -#define VP9_VP9_DX_IFACE_H_ +#ifndef VPX_VP9_VP9_DX_IFACE_H_ +#define VPX_VP9_VP9_DX_IFACE_H_ #include "vp9/decoder/vp9_decoder.h" @@ -45,6 +45,8 @@ struct vpx_codec_alg_priv { // Allow for decoding up to a given spatial layer for SVC stream. int svc_decoding; int svc_spatial_layer; + int row_mt; + int lpf_opt; }; -#endif // VP9_VP9_DX_IFACE_H_ +#endif // VPX_VP9_VP9_DX_IFACE_H_ diff --git a/libs/libvpx/vp9/vp9_iface_common.h b/libs/libvpx/vp9/vp9_iface_common.h index d68872750b..a1921db636 100644 --- a/libs/libvpx/vp9/vp9_iface_common.h +++ b/libs/libvpx/vp9/vp9_iface_common.h @@ -7,17 +7,17 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_VP9_IFACE_COMMON_H_ -#define VP9_VP9_IFACE_COMMON_H_ +#ifndef VPX_VP9_VP9_IFACE_COMMON_H_ +#define VPX_VP9_VP9_IFACE_COMMON_H_ #include "vpx_ports/mem.h" static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ int bps; if (!yv12->subsampling_y) { if (!yv12->subsampling_x) { @@ -142,4 +142,4 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { assert(0 && "Invalid Reference Frame"); return VP9_LAST_FLAG; } -#endif // VP9_VP9_IFACE_COMMON_H_ +#endif // VPX_VP9_VP9_IFACE_COMMON_H_ diff --git a/libs/libvpx/vp9/vp9cx.mk b/libs/libvpx/vp9/vp9cx.mk index d633ed1429..736ff01706 100644 --- a/libs/libvpx/vp9/vp9cx.mk +++ b/libs/libvpx/vp9/vp9cx.mk @@ -64,6 +64,7 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rd.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c +VP9_CX_SRCS-yes += encoder/vp9_partition_models.h VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_speed_features.c @@ -74,6 +75,7 @@ VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c VP9_CX_SRCS-yes += encoder/vp9_resize.c VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c @@ -101,11 +103,14 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -116,7 +121,6 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c VP9_CX_SRCS-$(HAVE_SSSE3) += 
encoder/x86/vp9_frame_scale_ssse3.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) @@ -129,20 +133,34 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c # Strip unnecessary files with CONFIG_REALTIME_ONLY VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.h VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libs/libvpx/vp9/vp9dx.mk b/libs/libvpx/vp9/vp9dx.mk index 59f612b94c..93a5f368bd 100644 --- a/libs/libvpx/vp9/vp9dx.mk +++ b/libs/libvpx/vp9/vp9dx.mk @@ -28,5 +28,7 @@ VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h +VP9_DX_SRCS-yes += decoder/vp9_job_queue.c +VP9_DX_SRCS-yes += decoder/vp9_job_queue.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) diff --git a/libs/libvpx/vpx/exports_spatial_svc b/libs/libvpx/vpx/exports_spatial_svc deleted file mode 100644 index d258a1d618..0000000000 --- a/libs/libvpx/vpx/exports_spatial_svc +++ /dev/null @@ -1,6 +0,0 @@ -text vpx_svc_dump_statistics -text vpx_svc_encode -text vpx_svc_get_message -text vpx_svc_init -text vpx_svc_release -text vpx_svc_set_options diff --git a/libs/libvpx/vpx/internal/vpx_codec_internal.h b/libs/libvpx/vpx/internal/vpx_codec_internal.h index 522e5c1684..9eed85e5de 100644 --- a/libs/libvpx/vpx/internal/vpx_codec_internal.h +++ b/libs/libvpx/vpx/internal/vpx_codec_internal.h @@ -40,8 +40,8 @@ * Once initialized, the instance is managed using other functions from * the vpx_codec_* family.
*/ -#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ -#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#ifndef VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#define VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ #include "../vpx_decoder.h" #include "../vpx_encoder.h" #include <stdarg.h> @@ -442,4 +442,4 @@ void vpx_internal_error(struct vpx_internal_error_info *info, } // extern "C" #endif -#endif // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#endif // VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ diff --git a/libs/libvpx/vpx/src/vpx_encoder.c b/libs/libvpx/vpx/src/vpx_encoder.c index 1cf2dca695..c227ee902d 100644 --- a/libs/libvpx/vpx/src/vpx_encoder.c +++ b/libs/libvpx/vpx/src/vpx_encoder.c @@ -20,7 +20,7 @@ #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) +#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var)) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; } @@ -82,6 +82,9 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( res = VPX_CODEC_INCAPABLE; else { int i; +#if CONFIG_MULTI_RES_ENCODING + int mem_loc_owned = 0; +#endif void *mem_loc = NULL; if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; @@ -101,12 +104,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( mr_cfg.mr_down_sampling_factor.num = dsf->num; mr_cfg.mr_down_sampling_factor.den = dsf->den; - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - ctx->iface = iface; ctx->name = iface->name; ctx->priv = NULL; @@ -129,13 +126,17 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( i--; } #if CONFIG_MULTI_RES_ENCODING - assert(mem_loc); - free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); - free(mem_loc); + if (!mem_loc_owned) { + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); + } #endif return SAVE_STATUS(ctx, res); } - +#if CONFIG_MULTI_RES_ENCODING + mem_loc_owned = 1; +#endif ctx++; cfg++; dsf++; @@ -154,7 +155,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, vpx_codec_enc_cfg_map_t *map; int i; - if (!iface || !cfg || usage > INT_MAX) + if (!iface || !cfg || usage != 0) res = VPX_CODEC_INVALID_PARAM; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; @@ -163,12 +164,9 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, for (i = 0; i < iface->enc.cfg_map_count; ++i) { map = iface->enc.cfg_maps + i; - if (map->usage == (int)usage) { - *cfg = map->cfg; - cfg->g_usage = usage; - res = VPX_CODEC_OK; - break; - } + *cfg = map->cfg; + res = VPX_CODEC_OK; + break; } } diff --git a/libs/libvpx/vpx/src/vpx_image.c b/libs/libvpx/vpx/src/vpx_image.c index af7c529a7b..a7c6ec0cea 100644 --- a/libs/libvpx/vpx/src/vpx_image.c +++ b/libs/libvpx/vpx/src/vpx_image.c @@ -38,23 +38,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, /* Get sample size for this format */ switch (fmt) { - case VPX_IMG_FMT_RGB32: - case VPX_IMG_FMT_RGB32_LE: - case VPX_IMG_FMT_ARGB: - case VPX_IMG_FMT_ARGB_LE: bps = 32; break; - case VPX_IMG_FMT_RGB24: - case VPX_IMG_FMT_BGR24: bps = 24; break; - case VPX_IMG_FMT_RGB565: - case VPX_IMG_FMT_RGB565_LE: - case VPX_IMG_FMT_RGB555: - case VPX_IMG_FMT_RGB555_LE: - case VPX_IMG_FMT_UYVY: - case VPX_IMG_FMT_YUY2: - case VPX_IMG_FMT_YVYU: bps = 16; break; case VPX_IMG_FMT_I420: - case VPX_IMG_FMT_YV12: -
case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: bps = 12; break; + case VPX_IMG_FMT_YV12: bps = 12; break; case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I440: bps = 16; break; case VPX_IMG_FMT_I444: bps = 24; break; @@ -69,8 +54,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, switch (fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: xcs = 1; break; @@ -81,8 +64,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I440: case VPX_IMG_FMT_YV12: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I44016: ycs = 1; break; default: ycs = 0; break; diff --git a/libs/libvpx/vpx/vp8.h b/libs/libvpx/vpx/vp8.h index 059c9d0f65..f30dafed58 100644 --- a/libs/libvpx/vpx/vp8.h +++ b/libs/libvpx/vpx/vp8.h @@ -10,7 +10,7 @@ /*!\defgroup vp8 VP8 * \ingroup codecs - * VP8 is vpx's newest video compression algorithm that uses motion + * VP8 is a video compression algorithm that uses motion * compensated prediction, Discrete Cosine Transform (DCT) coding of the * prediction error signal and context dependent entropy coding techniques * based on arithmetic principles. It features: @@ -27,8 +27,8 @@ /*!\file * \brief Provides controls common to both the VP8 encoder and decoder. */ -#ifndef VPX_VP8_H_ -#define VPX_VP8_H_ +#ifndef VPX_VPX_VP8_H_ +#define VPX_VPX_VP8_H_ #include "./vpx_codec.h" #include "./vpx_image.h" @@ -47,10 +47,6 @@ enum vp8_com_control_id { VP8_SET_REFERENCE = 1, VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ - VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */ - VP8_SET_DBG_COLOR_MB_MODES = 5, /**< \deprecated */ - VP8_SET_DBG_COLOR_B_MODES = 6, /**< \deprecated */ - VP8_SET_DBG_DISPLAY_MV = 7, /**< \deprecated */ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) * for its control ids. 
These should be migrated to something like the @@ -70,12 +66,7 @@ enum vp8_postproc_level { VP8_DEBLOCK = 1 << 0, VP8_DEMACROBLOCK = 1 << 1, VP8_ADDNOISE = 1 << 2, - VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ - VP8_DEBUG_TXT_MBLK_MODES = - 1 << 4, /**< print macro block modes over each macro block */ - VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ - VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ - VP8_MFQE = 1 << 10 + VP8_MFQE = 1 << 3 }; /*!\brief post process flags @@ -132,14 +123,6 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_COPY_REFERENCE VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) #define VPX_CTRL_VP8_SET_POSTPROC -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int) -#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) #define VPX_CTRL_VP9_GET_REFERENCE @@ -150,4 +133,4 @@ VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) } // extern "C" #endif -#endif // VPX_VP8_H_ +#endif // VPX_VPX_VP8_H_ diff --git a/libs/libvpx/vpx/vp8cx.h b/libs/libvpx/vpx/vp8cx.h index c21b8b60db..6e613b7273 100644 --- a/libs/libvpx/vpx/vp8cx.h +++ b/libs/libvpx/vpx/vp8cx.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP8CX_H_ -#define VPX_VP8CX_H_ +#ifndef VPX_VPX_VP8CX_H_ +#define VPX_VPX_VP8CX_H_ /*!\defgroup vp8_encoder WebM VP8/VP9 Encoder * \ingroup vp8 @@ -125,7 +125,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); enum vp8e_enc_control_id { /*!\brief Codec control function to pass an ROI map to encoder. * - * Supported in codecs: VP8, VP9 + * Supported in codecs: VP8 */ VP8E_SET_ROI_MAP = 8, @@ -148,13 +148,16 @@ enum vp8e_enc_control_id { * speed at the expense of quality. * * \note Valid range for VP8: -16..16 - * \note Valid range for VP9: -8..8 + * \note Valid range for VP9: -9..9 * * Supported in codecs: VP8, VP9 */ VP8E_SET_CPUUSED = 13, - /*!\brief Codec control function to enable automatic set and use alf frames. + /*!\brief Codec control function to enable automatic use of arf frames. + * + * \note Valid range for VP8: 0..1 + * \note Valid range for VP9: 0..6 * * Supported in codecs: VP8, VP9 */ @@ -169,7 +172,10 @@ enum vp8e_enc_control_id { */ VP8E_SET_NOISE_SENSITIVITY, - /*!\brief Codec control function to set sharpness. + /*!\brief Codec control function to set higher sharpness at the expense + * of a lower PSNR. + * + * \note Valid range: 0..7 * * Supported in codecs: VP8, VP9 */ @@ -227,8 +233,8 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set constrained quality level. * - * \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be - * set to #VPX_CQ. + * \attention For this value to be used vpx_codec_enc_cfg_t::rc_end_usage must + * be set to #VPX_CQ * \note Valid range: 0..63 * * Supported in codecs: VP8, VP9 @@ -423,6 +429,12 @@ enum vp8e_enc_control_id { */ VP9E_SET_SVC, + /*!\brief Codec control function to pass an ROI map to encoder. 
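The updated \attention note ties the constrained-quality control to rc_end_usage rather than the deprecated g_usage field. A minimal setup sketch (error paths trimmed; the CQ value 30 is illustrative):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static vpx_codec_err_t init_cq_encoder(vpx_codec_ctx_t *ctx) {
      vpx_codec_enc_cfg_t cfg;
      vpx_codec_err_t res =
          vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
      if (res != VPX_CODEC_OK) return res;
      cfg.rc_end_usage = VPX_CQ; /* required for VP8E_SET_CQ_LEVEL to apply */
      res = vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg, 0);
      if (res != VPX_CODEC_OK) return res;
      return vpx_codec_control(ctx, VP8E_SET_CQ_LEVEL, 30); /* range 0..63 */
    }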
+ * + * Supported in codecs: VP9 + */ + VP9E_SET_ROI_MAP, + /*!\brief Codec control function to set parameters for SVC. * \note Parameters contain min_q, max_q, scaling factor for each of the * SVC layers. @@ -529,7 +541,7 @@ enum vp8e_enc_control_id { * struct #vpx_svc_ref_frame_config defined below. * * Supported in codecs: VP9 - */ + */ VP9E_SET_SVC_REF_FRAME_CONFIG, /*!\brief Codec control function to set intended rendering image size. @@ -550,11 +562,11 @@ enum vp8e_enc_control_id { VP9E_SET_TARGET_LEVEL, /*!\brief Codec control function to set row level multi-threading. - * - * 0 : off, 1 : on - * - * Supported in codecs: VP9 - */ + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ VP9E_SET_ROW_MT, /*!\brief Codec control function to get bitstream level. @@ -574,18 +586,18 @@ enum vp8e_enc_control_id { VP9E_SET_ALT_REF_AQ, /*!\brief Boost percentage for Golden Frame in CBR mode. - * - * This value controls the amount of boost given to Golden Frame in - * CBR mode. It is expressed as a percentage of the average - * per-frame bitrate, with the special (and default) value 0 meaning - * the feature is off, i.e., no golden frame boost in CBR mode and - * average bitrate target is used. - * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame - * than average frame, set this to 100. - * - * Supported in codecs: VP8 - */ + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP8 + */ VP8E_SET_GF_CBR_BOOST_PCT, /*!\brief Codec control function to enable the extreme motion vector unit test @@ -596,6 +608,74 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, + + /*!\brief Codec control function to constrain the inter-layer prediction + * (prediction of lower spatial resolution) in VP9 SVC. + * + * 0 : inter-layer prediction on, 1 : off, 2 : off only on non-key frames + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_INTER_LAYER_PRED, + + /*!\brief Codec control function to set mode and thresholds for frame + * dropping in SVC. Drop frame thresholds are set per-layer. Mode is set as: + * 0 : layer-dependent dropping, 1 : constrained dropping, current layer drop + * forces drop on all upper layers. Default mode is 0. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_FRAME_DROP_LAYER, + + /*!\brief Codec control function to get the refresh and reference flags and + * the buffer indices, up to the last encoded spatial layer. + * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_REF_FRAME_CONFIG, + + /*!\brief Codec control function to enable/disable use of golden reference as + * a second temporal reference for SVC. Only used when inter-layer prediction + * is disabled on INTER frames. + * + * 0: Off, 1: Enabled (default) + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_GF_TEMPORAL_REF, + + /*!\brief Codec control function to enable spatial layer sync frame, for any + * spatial layer. Enabling it for layer k means spatial layer k will disable + * all temporal prediction, but keep the inter-layer prediction. It will + * refresh any temporal reference buffer for that layer, and reset the + * temporal layer for the superframe to 0. 
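The SVC controls added above are plain scalar vpx_codec_control() calls. A sketch of typical real-time SVC tuning on an already-initialized VP9 encoder (the particular values are illustrative):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void tune_svc(vpx_codec_ctx_t *ctx) {
      /* Disable inter-layer prediction on non-key frames only. */
      vpx_codec_control(ctx, VP9E_SET_SVC_INTER_LAYER_PRED, 2);
      /* Keep golden as a second temporal reference (the default). */
      vpx_codec_control(ctx, VP9E_SET_SVC_GF_TEMPORAL_REF, 1);
      /* Row-level multi-threading in the encoder. */
      vpx_codec_control(ctx, VP9E_SET_ROW_MT, 1);
    }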
Setting the layer sync for base + * spatial layer forces a key frame. Default is off (0) for all spatial + * layers. Spatial layer sync flag is reset to 0 after each encoded layer, + * so when control is invoked it is only used for the current superframe. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + + /*!\brief Codec control function to enable temporal dependency model. + * + * Vp9 allows the encoder to run temporal dependency model and use it to + * improve the compression performance. To enable, set this parameter to be + * 1. The default value is set to be 1. + */ + VP9E_SET_TPL, + + /*!\brief Codec control function to enable postencode frame drop. + * + * This will allow encoder to drop frame after it's encoded. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_POSTENCODE_DROP, }; /*!\brief vpx 1-D scaling mode @@ -643,16 +723,20 @@ typedef enum vp9e_temporal_layering_mode { */ typedef struct vpx_roi_map { - /*! An id between 0 and 3 for each 16x16 region within a frame. */ + /*! If ROI is enabled. */ + uint8_t enabled; + /*! An id between 0-3 (0-7 for vp9) for each 16x16 (8x8 for VP9) + * region within a frame. */ unsigned char *roi_map; unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns. */ - // TODO(paulwilkins): broken for VP9 which has 8 segments - // q and loop filter deltas for each segment - // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ - /*! Static breakout threshold for each segment. */ + /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ + int delta_q[8]; /**< Quantizer deltas. */ + int delta_lf[8]; /**< Loop filter deltas. */ + /*! skip and ref frame segment is only used in VP9. */ + int skip[8]; /**< Skip this block. */ + int ref_frame[8]; /**< Reference frame for this block. */ + /*! Static breakout threshold for each segment. Only used in VP8. */ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -716,11 +800,13 @@ typedef enum { VP8_TUNE_PSNR, VP8_TUNE_SSIM } vp8e_tuning; * */ typedef struct vpx_svc_layer_id { - int spatial_layer_id; /**< Spatial layer id number. */ + int spatial_layer_id; /**< First spatial layer to start encoding. */ + // TODO(jianj): Deprecated, to be removed. int temporal_layer_id; /**< Temporal layer id number. */ + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; /**< Temp layer id. */ } vpx_svc_layer_id_t; -/*!\brief vp9 svc frame flag parameters. +/*!\brief vp9 svc frame flag parameters. * * This defines the frame flags and buffer indices for each spatial layer for * svc encoding. @@ -729,12 +815,56 @@ typedef struct vpx_svc_layer_id { * */ typedef struct vpx_svc_ref_frame_config { - int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */ - int lst_fb_idx[VPX_TS_MAX_LAYERS]; /**< Last buffer index. */ - int gld_fb_idx[VPX_TS_MAX_LAYERS]; /**< Golden buffer index. */ - int alt_fb_idx[VPX_TS_MAX_LAYERS]; /**< Altref buffer index. */ + int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */ + int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */ + int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ + int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ + // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. 
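With the widened vpx_roi_map_t above, VP9 takes eight segments addressed in 8x8 units per the struct comment; the exact block-count rounding below, and the assumption that the encoder copies the map during the control call, are mine. A sketch that biases the whole frame into segment 1 at a lower quantizer:

    #include <stdlib.h>
    #include <string.h>
    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static int set_vp9_roi(vpx_codec_ctx_t *ctx, unsigned int w,
                           unsigned int h) {
      vpx_roi_map_t roi;
      memset(&roi, 0, sizeof(roi));
      roi.enabled = 1;
      roi.rows = (h + 7) / 8; /* assumed 8x8 granularity for VP9 */
      roi.cols = (w + 7) / 8;
      roi.roi_map = (unsigned char *)calloc((size_t)roi.rows * roi.cols, 1);
      if (!roi.roi_map) return -1;
      memset(roi.roi_map, 1, (size_t)roi.rows * roi.cols); /* segment 1 */
      roi.delta_q[1] = -10; /* spend more bits on segment 1 */
      vpx_codec_control(ctx, VP9E_SET_ROI_MAP, &roi);
      free(roi.roi_map); /* assumes the map is copied inside the call */
      return 0;
    }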
*/ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; +/*!\brief VP9 svc frame dropping mode. + * + * This defines the frame drop mode for SVC. + * + */ +typedef enum { + CONSTRAINED_LAYER_DROP, + /**< Upper layers are constrained to drop if current layer drops. */ + LAYER_DROP, /**< Any spatial layer can drop. */ + FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ +} SVC_LAYER_DROP_MODE; + +/*!\brief vp9 svc frame dropping parameters. + * + * This defines the frame drop thresholds for each spatial layer, and + * the frame dropping mode: 0 = layer based frame dropping (default), + * 1 = constrained dropping where current layer drop forces all upper + * spatial layers to drop. + */ +typedef struct vpx_svc_frame_drop { + int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */ + SVC_LAYER_DROP_MODE + framedrop_mode; /**< Layer-based or constrained dropping. */ + int max_consec_drop; /**< Maximum consecutive drops, for any layer. */ +} vpx_svc_frame_drop_t; + +/*!\brief vp9 svc spatial layer sync parameters. + * + * This defines the spatial layer sync flag, defined per spatial layer. + * + */ +typedef struct vpx_svc_spatial_layer_sync { + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; /**< Sync layer flags */ + int base_layer_intra_only; /**< Flag for setting Intra-only frame on base */ +} vpx_svc_spatial_layer_sync_t; + /*!\cond */ /*!\brief VP8 encoder control function parameter type * @@ -749,6 +879,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) #define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) @@ -792,6 +924,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) #define VPX_CTRL_VP9E_SET_TILE_ROWS +VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) +#define VPX_CTRL_VP9E_SET_TPL + VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) @@ -801,8 +936,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT -VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int) +#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT @@ -867,10 +1002,29 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED + +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) +#define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER + 
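Putting vpx_svc_frame_drop_t together with the control declared just above, a sketch of constrained per-layer dropping (thresholds illustrative):

    #include <string.h>
    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void set_svc_framedrop(vpx_codec_ctx_t *ctx,
                                  int num_spatial_layers) {
      vpx_svc_frame_drop_t drop;
      int sl;
      memset(&drop, 0, sizeof(drop));
      for (sl = 0; sl < num_spatial_layers; ++sl)
        drop.framedrop_thresh[sl] = 30; /* per-layer drop threshold */
      drop.framedrop_mode = CONSTRAINED_LAYER_DROP; /* drops cascade upward */
      drop.max_consec_drop = 2;
      vpx_codec_control(ctx, VP9E_SET_SVC_FRAME_DROP_LAYER, &drop);
    }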
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG + +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF + +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + vpx_svc_spatial_layer_sync_t *) +#define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC + +VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) +#define VPX_CTRL_VP9E_SET_POSTENCODE_DROP + /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_VP8CX_H_ +#endif // VPX_VPX_VP8CX_H_ diff --git a/libs/libvpx/vpx/vp8dx.h b/libs/libvpx/vpx/vp8dx.h index 398c670220..af92f21ae3 100644 --- a/libs/libvpx/vpx/vp8dx.h +++ b/libs/libvpx/vpx/vp8dx.h @@ -17,8 +17,8 @@ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder * interface. */ -#ifndef VPX_VP8DX_H_ -#define VPX_VP8DX_H_ +#ifndef VPX_VPX_VP8DX_H_ +#define VPX_VPX_VP8DX_H_ #ifdef __cplusplus extern "C" { @@ -124,6 +124,24 @@ enum vp8_dec_control_id { */ VPXD_GET_LAST_QUANTIZER, + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9D_SET_ROW_MT, + + /*!\brief Codec control function to set loopfilter optimization. + * + * 0 : off, Loop filter is done after all tiles have been decoded + * 1 : on, Loop filter is done immediately after decode without + * waiting for all threads to sync. + * + * Supported in codecs: VP9 + */ + VP9D_SET_LOOP_FILTER_OPT, + VP8_DECODER_CTRL_ID_MAX }; @@ -145,10 +163,6 @@ typedef struct vpx_decrypt_init { void *decrypt_state; } vpx_decrypt_init; -/*!\brief A deprecated alias for vpx_decrypt_init. - */ -typedef vpx_decrypt_init vp8_decrypt_init; - /*!\cond */ /*!\brief VP8 decoder control function parameter type * @@ -181,6 +195,10 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) #define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) +#define VPX_CTRL_VP9_DECODE_SET_ROW_MT +VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) +#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT +VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ @@ -189,4 +207,4 @@ VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) } // extern "C" #endif -#endif // VPX_VP8DX_H_ +#endif // VPX_VPX_VP8DX_H_ diff --git a/libs/libvpx/vpx/vpx_codec.h b/libs/libvpx/vpx/vpx_codec.h index ad05f4c74e..6371a6ca28 100644 --- a/libs/libvpx/vpx/vpx_codec.h +++ b/libs/libvpx/vpx/vpx_codec.h @@ -35,8 +35,8 @@ * Once initialized, the instance is manged using other functions from * the vpx_codec_* family. 
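The two decoder controls introduced in vp8dx.h below are boolean toggles; a sketch enabling both on an initialized VP9 decoder:

    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void speed_up_decode(vpx_codec_ctx_t *ctx) {
      vpx_codec_control(ctx, VP9D_SET_ROW_MT, 1); /* row multi-threading */
      /* Loop filter runs right after decode instead of after a tile sync. */
      vpx_codec_control(ctx, VP9D_SET_LOOP_FILTER_OPT, 1);
    }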
*/ -#ifndef VPX_VPX_CODEC_H_ -#define VPX_VPX_CODEC_H_ +#ifndef VPX_VPX_VPX_CODEC_H_ +#define VPX_VPX_VPX_CODEC_H_ #ifdef __cplusplus extern "C" { @@ -241,11 +241,11 @@ typedef enum vpx_bit_depth { */ int vpx_codec_version(void); #define VPX_VERSION_MAJOR(v) \ - ((v >> 16) & 0xff) /**< extract major from packed version */ + (((v) >> 16) & 0xff) /**< extract major from packed version */ #define VPX_VERSION_MINOR(v) \ - ((v >> 8) & 0xff) /**< extract minor from packed version */ + (((v) >> 8) & 0xff) /**< extract minor from packed version */ #define VPX_VERSION_PATCH(v) \ - ((v >> 0) & 0xff) /**< extract patch from packed version */ + (((v) >> 0) & 0xff) /**< extract patch from packed version */ /*!\brief Return the version major number */ #define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff) @@ -465,4 +465,4 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); #ifdef __cplusplus } #endif -#endif // VPX_VPX_CODEC_H_ +#endif // VPX_VPX_VPX_CODEC_H_ diff --git a/libs/libvpx/vpx/vpx_codec.mk b/libs/libvpx/vpx/vpx_codec.mk index b77f45817b..4ed77ad6d9 100644 --- a/libs/libvpx/vpx/vpx_codec.mk +++ b/libs/libvpx/vpx/vpx_codec.mk @@ -15,10 +15,6 @@ API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h -ifeq ($(CONFIG_VP9_ENCODER),yes) - API_SRCS-$(CONFIG_SPATIAL_SVC) += src/svc_encodeframe.c - API_SRCS-$(CONFIG_SPATIAL_SVC) += svc_context.h -endif API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h diff --git a/libs/libvpx/vpx/vpx_decoder.h b/libs/libvpx/vpx/vpx_decoder.h index 2ff12112bc..f113f7196b 100644 --- a/libs/libvpx/vpx/vpx_decoder.h +++ b/libs/libvpx/vpx/vpx_decoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_DECODER_H_ -#define VPX_VPX_DECODER_H_ +#ifndef VPX_VPX_VPX_DECODER_H_ +#define VPX_VPX_VPX_DECODER_H_ /*!\defgroup decoder Decoder Algorithm Interface * \ingroup codec @@ -362,4 +362,4 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions( #ifdef __cplusplus } #endif -#endif // VPX_VPX_DECODER_H_ +#endif // VPX_VPX_VPX_DECODER_H_ diff --git a/libs/libvpx/vpx/vpx_encoder.h b/libs/libvpx/vpx/vpx_encoder.h index 464bc408c8..c18de703fb 100644 --- a/libs/libvpx/vpx/vpx_encoder.h +++ b/libs/libvpx/vpx/vpx_encoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_ENCODER_H_ -#define VPX_VPX_ENCODER_H_ +#ifndef VPX_VPX_VPX_ENCODER_H_ +#define VPX_VPX_VPX_ENCODER_H_ /*!\defgroup encoder Encoder Algorithm Interface * \ingroup codec @@ -39,15 +39,9 @@ extern "C" { /*! Temporal Scalability: Maximum number of coding layers */ #define VPX_TS_MAX_LAYERS 5 -/*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */ -#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY - /*! Temporal+Spatial Scalability: Maximum number of coding layers */ #define VPX_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed. -/*!\deprecated Use #VPX_MAX_LAYERS instead. */ -#define MAX_LAYERS VPX_MAX_LAYERS // 3 temporal + 4 spatial layers allowed. - /*! 
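The version macros above gain the same argument parentheses as SAVE_STATUS; unpacking the packed version is unchanged:

    #include <stdio.h>
    #include "vpx/vpx_codec.h"

    int main(void) {
      const int v = vpx_codec_version();
      printf("libvpx %d.%d.%d\n", VPX_VERSION_MAJOR(v),
             VPX_VERSION_MINOR(v), VPX_VERSION_PATCH(v));
      return 0;
    }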
Spatial Scalability: Maximum number of coding layers */ #define VPX_SS_MAX_LAYERS 5 @@ -63,7 +57,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (14 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -150,15 +144,10 @@ typedef uint32_t vpx_codec_er_flags_t; * extend this list to provide additional functionality. */ enum vpx_codec_cx_pkt_kind { - VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ - VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ - VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ - VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed. -#if defined(VPX_TEST_SPATIAL_SVC) - VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ - VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ -#endif + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ }; @@ -182,6 +171,13 @@ typedef struct vpx_codec_cx_pkt { * Only applicable when "output partition" mode is enabled. First * partition has id 0.*/ int partition_id; + /*!\brief Width and height of frames in this packet. VP8 will only use the + * first one.*/ + unsigned int width[VPX_SS_MAX_LAYERS]; /**< frame width */ + unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */ + /*!\brief Flag to indicate if spatial layer frame in this packet is + * encoded or dropped. VP8 will always be set to 1.*/ + uint8_t spatial_layer_encoded[VPX_SS_MAX_LAYERS]; } frame; /**< data for compressed frame packet */ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ @@ -191,11 +187,6 @@ typedef struct vpx_codec_cx_pkt { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed. -#if defined(VPX_TEST_SPATIAL_SVC) - size_t layer_sizes[VPX_SS_MAX_LAYERS]; - struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; -#endif /* This packet size is fixed to allow codecs to extend this * interface without having to manage storage for raw packets, @@ -211,8 +202,6 @@ typedef struct vpx_codec_cx_pkt { * This callback function, when registered, returns with packets when each * spatial layer is encoded. */ -// putting the definitions here for now. (agrange: find if there -// is a better place for this) typedef void (*vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt, void *user_data); @@ -281,12 +270,9 @@ typedef struct vpx_codec_enc_cfg { * generic settings (g) */ - /*!\brief Algorithm specific "usage" value + /*!\brief Deprecated: Algorithm specific "usage" value * - * Algorithms may define multiple values for usage, which may convey the - * intent of how the application intends to use the stream. If this value - * is non-zero, consult the documentation for the codec to determine its - * meaning. + * This value must be zero. */ unsigned int g_usage; @@ -397,9 +383,6 @@ typedef struct vpx_codec_enc_cfg { * trade-off is often acceptable, but for many applications is not. It can * be disabled in these cases. 
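The new width/height/spatial_layer_encoded fields below make per-layer results visible when draining packets. A logging-only sketch, assuming an encoder that has just consumed a frame:

    #include <stdio.h>
    #include "vpx/vpx_encoder.h"

    static void log_spatial_layers(vpx_codec_ctx_t *ctx, int num_layers) {
      vpx_codec_iter_t iter = NULL;
      const vpx_codec_cx_pkt_t *pkt;
      int sl;
      while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
        if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) continue;
        for (sl = 0; sl < num_layers; ++sl)
          if (pkt->data.frame.spatial_layer_encoded[sl])
            printf("layer %d: %ux%u\n", sl, pkt->data.frame.width[sl],
                   pkt->data.frame.height[sl]);
      }
    }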
* - * Note that not all codecs support this feature. All vpx VPx codecs do. - * For other codecs, consult the documentation for that algorithm. - * * This threshold is described as a percentage of the target data buffer. * When the data buffer falls below this percentage of fullness, a * dropped frame is indicated. Set the threshold to zero (0) to disable @@ -485,8 +468,7 @@ typedef struct vpx_codec_enc_cfg { * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. + * values to use. */ unsigned int rc_min_quantizer; @@ -495,8 +477,7 @@ typedef struct vpx_codec_enc_cfg { * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. + * values to use. */ unsigned int rc_max_quantizer; @@ -512,7 +493,7 @@ typedef struct vpx_codec_enc_cfg { * be subtracted from the target bitrate in order to compensate * for prior overshoot. * VP9: Expressed as a percentage of the target bitrate, a threshold - * undershoot level (current rate vs target) beyond which more agressive + * undershoot level (current rate vs target) beyond which more aggressive * corrective measures are taken. * * * Valid values in the range VP8:0-1000 VP9: 0-100. @@ -527,7 +508,7 @@ typedef struct vpx_codec_enc_cfg { * be added to the target bitrate in order to compensate for * prior undershoot. * VP9: Expressed as a percentage of the target bitrate, a threshold - * overshoot level (current rate vs target) beyond which more agressive + * overshoot level (current rate vs target) beyond which more aggressive * corrective measures are taken. * * Valid values in the range VP8:0-1000 VP9: 0-100. @@ -596,10 +577,10 @@ typedef struct vpx_codec_enc_cfg { unsigned int rc_2pass_vbr_maxsection_pct; /*!\brief Two-pass corpus vbr mode complexity control - * Used only in VP9: A value representing the corpus midpoint complexity - * for corpus vbr mode. This value defaults to 0 which disables corpus vbr - * mode in favour of normal vbr mode. - */ + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. + */ unsigned int rc_2pass_vbr_corpus_complexity; /* @@ -682,7 +663,7 @@ typedef struct vpx_codec_enc_cfg { * membership of frames to temporal layers. For example, if the * ts_periodicity = 8, then the frames are assigned to coding layers with a * repeated sequence of length 8. - */ + */ unsigned int ts_periodicity; /*!\brief Template defining the membership of frames to temporal layers. @@ -691,7 +672,7 @@ typedef struct vpx_codec_enc_cfg { * For a 2-layer encoding that assigns even numbered frames to one temporal * layer (0) and odd numbered frames to a second temporal layer (1) with * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). - */ + */ unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; /*!\brief Target bitrate for each spatial/temporal layer. @@ -802,7 +783,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( * * \param[in] iface Pointer to the algorithm interface to use. 
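The ts_layer_id comment above describes a two-layer pattern; expressed as configuration code (bitrates illustrative):

    #include "vpx/vpx_encoder.h"

    static void config_two_temporal_layers(vpx_codec_enc_cfg_t *cfg) {
      int i;
      cfg->ts_number_layers = 2;
      cfg->ts_periodicity = 8;
      for (i = 0; i < 8; ++i) cfg->ts_layer_id[i] = i & 1; /* 0,1,0,1,... */
      cfg->ts_rate_decimator[0] = 2; /* base layer gets every other frame */
      cfg->ts_rate_decimator[1] = 1;
      cfg->ts_target_bitrate[0] = 300; /* kbps, cumulative per layer */
      cfg->ts_target_bitrate[1] = 500;
    }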
* \param[out] cfg Configuration buffer to populate. - * \param[in] reserved Must set to 0 for VP8 and VP9. + * \param[in] usage Must be set to 0. * * \retval #VPX_CODEC_OK * The configuration was populated. @@ -813,7 +794,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( */ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, - unsigned int reserved); + unsigned int usage); /*!\brief Set or change configuration * @@ -862,7 +843,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); * implicit that limiting the available time to encode will degrade the * output quality. The encoder can be given an unlimited time to produce the * best possible frame by specifying a deadline of '0'. This deadline - * supercedes the VPx notion of "best quality, good quality, realtime". + * supersedes the VPx notion of "best quality, good quality, realtime". * Applications that wish to map these former settings to the new deadline * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, * and #VPX_DL_BEST_QUALITY. @@ -984,4 +965,4 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); #ifdef __cplusplus } #endif -#endif // VPX_VPX_ENCODER_H_ +#endif // VPX_VPX_VPX_ENCODER_H_ diff --git a/libs/libvpx/vpx/vpx_frame_buffer.h b/libs/libvpx/vpx/vpx_frame_buffer.h index ad70cdd572..fc8320017b 100644 --- a/libs/libvpx/vpx/vpx_frame_buffer.h +++ b/libs/libvpx/vpx/vpx_frame_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_FRAME_BUFFER_H_ -#define VPX_VPX_FRAME_BUFFER_H_ +#ifndef VPX_VPX_VPX_FRAME_BUFFER_H_ +#define VPX_VPX_VPX_FRAME_BUFFER_H_ /*!\file * \brief Describes the decoder external frame buffer interface. @@ -52,12 +52,12 @@ typedef struct vpx_codec_frame_buffer { * data. The callback is triggered when the decoder needs a frame buffer to * decode a compressed image into. This function may be called more than once * for every call to vpx_codec_decode. The application may set fb->priv to - * some data which will be passed back in the ximage and the release function - * call. |fb| is guaranteed to not be NULL. On success the callback must - * return 0. Any failure the callback must return a value less than 0. + * some data which will be passed back in the vpx_image_t and the release + * function call. |fb| is guaranteed to not be NULL. On success the callback + * must return 0. Any failure the callback must return a value less than 0. 
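With the renamed usage parameter, any value other than 0 now fails; combined with the VPX_DL_* deadline note above, a minimal init-and-encode sketch:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static int encode_one_rt(vpx_codec_ctx_t *ctx, const vpx_image_t *img) {
      vpx_codec_enc_cfg_t cfg;
      if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0) !=
          VPX_CODEC_OK)
        return -1; /* non-zero usage yields VPX_CODEC_INVALID_PARAM */
      if (vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg, 0) !=
          VPX_CODEC_OK)
        return -1;
      return vpx_codec_encode(ctx, img, 0 /* pts */, 1 /* duration */,
                              0 /* flags */, VPX_DL_REALTIME) == VPX_CODEC_OK
                 ? 0
                 : -1;
    }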
* * \param[in] priv Callback's private data - * \param[in] new_size Size in bytes needed by the buffer + * \param[in] min_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, @@ -80,4 +80,4 @@ typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv, } // extern "C" #endif -#endif // VPX_VPX_FRAME_BUFFER_H_ +#endif // VPX_VPX_VPX_FRAME_BUFFER_H_ diff --git a/libs/libvpx/vpx/vpx_image.h b/libs/libvpx/vpx/vpx_image.h index d6d3166d2f..98be5966a2 100644 --- a/libs/libvpx/vpx/vpx_image.h +++ b/libs/libvpx/vpx/vpx_image.h @@ -12,8 +12,8 @@ * \brief Describes the vpx image descriptor and associated operations * */ -#ifndef VPX_VPX_IMAGE_H_ -#define VPX_VPX_IMAGE_H_ +#ifndef VPX_VPX_VPX_IMAGE_H_ +#define VPX_VPX_VPX_IMAGE_H_ #ifdef __cplusplus extern "C" { @@ -27,7 +27,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/ +#define VPX_IMAGE_ABI_VERSION (5) /**<\hideinitializer*/ #define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ #define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ @@ -37,29 +37,12 @@ extern "C" { /*!\brief List of supported image formats */ typedef enum vpx_img_fmt { VPX_IMG_FMT_NONE, - VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ - VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ - VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ - VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ - VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ - VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ - VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ - VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ - VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ - VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ - VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ - VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ - VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, - VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | - 3, /** < planar 4:2:0 format with vpx color space */ - VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, - VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, @@ -167,21 +150,21 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, * storage for descriptor has been allocated elsewhere, and a descriptor is * desired to "wrap" that storage. * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. - * \param[in] img_data Storage to use for the image + * \param[in] img Pointer to storage for descriptor. If this + * parameter is NULL, the storage for the descriptor + * will be allocated on the heap. 
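A minimal malloc-backed pair matching the callback typedefs documented above; a real application would pool these buffers. They are registered with vpx_codec_set_frame_buffer_functions():

    #include <stdlib.h>
    #include "vpx/vpx_frame_buffer.h"

    static int get_fb(void *priv, size_t min_size,
                      vpx_codec_frame_buffer_t *fb) {
      (void)priv;
      fb->data = (uint8_t *)calloc(min_size, 1);
      if (!fb->data) return -1; /* any value < 0 signals failure */
      fb->size = min_size;
      return 0;
    }

    static int release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
      (void)priv;
      free(fb->data);
      fb->data = NULL;
      return 0;
    }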
+ * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] stride_align Alignment, in bytes, of each row in the image. + * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. */ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, - unsigned int d_h, unsigned int align, + unsigned int d_h, unsigned int stride_align, unsigned char *img_data); /*!\brief Set the rectangle identifying the displayed portion of the image @@ -221,4 +204,4 @@ void vpx_img_free(vpx_image_t *img); } // extern "C" #endif -#endif // VPX_VPX_IMAGE_H_ +#endif // VPX_VPX_VPX_IMAGE_H_ diff --git a/libs/libvpx/vpx/vpx_integer.h b/libs/libvpx/vpx/vpx_integer.h index 09bad9222d..4129d156f8 100644 --- a/libs/libvpx/vpx/vpx_integer.h +++ b/libs/libvpx/vpx/vpx_integer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_INTEGER_H_ -#define VPX_VPX_INTEGER_H_ +#ifndef VPX_VPX_VPX_INTEGER_H_ +#define VPX_VPX_VPX_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include @@ -18,27 +18,12 @@ #define VPX_FORCE_INLINE __forceinline #define VPX_INLINE __inline #else -#define VPX_FORCE_INLINE __inline__ __attribute__(always_inline) +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) // TODO(jbb): Allow a way to force inline off for older compilers. #define VPX_INLINE inline #endif -#if defined(VPX_EMULATE_INTTYPES) -typedef signed char int8_t; -typedef signed short int16_t; -typedef signed int int32_t; - -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - -#ifndef _UINTPTR_T_DEFINED -typedef size_t uintptr_t; -#endif - -#else - -/* Most platforms have the C99 standard integer types. */ +/* Assume platforms have the C99 standard integer types. 
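With the stride_align rename above, wrapping caller-owned I420 storage looks like this; buf must hold at least w*h*3/2 bytes for even dimensions, per the 12-bits-per-pixel entry earlier in this patch:

    #include "vpx/vpx_image.h"

    static void wrap_i420(vpx_image_t *img, unsigned char *buf,
                          unsigned int w, unsigned int h) {
      vpx_img_wrap(img, VPX_IMG_FMT_I420, w, h, 1 /* stride_align */, buf);
    }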
*/ #if defined(__cplusplus) #if !defined(__STDC_FORMAT_MACROS) @@ -49,15 +34,7 @@ typedef size_t uintptr_t; #endif #endif // __cplusplus +#include #include -#endif - -/* VS2010 defines stdint.h, but not inttypes.h */ -#if defined(_MSC_VER) && _MSC_VER < 1800 -#define PRId64 "I64d" -#else -#include -#endif - -#endif // VPX_VPX_INTEGER_H_ +#endif // VPX_VPX_VPX_INTEGER_H_ diff --git a/libs/libvpx/vpx_dsp/add_noise.c b/libs/libvpx/vpx_dsp/add_noise.c index cda6ae8814..6839e97928 100644 --- a/libs/libvpx/vpx_dsp/add_noise.c +++ b/libs/libvpx/vpx_dsp/add_noise.c @@ -52,6 +52,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) { const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a_i) { for (j = 0; j < a_i; ++j) { + if (next + j >= 256) goto set_noise; char_dist[next + j] = (int8_t)i; } next = next + j; @@ -63,6 +64,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) { char_dist[next] = 0; } +set_noise: for (i = 0; i < size; ++i) { noise[i] = char_dist[rand() & 0xff]; // NOLINT } diff --git a/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c index 1370ec2d2e..5afdece0ab 100644 --- a/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c +++ b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -17,8 +17,8 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { - int x, y; - for (y = 0; y < height; ++y) { + int x, y = height; + do { for (x = 0; x < width; x += 16) { const uint8x16_t p = vld1q_u8(pred + x); const uint8x16_t r = vld1q_u8(ref + x); @@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, comp += width; pred += width; ref += ref_stride; - } - } else { - int i; - for (i = 0; i < width * height; i += 16) { + } while (--y); + } else if (width == 8) { + int i = width * height; + do { const uint8x16_t p = vld1q_u8(pred); uint8x16_t r; - - if (width == 4) { - r = load_unaligned_u8q(ref, ref_stride); - ref += 4 * ref_stride; - } else { - const uint8x8_t r_0 = vld1_u8(ref); - const uint8x8_t r_1 = vld1_u8(ref + ref_stride); - assert(width == 8); - r = vcombine_u8(r_0, r_1); - ref += 2 * ref_stride; - } + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; r = vrhaddq_u8(r, p); vst1q_u8(comp, r); pred += 16; comp += 16; - } + i -= 16; + } while (i); + } else { + int i = width * height; + assert(width == 4); + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); } } diff --git a/libs/libvpx/vpx_dsp/arm/deblock_neon.c b/libs/libvpx/vpx_dsp/arm/deblock_neon.c index 1fb41d2992..7efce32735 100644 --- a/libs/libvpx/vpx_dsp/arm/deblock_neon.c +++ b/libs/libvpx/vpx_dsp/arm/deblock_neon.c @@ -91,11 +91,6 @@ void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, int row; int col; - // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for - // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 - // (for U/V). - assert((size == 8 || size == 16) && cols % 8 == 0); - // While columns of length 16 can be processed, load them. 
for (col = 0; col < cols - 8; col += 16) { uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; diff --git a/libs/libvpx/vpx_dsp/arm/fdct_neon.c b/libs/libvpx/vpx_dsp/arm/fdct_neon.c index 04646ed2e0..3708cbb11f 100644 --- a/libs/libvpx/vpx_dsp/arm/fdct_neon.c +++ b/libs/libvpx/vpx_dsp/arm/fdct_neon.c @@ -11,6 +11,7 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" diff --git a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index 8049277b13..374a262b93 100644 --- a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -11,6 +11,7 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 5358839b53..654ab42ca4 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -11,61 +11,37 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t, - int32x4x2_t *const d0, - int32x4x2_t *const d1) { - int32x2x2_t t32[4]; +static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) { + int32x2x2_t t32; - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS); - t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS); - t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS); - t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS); - d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]); - d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]); + t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); } -static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t, - int32x4_t *const d0, - int32x4_t *const d1) { - int32x2x2_t t32[2]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]); - *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]); +static INLINE void dct_const_round_shift_high_4_dual( + const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) { + *d0 = dct_const_round_shift_high_4(in[0]); + *d1 = dct_const_round_shift_high_4(in[1]); } static INLINE int32x4x2_t -highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) { - int32x2x2_t t32[2]; - int32x4x2_t d; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], 
DCT_CONST_BITS); - d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - return d; +dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = dct_const_round_shift_high_4(in[0]); + out.val[1] = dct_const_round_shift_high_4(in[1]); + return out; } -static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) { - int32x2x2_t t32; - - t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS); - t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS); - return vcombine_s32(t32.val[0], t32.val[1]); +static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0); + *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2); } static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, @@ -107,7 +83,7 @@ static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, vget_low_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, @@ -149,7 +125,7 @@ static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, vget_low_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, @@ -191,7 +167,7 @@ static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, vget_low_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, @@ -233,7 +209,7 @@ static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, vget_high_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, @@ -275,7 +251,7 @@ static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, vget_high_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, @@ -317,7 +293,7 @@ static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, vget_high_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_q_kernel( @@ -386,7 +362,7 @@ static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, int64x2x2_t t[4]; highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void 
highbd_idct_cospi_8_24_d(const int32x4_t s0, @@ -397,7 +373,7 @@ static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, int64x2x2_t t[2]; highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, @@ -412,7 +388,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, @@ -425,7 +401,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, @@ -459,7 +435,7 @@ static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, @@ -481,7 +457,7 @@ static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct16x16_add_stage7_dual( @@ -540,62 +516,9 @@ static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, out[15] = vsubq_s32(step2[0], step2[15]); } -static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, - int32_t *output) { - // Save the result into output - vst1q_s32(output + 0, out[0].val[0]); - vst1q_s32(output + 4, out[0].val[1]); - output += 16; - vst1q_s32(output + 0, out[1].val[0]); - vst1q_s32(output + 4, out[1].val[1]); - output += 16; - vst1q_s32(output + 0, out[2].val[0]); - vst1q_s32(output + 4, out[2].val[1]); - output += 16; - vst1q_s32(output + 0, out[3].val[0]); - vst1q_s32(output + 4, out[3].val[1]); - output += 16; - vst1q_s32(output + 0, out[4].val[0]); - vst1q_s32(output + 4, out[4].val[1]); - output += 16; - vst1q_s32(output + 0, out[5].val[0]); - vst1q_s32(output + 4, out[5].val[1]); - output += 16; - vst1q_s32(output + 0, out[6].val[0]); - vst1q_s32(output + 4, out[6].val[1]); - output += 16; - vst1q_s32(output + 0, out[7].val[0]); - vst1q_s32(output + 4, out[7].val[1]); - output += 16; - vst1q_s32(output + 0, out[8].val[0]); - vst1q_s32(output + 4, out[8].val[1]); - output += 16; - vst1q_s32(output + 0, out[9].val[0]); - vst1q_s32(output + 4, out[9].val[1]); - output += 16; - vst1q_s32(output + 0, out[10].val[0]); - vst1q_s32(output + 4, out[10].val[1]); - output += 16; - vst1q_s32(output + 0, out[11].val[0]); - vst1q_s32(output + 4, out[11].val[1]); - output += 16; - vst1q_s32(output + 0, out[12].val[0]); - vst1q_s32(output + 4, out[12].val[1]); - output += 16; - vst1q_s32(output + 0, out[13].val[0]); - 
vst1q_s32(output + 4, out[13].val[1]); - output += 16; - vst1q_s32(output + 0, out[14].val[0]); - vst1q_s32(output + 4, out[14].val[1]); - output += 16; - vst1q_s32(output + 0, out[15].val[0]); - vst1q_s32(output + 4, out[15].val[1]); -} - -static void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, - int32_t *output, uint16_t *dest, - const int stride, - const int bd) { +void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); @@ -815,7 +738,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, @@ -824,7 +747,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, @@ -835,7 +758,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, @@ -844,7 +767,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, @@ -1003,8 +926,8 @@ static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, } } -void vpx_highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, - int32_t *output) { +static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); @@ -1142,10 +1065,11 @@ void vpx_highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, vst1q_s32(output, out[15]); } -void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, - int32_t *const output, - uint16_t *const dest, - const int stride, const int bd) { +static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, + const int bd) { const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); @@ -1366,16 +1290,16 @@ void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, // pass 1 // Parallel idct on the upper 8 
rows - vpx_highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); // pass 2 // Parallel idct to get the left 8 columns - vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, - stride, bd); + highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, + bd); // Parallel idct to get the right 8 columns - vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, - dest + 8, stride, bd); + highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); } } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 96a55c472f..5b36f73367 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -124,83 +124,77 @@ static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); } -static INLINE void load_s32x4q_dual( - const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1, - int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4, - int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) { - s0->val[0] = vld1q_s32(in); - s0->val[1] = vld1q_s32(in + 4); +static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) { + s[0].val[0] = vld1q_s32(in); + s[0].val[1] = vld1q_s32(in + 4); in += 32; - s1->val[0] = vld1q_s32(in); - s1->val[1] = vld1q_s32(in + 4); + s[1].val[0] = vld1q_s32(in); + s[1].val[1] = vld1q_s32(in + 4); in += 32; - s2->val[0] = vld1q_s32(in); - s2->val[1] = vld1q_s32(in + 4); + s[2].val[0] = vld1q_s32(in); + s[2].val[1] = vld1q_s32(in + 4); in += 32; - s3->val[0] = vld1q_s32(in); - s3->val[1] = vld1q_s32(in + 4); + s[3].val[0] = vld1q_s32(in); + s[3].val[1] = vld1q_s32(in + 4); in += 32; - s4->val[0] = vld1q_s32(in); - s4->val[1] = vld1q_s32(in + 4); + s[4].val[0] = vld1q_s32(in); + s[4].val[1] = vld1q_s32(in + 4); in += 32; - s5->val[0] = vld1q_s32(in); - s5->val[1] = vld1q_s32(in + 4); + s[5].val[0] = vld1q_s32(in); + s[5].val[1] = vld1q_s32(in + 4); in += 32; - s6->val[0] = vld1q_s32(in); - s6->val[1] = vld1q_s32(in + 4); + s[6].val[0] = vld1q_s32(in); + s[6].val[1] = vld1q_s32(in + 4); in += 32; - s7->val[0] = vld1q_s32(in); - s7->val[1] = vld1q_s32(in + 4); + s[7].val[0] = vld1q_s32(in); + s[7].val[1] = vld1q_s32(in + 4); } -static INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1, - int32x4x2_t a2, int32x4x2_t a3, - int32x4x2_t a4, int32x4x2_t a5, - int32x4x2_t a6, int32x4x2_t a7, +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a, int32_t **out) { - transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - vst1q_s32(*out, a0.val[0]); + vst1q_s32(*out, a[0].val[0]); *out += 4; - vst1q_s32(*out, a0.val[1]); + vst1q_s32(*out, a[0].val[1]); *out += 4; - vst1q_s32(*out, a1.val[0]); + vst1q_s32(*out, a[1].val[0]); *out += 4; - vst1q_s32(*out, a1.val[1]); + vst1q_s32(*out, a[1].val[1]); *out += 4; - vst1q_s32(*out, a2.val[0]); + vst1q_s32(*out, a[2].val[0]); *out += 4; - vst1q_s32(*out, a2.val[1]); + vst1q_s32(*out, a[2].val[1]); *out += 4; - vst1q_s32(*out, a3.val[0]); + vst1q_s32(*out, a[3].val[0]); *out += 4; - vst1q_s32(*out, a3.val[1]); + vst1q_s32(*out, a[3].val[1]); *out += 4; - vst1q_s32(*out, a4.val[0]); + vst1q_s32(*out, a[4].val[0]); *out += 4; - 
vst1q_s32(*out, a4.val[1]); + vst1q_s32(*out, a[4].val[1]); *out += 4; - vst1q_s32(*out, a5.val[0]); + vst1q_s32(*out, a[5].val[0]); *out += 4; - vst1q_s32(*out, a5.val[1]); + vst1q_s32(*out, a[5].val[1]); *out += 4; - vst1q_s32(*out, a6.val[0]); + vst1q_s32(*out, a[6].val[0]); *out += 4; - vst1q_s32(*out, a6.val[1]); + vst1q_s32(*out, a[6].val[1]); *out += 4; - vst1q_s32(*out, a7.val[0]); + vst1q_s32(*out, a[7].val[0]); *out += 4; - vst1q_s32(*out, a7.val[1]); + vst1q_s32(*out, a[7].val[1]); *out += 4; } static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { int i; - int32x4x2_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4x2_t s[8]; for (i = 0; i < 4; i++, input += 8) { - load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + load_s32x4q_dual(input, s); + transpose_and_store_s32_8x8(s, &t_buf); } } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c index 3970a5a861..6750c1a426 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c index 5d9063b15d..f05932cec3 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 1418a75a15..7be1dad1d3 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -11,27 +11,10 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { - const uint16x4_t a0 = vld1_u16(*dest); - const uint16x4_t a1 = vld1_u16(*dest + stride); - const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); - // Note: In some profile tests, res is quite close to +/-32767. - // We use saturating addition.
- const int16x8_t b = vqaddq_s16(res, a); - const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1_u16(*dest, vget_low_u16(d)); - *dest += stride; - vst1_u16(*dest, vget_high_u16(d)); - *dest += stride; -} - // res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, @@ -65,109 +48,42 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); } -static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); - b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); - b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); - b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); - b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); - b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); - b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); - b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); - b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); - b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - -static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); - c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); - c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); - c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); - c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); - c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); - c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); - c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); - c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); - c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); - c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); - c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); - c4 = vsubq_s64(c4, c8); - c5 = vsubq_s64(c5, c9); - c6 = vaddq_s64(c6, c10); - c7 = vaddq_s64(c7, c11); - b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), - vrshrn_n_s64(c1, DCT_CONST_BITS)); - b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), - vrshrn_n_s64(c3, DCT_CONST_BITS)); - b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), - vrshrn_n_s64(c5, DCT_CONST_BITS)); - b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), - vrshrn_n_s64(c7, DCT_CONST_BITS)); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - int32x4_t c0 = vld1q_s32(input); - int32x4_t c1 = vld1q_s32(input + 4); - int32x4_t c2 = vld1q_s32(input + 8); - int32x4_t c3 = vld1q_s32(input + 12); - int16x8_t a0, a1; + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = 
vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); if (bd == 8) { - const int16x4_t cospis = vld1_s16(kCospi); - // Rows - a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); - a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); } else { - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); } - a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); - a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4)); } - highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); - highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max); } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index dd90134a6e..bed3227ca7 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" @@ -127,7 +128,7 @@ static INLINE void idct8x8_12_half1d_bd12( int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t input1l, input1h, input3l, input3h; int32x2_t step1l[2], step1h[2]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -136,23 +137,23 @@ static INLINE void idct8x8_12_half1d_bd12( transpose_s32_4x4(io0, io1, io2, io3); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); step1h[1] = vget_high_s32(*io2); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = 
vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); @@ -222,82 +223,15 @@ static INLINE void idct8x8_12_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint16_t *dest, - const int stride, const int bd) { - const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const uint16_t *dst = dest; - uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; - - d0 = vld1q_u16(dst); - dst += stride; - d1 = vld1q_u16(dst); - dst += stride; - d2 = vld1q_u16(dst); - dst += stride; - d3 = vld1q_u16(dst); - dst += stride; - d4 = vld1q_u16(dst); - dst += stride; - d5 = vld1q_u16(dst); - dst += stride; - d6 = vld1q_u16(dst); - dst += stride; - d7 = vld1q_u16(dst); - - d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); - d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); - d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); - d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); - d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); - d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); - d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); - d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); - - d0_s16 = vminq_s16(d0_s16, max); - d1_s16 = vminq_s16(d1_s16, max); - d2_s16 = vminq_s16(d2_s16, max); - d3_s16 = vminq_s16(d3_s16, max); - d4_s16 = vminq_s16(d4_s16, max); - d5_s16 = vminq_s16(d5_s16, max); - d6_s16 = vminq_s16(d6_s16, max); - d7_s16 = vminq_s16(d7_s16, max); - d0_u16 = vqshluq_n_s16(d0_s16, 0); - d1_u16 = vqshluq_n_s16(d1_s16, 0); - d2_u16 = vqshluq_n_s16(d2_s16, 0); - d3_u16 = vqshluq_n_s16(d3_s16, 0); - d4_u16 = vqshluq_n_s16(d4_s16, 0); - d5_u16 = vqshluq_n_s16(d5_s16, 0); - d6_u16 = vqshluq_n_s16(d6_s16, 0); - d7_u16 = vqshluq_n_s16(d7_s16, 0); - - vst1q_u16(dest, d0_u16); - dest += stride; - vst1q_u16(dest, d1_u16); - dest += stride; - vst1q_u16(dest, d2_u16); - dest += stride; - vst1q_u16(dest, d3_u16); - dest += stride; - vst1q_u16(dest, d4_u16); - dest += stride; - vst1q_u16(dest, d5_u16); - dest += stride; - vst1q_u16(dest, d6_u16); - dest += stride; - vst1q_u16(dest, d7_u16); -} - void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 8); - int32x4_t a2 = vld1q_s32(input + 16); - int32x4_t a3 = vld1q_s32(input + 24); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 8); + a[2] = vld1q_s32(input + 16); + a[3] = vld1q_s32(input + 24); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); @@ -305,327 +239,133 @@ void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, 
uint16_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t b0 = vmovn_s32(a0); - int16x4_t b1 = vmovn_s32(a1); - int16x4_t b2 = vmovn_s32(a2); - int16x4_t b3 = vmovn_s32(a3); - int16x4_t b4, b5, b6, b7; + int16x4_t b[8]; - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, - &b5, &b6, &b7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, - b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); - c0 = vrshrq_n_s16(c0, 5); - c1 = vrshrq_n_s16(c1, 5); - c2 = vrshrq_n_s16(c2, 5); - c3 = vrshrq_n_s16(c3, 5); - c4 = vrshrq_n_s16(c4, 5); - c5 = vrshrq_n_s16(c5, 5); - c6 = vrshrq_n_s16(c6, 5); - c7 = vrshrq_n_s16(c7, 5); + b[0] = vmovn_s32(a[0]); + b[1] = vmovn_s32(a[1]); + b[2] = vmovn_s32(a[2]); + b[3] = vmovn_s32(a[3]); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c); + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; if (bd == 10) { - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } else { - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[4] = 
vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); -} - -static INLINE void idct8x8_64_half1d_bd10( - const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, - int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, - int32x4_t *const io7) { - int32x4_t step1[8], step2[8]; - - transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); - - // stage 1 - step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); - step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); - step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); - step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); - - step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); - step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); - step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); - step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); - - step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); - step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); - step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); - step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); - - // stage 2 - step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); - step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); - step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); - - step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); - step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); - step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); - step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); - - step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); - step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); - step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); - step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); - - step2[4] = vaddq_s32(step1[4], step1[5]); - step2[5] = vsubq_s32(step1[4], step1[5]); - step2[6] = vsubq_s32(step1[7], step1[6]); - step2[7] = vaddq_s32(step1[7], step1[6]); - - // stage 3 - step1[0] = vaddq_s32(step2[0], step2[3]); - step1[1] = vaddq_s32(step2[1], step2[2]); - step1[2] = vsubq_s32(step2[1], step2[2]); - step1[3] = vsubq_s32(step2[0], step2[3]); - - step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); - step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); - step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); - - // stage 4 - *io0 = vaddq_s32(step1[0], step2[7]); - *io1 = vaddq_s32(step1[1], step1[6]); - *io2 = vaddq_s32(step1[2], step1[5]); - *io3 = vaddq_s32(step1[3], step2[4]); - *io4 = vsubq_s32(step1[3], step2[4]); - *io5 = vsubq_s32(step1[2], step1[5]); - *io6 = vsubq_s32(step1[1], step1[6]); - *io7 = vsubq_s32(step1[0], step2[7]); -} - -static INLINE void idct8x8_64_half1d_bd12( - const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, - int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, - int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h, 
input_5l, input_5h, - input_7l, input_7h; - int32x2_t step1l[4], step1h[4]; - int32x4_t step1[8], step2[8]; - int64x2_t t64[8]; - int32x2_t t32[8]; - - transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); - - // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); - input_5l = vget_low_s32(*io5); - input_5h = vget_high_s32(*io5); - input_7l = vget_low_s32(*io7); - input_7h = vget_high_s32(*io7); - step1l[0] = vget_low_s32(*io0); - step1h[0] = vget_high_s32(*io0); - step1l[1] = vget_low_s32(*io2); - step1h[1] = vget_high_s32(*io2); - step1l[2] = vget_low_s32(*io4); - step1h[2] = vget_high_s32(*io4); - step1l[3] = vget_low_s32(*io6); - step1h[3] = vget_high_s32(*io6); - - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); - t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); - t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); - t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); - t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); - t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); - t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); - t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); - t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); - t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); - t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); - t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); - t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); - t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); - t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); - t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); - step1[4] = vcombine_s32(t32[0], t32[1]); - step1[5] = vcombine_s32(t32[2], t32[3]); - step1[6] = vcombine_s32(t32[4], t32[5]); - step1[7] = vcombine_s32(t32[6], t32[7]); - - // stage 2 - t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); - t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); - t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); - t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); - t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); - t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); - t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); - t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); - t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); - t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); - t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); - t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); - t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); - t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); - t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); - t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); - t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); - 
t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); - t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); - t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); - t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); - t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); - step2[0] = vcombine_s32(t32[0], t32[1]); - step2[1] = vcombine_s32(t32[2], t32[3]); - step2[2] = vcombine_s32(t32[4], t32[5]); - step2[3] = vcombine_s32(t32[6], t32[7]); - - step2[4] = vaddq_s32(step1[4], step1[5]); - step2[5] = vsubq_s32(step1[4], step1[5]); - step2[6] = vsubq_s32(step1[7], step1[6]); - step2[7] = vaddq_s32(step1[7], step1[6]); - - // stage 3 - step1[0] = vaddq_s32(step2[0], step2[3]); - step1[1] = vaddq_s32(step2[1], step2[2]); - step1[2] = vsubq_s32(step2[1], step2[2]); - step1[3] = vsubq_s32(step2[0], step2[3]); - - t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); - t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); - t64[0] = - vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); - t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), - vget_high_s32(cospis0), 0); - t64[2] = - vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); - t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), - vget_high_s32(cospis0), 0); - t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); - t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); - t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); - t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); - step1[5] = vcombine_s32(t32[0], t32[1]); - step1[6] = vcombine_s32(t32[2], t32[3]); - - // stage 4 - *io0 = vaddq_s32(step1[0], step2[7]); - *io1 = vaddq_s32(step1[1], step1[6]); - *io2 = vaddq_s32(step1[2], step1[5]); - *io3 = vaddq_s32(step1[3], step2[4]); - *io4 = vsubq_s32(step1[3], step2[4]); - *io5 = vsubq_s32(step1[2], step1[5]); - *io6 = vsubq_s32(step1[1], step1[6]); - *io7 = vsubq_s32(step1[0], step2[7]); + highbd_add8x8(c, dest, stride, bd); } void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 4); - int32x4_t a2 = vld1q_s32(input + 8); - int32x4_t a3 = vld1q_s32(input + 12); - int32x4_t a4 = vld1q_s32(input + 16); - int32x4_t a5 = vld1q_s32(input + 20); - int32x4_t a6 = vld1q_s32(input + 24); - int32x4_t a7 = vld1q_s32(input + 28); - int32x4_t a8 = vld1q_s32(input + 32); - int32x4_t a9 = vld1q_s32(input + 36); - int32x4_t a10 = vld1q_s32(input + 40); - int32x4_t a11 = vld1q_s32(input + 44); - int32x4_t a12 = vld1q_s32(input + 48); - int32x4_t a13 = vld1q_s32(input + 52); - int32x4_t a14 = vld1q_s32(input + 56); - int32x4_t a15 = vld1q_s32(input + 60); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); - 
int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); - int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); - int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); - int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); - int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); - int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); - int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); + int16x8_t b[8]; - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); - c0 = vrshrq_n_s16(b0, 5); - c1 = vrshrq_n_s16(b1, 5); - c2 = vrshrq_n_s16(b2, 5); - c3 = vrshrq_n_s16(b3, 5); - c4 = vrshrq_n_s16(b4, 5); - c5 = vrshrq_n_s16(b5, 5); - c6 = vrshrq_n_s16(b6, 5); - c7 = vrshrq_n_s16(b7, 5); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + + c[0] = vrshrq_n_s16(b[0], 5); + c[1] = vrshrq_n_s16(b[1], 5); + c[2] = vrshrq_n_s16(b[2], 5); + c[3] = vrshrq_n_s16(b[3], 5); + c[4] = vrshrq_n_s16(b[4], 5); + c[5] = vrshrq_n_s16(b[5], 5); + c[6] = vrshrq_n_s16(b[6], 5); + c[7] = vrshrq_n_s16(b[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } else { - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 
5)); - c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h new file mode 100644 index 0000000000..518ef4336e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition.
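+ // vqaddq_s16() clamps to the int16_t range instead of wrapping; + // vminq_s16() and vqshluq_n_s16() below then clamp the sum to the + // [0, max] pixel range.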
+ const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + int64x2_t c[12]; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1); + c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1); + c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1); + c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1); + c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1); + c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1); + c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1); + c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1); + c[4] = vsubq_s64(c[4], c[8]); + c[5] = vsubq_s64(c[5], c[9]); + c[6] = vaddq_s64(c[6], c[10]); + c[7] = vaddq_s64(c[7], c[11]); + b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS), + vrshrn_n_s64(c[5], DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS), + vrshrn_n_s64(c[7], DCT_CONST_BITS)); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a[0], 
vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], 
step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + input5l = vget_low_s32(*io5); + input5h = vget_high_s32(*io5); + input7l = vget_low_s32(*io7); + input7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = 
vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + 
vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +void 
vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd); + +#endif // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index 5c5963d277..fc7f4a7747 100644 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -63,65 +63,6 @@ static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, wrap_low_4x2(t32, d0, d1); } -static INLINE void idct16x16_add_store(const int16x8_t *const out, - uint8_t *dest, const int stride) { - // Add the result to dest - idct16x16_add8x1(out[0], &dest, stride); - idct16x16_add8x1(out[1], &dest, stride); - idct16x16_add8x1(out[2], &dest, stride); - idct16x16_add8x1(out[3], &dest, stride); - idct16x16_add8x1(out[4], &dest, stride); - idct16x16_add8x1(out[5], &dest, stride); - idct16x16_add8x1(out[6], &dest, stride); - idct16x16_add8x1(out[7], &dest, stride); - idct16x16_add8x1(out[8], &dest, stride); - idct16x16_add8x1(out[9], &dest, stride); - idct16x16_add8x1(out[10], &dest, stride); - idct16x16_add8x1(out[11], &dest, stride); - idct16x16_add8x1(out[12], &dest, stride); - idct16x16_add8x1(out[13], &dest, stride); - idct16x16_add8x1(out[14], &dest, stride); - idct16x16_add8x1(out[15], &dest, stride); -} - -static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, - const int stride) { - // Add the result to dest - const int16x8_t max = vdupq_n_s16((1 << 8) - 1); - out[0] = vrshrq_n_s16(out[0], 6); - out[1] = vrshrq_n_s16(out[1], 6); - out[2] = vrshrq_n_s16(out[2], 6); - out[3] = vrshrq_n_s16(out[3], 6); - out[4] = vrshrq_n_s16(out[4], 6); - out[5] = vrshrq_n_s16(out[5], 6); - out[6] = vrshrq_n_s16(out[6], 6); - out[7] = vrshrq_n_s16(out[7], 6); - out[8] = vrshrq_n_s16(out[8], 6); - out[9] = vrshrq_n_s16(out[9], 6); - out[10] = vrshrq_n_s16(out[10], 6); - out[11] = vrshrq_n_s16(out[11], 6); - out[12] = vrshrq_n_s16(out[12], 6); - out[13] = vrshrq_n_s16(out[13], 6); - out[14] = vrshrq_n_s16(out[14], 6); - out[15] = vrshrq_n_s16(out[15], 6); - highbd_idct16x16_add8x1(out[0], max, &dest, stride); - highbd_idct16x16_add8x1(out[1], max, &dest, stride); - highbd_idct16x16_add8x1(out[2], max, &dest, stride); - highbd_idct16x16_add8x1(out[3], max, &dest, stride); - highbd_idct16x16_add8x1(out[4], max, &dest, stride); - highbd_idct16x16_add8x1(out[5], max, &dest, stride); - highbd_idct16x16_add8x1(out[6], max, &dest, stride); - highbd_idct16x16_add8x1(out[7], max, &dest, stride); - highbd_idct16x16_add8x1(out[8], max, &dest, stride); - highbd_idct16x16_add8x1(out[9], max, &dest, stride); - highbd_idct16x16_add8x1(out[10], max, &dest, stride); - highbd_idct16x16_add8x1(out[11], max, &dest, stride); - highbd_idct16x16_add8x1(out[12], max, &dest, stride); - highbd_idct16x16_add8x1(out[13], max, &dest, stride); - highbd_idct16x16_add8x1(out[14], max, &dest, stride); - highbd_idct16x16_add8x1(out[15], max, &dest, stride); -} - void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, void *const dest, const int stride, const int highbd_flag) { diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c index 021211bc99..057731ad92 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -650,14 +650,10 @@ void vpx_idct32_16_neon(const int16_t *const input, void *const output, 
highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c index f3c336fa31..f570547e44 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -490,14 +490,10 @@ void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 673a36840e..8192ee4cf8 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -19,44 +19,41 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const uint8_t *dst = dest; - const int16x4_t cospis = vld1_s16(kCospi); - uint8x8_t dest01_u8; - uint32x2_t dest32_u32 = vdup_n_u32(0); - int16x8_t a0, a1; - uint8x8_t d01, d32; - uint16x8_t d01_u16, d32_u16; + uint32x2_t s32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s, d[2]; + uint16x8_t sum[2]; assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % sizeof(uint32_t))); // Rows - a0 = load_tran_low_to_s16q(input); - a1 = load_tran_low_to_s16q(input + 8); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); - dest01_u8 = load_u8(dst, stride); + s = load_u8(dst, stride); dst += 2 * stride; // The elements are loaded in reverse order. 
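// Row 2 is loaded into lane 1 and row 3 into lane 0, mirroring the reversed store order below.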
- dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1); dst += stride; - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0); - d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); - d32_u16 = - vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); - d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); - d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32)); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); - store_u8(dest, stride, d01); + store_u8(dest, stride, d[0]); dest += 2 * stride; // The elements are stored in reverse order. - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1); dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 1121ade279..7471387e47 100644 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -17,91 +17,25 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint8_t *dest, - const int stride) { - const uint8_t *dst = dest; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - - a0 = vrshrq_n_s16(a0, 5); - a1 = vrshrq_n_s16(a1, 5); - a2 = vrshrq_n_s16(a2, 5); - a3 = vrshrq_n_s16(a3, 5); - a4 = vrshrq_n_s16(a4, 5); - a5 = vrshrq_n_s16(a5, 5); - a6 = vrshrq_n_s16(a6, 5); - a7 = vrshrq_n_s16(a7, 5); - - d0 = vld1_u8(dst); - dst += stride; - d1 = vld1_u8(dst); - dst += stride; - d2 = vld1_u8(dst); - dst += stride; - d3 = vld1_u8(dst); - dst += stride; - d4 = vld1_u8(dst); - dst += stride; - d5 = vld1_u8(dst); - dst += stride; - d6 = vld1_u8(dst); - dst += stride; - d7 = vld1_u8(dst); - - d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); - d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); - d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); - d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); - d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); - d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); - d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); - d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - - d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); - d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); - d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); - d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); - d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); - d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); - d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); - d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - - vst1_u8(dest, d0); - dest += stride; - vst1_u8(dest, d1); - dest += stride; - vst1_u8(dest, d2); - dest += stride; - vst1_u8(dest, d3); - dest += stride; - vst1_u8(dest, d4); - dest += stride; - vst1_u8(dest, d5); - dest += stride; - vst1_u8(dest, d6); - dest += stride; - vst1_u8(dest, d7); -} - void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16x8_t 
cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t a0 = load_tran_low_to_s16q(input); - int16x8_t a1 = load_tran_low_to_s16q(input + 8); - int16x8_t a2 = load_tran_low_to_s16q(input + 16); - int16x8_t a3 = load_tran_low_to_s16q(input + 24); - int16x8_t a4 = load_tran_low_to_s16q(input + 32); - int16x8_t a5 = load_tran_low_to_s16q(input + 40); - int16x8_t a6 = load_tran_low_to_s16q(input + 48); - int16x8_t a7 = load_tran_low_to_s16q(input + 56); + int16x8_t a[8]; - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + a[2] = load_tran_low_to_s16q(input + 16); + a[3] = load_tran_low_to_s16q(input + 24); + a[4] = load_tran_low_to_s16q(input + 32); + a[5] = load_tran_low_to_s16q(input + 40); + a[6] = load_tran_low_to_s16q(input + 48); + a[7] = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_add8x8_neon(a, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, @@ -111,17 +45,15 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; - int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x4_t a[8]; + int16x8_t b[8]; - a0 = load_tran_low_to_s16d(input); - a1 = load_tran_low_to_s16d(input + 8); - a2 = load_tran_low_to_s16d(input + 16); - a3 = load_tran_low_to_s16d(input + 24); + a[0] = load_tran_low_to_s16d(input); + a[1] = load_tran_low_to_s16d(input + 8); + a[2] = load_tran_low_to_s16d(input + 16); + a[3] = load_tran_low_to_s16d(input + 24); - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, - &a5, &a6, &a7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, - a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); + idct8x8_add8x8_neon(b, dest, stride); } diff --git a/libs/libvpx/vpx_dsp/arm/idct_neon.h b/libs/libvpx/vpx_dsp/arm/idct_neon.h index 6ed02af5ac..c02311326b 100644 --- a/libs/libvpx/vpx_dsp/arm/idct_neon.h +++ b/libs/libvpx/vpx_dsp/arm/idct_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_ARM_IDCT_NEON_H_ -#define VPX_DSP_ARM_IDCT_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_IDCT_NEON_H_ #include <arm_neon.h> @@ -78,6 +78,28 @@ static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, //------------------------------------------------------------------------------ +static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) { + return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS), + vrshrn_n_s32(in[1], DCT_CONST_BITS)); +} + +static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + *d0 = dct_const_round_shift_low_8(t32 + 0); + *d1 = dct_const_round_shift_low_8(t32 + 2); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2(const int64x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS), + vrshrn_n_s64(in[3], DCT_CONST_BITS)); + return out; +} + // Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { @@ -102,24 +124,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16( // input) this function can not use vaddq_s16. // In order to match existing behavior and intentionally out of range tests, // expand the addition up to 32 bits to prevent truncation. - int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( const int16x8_t a, const int16x8_t b, const int16_t ab_const) { - int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Multiply a by a_const and b by b_const, then accumulate.
Shift and narrow by @@ -127,12 +149,12 @@ static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( const int16x8_t a, const int16_t a_const, const int16x8_t b, const int16_t b_const) { - int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); - int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); - temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); - temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vmull_n_s16(vget_low_s16(a), a_const); + t[1] = vmull_n_s16(vget_high_s16(a), a_const); + t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const); + t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const); + return dct_const_round_shift_low_8(t); } //------------------------------------------------------------------------------ @@ -145,53 +167,43 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( static INLINE int32x4x2_t multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { int64x2_t b[4]; - int32x4x2_t c; + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); - c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS), - vrshrn_n_s64(b[1], DCT_CONST_BITS)); - c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS), - vrshrn_n_s64(b[3], DCT_CONST_BITS)); - return c; + return dct_const_round_shift_high_4x2(b); } // Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vaddq_s32(a.val[0], b.val[0]); + t[1] = vaddq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. 
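For reference before the s32 variant below: a scalar model of the multiply/shift/narrow pattern these helpers share (a sketch, assuming DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h; vrshrn_n_s32/vrshrn_n_s64 add 2^(DCT_CONST_BITS - 1) before shifting, i.e. they round to nearest).

#include <stdint.h>

#define DCT_CONST_BITS 14

/* Per-lane model of e.g. add_multiply_shift_and_narrow_s16():
 * widen, multiply, then round-shift back down by DCT_CONST_BITS. */
static int16_t multiply_round_shift_scalar(int32_t sum, int16_t ab_const) {
  const int64_t product = (int64_t)sum * ab_const;
  return (int16_t)((product + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}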
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vsubq_s32(a.val[0], b.val[0]); + t[1] = vsubq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by @@ -200,7 +212,6 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, const int32_t b_const) { int64x2_t c[4]; - int32x4x2_t d; c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); @@ -209,72 +220,66 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + return dct_const_round_shift_high_4x2(c); } // Shift the output down by 6 and add it to the destination buffer. 
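In scalar terms, the add-and-store used by the idct32 paths shown earlier behaves like this sketch of the equivalent C (the 2^6 scale is left over from the two 1-D transform passes; vrsraq_n_s16(b, a, 6) fuses the rounded shift with the add, and vqmovun_s16 saturates back to 8 bits).

#include <stdint.h>

/* One-pixel model of add_and_store_u8_s16():
 * vrsraq_n_s16(b, a, 6) computes b + ((a + 32) >> 6). */
static uint8_t add_residual_bd8(uint8_t pred, int16_t resid) {
  const int sum = pred + ((resid + 32) >> 6);
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}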
-static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, - const int16x8_t a2, const int16x8_t a3, - const int16x8_t a4, const int16x8_t a5, - const int16x8_t a6, const int16x8_t a7, - uint8_t *b, const int b_stride) { - uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; - b0 = vld1_u8(b); - b += b_stride; - b1 = vld1_u8(b); - b += b_stride; - b2 = vld1_u8(b); - b += b_stride; - b3 = vld1_u8(b); - b += b_stride; - b4 = vld1_u8(b); - b += b_stride; - b5 = vld1_u8(b); - b += b_stride; - b6 = vld1_u8(b); - b += b_stride; - b7 = vld1_u8(b); - b -= (7 * b_stride); +static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d, + const int stride) { + uint8x8_t b[8]; + int16x8_t c[8]; + + b[0] = vld1_u8(d); + d += stride; + b[1] = vld1_u8(d); + d += stride; + b[2] = vld1_u8(d); + d += stride; + b[3] = vld1_u8(d); + d += stride; + b[4] = vld1_u8(d); + d += stride; + b[5] = vld1_u8(d); + d += stride; + b[6] = vld1_u8(d); + d += stride; + b[7] = vld1_u8(d); + d -= (7 * stride); // c = b + (a >> 6) - c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); - c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); - c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); - c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); - c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); - c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); - c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); - c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6); + c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6); + c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6); + c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6); + c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6); + c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6); + c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6); + c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6); - b0 = vqmovun_s16(c0); - b1 = vqmovun_s16(c1); - b2 = vqmovun_s16(c2); - b3 = vqmovun_s16(c3); - b4 = vqmovun_s16(c4); - b5 = vqmovun_s16(c5); - b6 = vqmovun_s16(c6); - b7 = vqmovun_s16(c7); + b[0] = vqmovun_s16(c[0]); + b[1] = vqmovun_s16(c[1]); + b[2] = vqmovun_s16(c[2]); + b[3] = vqmovun_s16(c[3]); + b[4] = vqmovun_s16(c[4]); + b[5] = vqmovun_s16(c[5]); + b[6] = vqmovun_s16(c[6]); + b[7] = vqmovun_s16(c[7]); - vst1_u8(b, b0); - b += b_stride; - vst1_u8(b, b1); - b += b_stride; - vst1_u8(b, b2); - b += b_stride; - vst1_u8(b, b3); - b += b_stride; - vst1_u8(b, b4); - b += b_stride; - vst1_u8(b, b5); - b += b_stride; - vst1_u8(b, b6); - b += b_stride; - vst1_u8(b, b7); + vst1_u8(d, b[0]); + d += stride; + vst1_u8(d, b[1]); + d += stride; + vst1_u8(d, b[2]); + d += stride; + vst1_u8(d, b[3]); + d += stride; + vst1_u8(d, b[4]); + d += stride; + vst1_u8(d, b[5]); + d += stride; + vst1_u8(d, b[6]); + d += stride; + vst1_u8(d, b[7]); } static INLINE uint8x16_t create_dcq(const int16_t dc) { @@ -283,56 +288,53 @@ static INLINE uint8x16_t create_dcq(const int16_t dc) { return vdupq_n_u8((uint8_t)t); } -static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, - int16x8_t *const a0, - int16x8_t *const a1) { - int16x4_t b0, b1, b2, b3; - int32x4_t c0, c1, c2, c3; - int16x8_t d0, d1; +static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) { + const int16x4_t 
cospis = vld1_s16(kCospi); + int16x4_t b[4]; + int32x4_t c[4]; + int16x8_t d[2]; - transpose_s16_4x4q(a0, a1); - b0 = vget_low_s16(*a0); - b1 = vget_high_s16(*a0); - b2 = vget_low_s16(*a1); - b3 = vget_high_s16(*a1); - c0 = vmull_lane_s16(b0, cospis, 2); - c2 = vmull_lane_s16(b1, cospis, 2); - c1 = vsubq_s32(c0, c2); - c0 = vaddq_s32(c0, c2); - c2 = vmull_lane_s16(b2, cospis, 3); - c3 = vmull_lane_s16(b2, cospis, 1); - c2 = vmlsl_lane_s16(c2, b3, cospis, 1); - c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); - b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); - b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); - b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); - d0 = vcombine_s16(b0, b1); - d1 = vcombine_s16(b3, b2); - *a0 = vaddq_s16(d0, d1); - *a1 = vsubq_s16(d0, d1); + b[0] = vget_low_s16(a[0]); + b[1] = vget_high_s16(a[0]); + b[2] = vget_low_s16(a[1]); + b[3] = vget_high_s16(a[1]); + c[0] = vmull_lane_s16(b[0], cospis, 2); + c[2] = vmull_lane_s16(b[1], cospis, 2); + c[1] = vsubq_s32(c[0], c[2]); + c[0] = vaddq_s32(c[0], c[2]); + c[3] = vmull_lane_s16(b[2], cospis, 3); + c[2] = vmull_lane_s16(b[2], cospis, 1); + c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1); + c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3); + dct_const_round_shift_low_8_dual(c, &d[0], &d[1]); + a[0] = vaddq_s16(d[0], d[1]); + a[1] = vsubq_s16(d[0], d[1]); } -static INLINE void idct8x8_12_pass1_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, - int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, - int16x4_t *const io6, int16x4_t *const io7) { +static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) { + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); +} + +static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + int16x4_t *const io) { int16x4_t step1[8], step2[8]; int32x4_t t32[2]; - transpose_s16_4x4d(io0, io1, io2, io3); + transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]); // stage 1 - step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); - step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); - step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); - step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0); // stage 2 - step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); - step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); - step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1); step2[4] = vadd_s16(step1[4], step1[5]); step2[5] = vsub_s16(step1[4], step1[5]); @@ -352,32 +354,27 @@ static INLINE void idct8x8_12_pass1_bd8( step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); // stage 4 - *io0 = vadd_s16(step1[0], step2[7]); - *io1 = vadd_s16(step1[1], step1[6]); - *io2 = vadd_s16(step1[2], step1[5]); - *io3 = vadd_s16(step1[3], step2[4]); - *io4 = vsub_s16(step1[3], step2[4]); - *io5 = vsub_s16(step1[2], step1[5]); - *io6 = vsub_s16(step1[1], step1[6]); - *io7 = vsub_s16(step1[0], step2[7]); + io[0] = vadd_s16(step1[0], step2[7]); + io[1] = vadd_s16(step1[1], step1[6]); + io[2] = vadd_s16(step1[2], step1[5]); + io[3] = vadd_s16(step1[3], step2[4]); + io[4] = vsub_s16(step1[3], step2[4]); + io[5] = 
vsub_s16(step1[2], step1[5]); + io[6] = vsub_s16(step1[1], step1[6]); + io[7] = vsub_s16(step1[0], step2[7]); } -static INLINE void idct8x8_12_pass2_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - const int16x4_t input0, const int16x4_t input1, const int16x4_t input2, - const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, - const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, - int16x8_t *const output1, int16x8_t *const output2, - int16x8_t *const output3, int16x8_t *const output4, - int16x8_t *const output5, int16x8_t *const output6, - int16x8_t *const output7) { +static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + const int16x4_t *const input, + int16x8_t *const output) { int16x8_t in[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, - input7, &in[0], &in[1], &in[2], &in[3]); + transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5], + input[6], input[7], &in[0], &in[1], &in[2], &in[3]); // stage 1 step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); @@ -407,86 +404,64 @@ static INLINE void idct8x8_12_pass2_bd8( t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *output0 = vaddq_s16(step1[0], step2[7]); - *output1 = vaddq_s16(step1[1], step1[6]); - *output2 = vaddq_s16(step1[2], step1[5]); - *output3 = vaddq_s16(step1[3], step2[4]); - *output4 = vsubq_s16(step1[3], step2[4]); - *output5 = vsubq_s16(step1[2], step1[5]); - *output6 = vsubq_s16(step1[1], step1[6]); - *output7 = vsubq_s16(step1[0], step2[7]); + output[0] = vaddq_s16(step1[0], step2[7]); + output[1] = vaddq_s16(step1[1], step1[6]); + output[2] = vaddq_s16(step1[2], step1[5]); + output[3] = vaddq_s16(step1[3], step2[4]); + output[4] = vsubq_s16(step1[3], step2[4]); + output[5] = vsubq_s16(step1[2], step1[5]); + output[6] = vsubq_s16(step1[1], step1[6]); + output[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, - const int16x4_t cospis1, - int16x8_t *const io0, int16x8_t *const io1, - int16x8_t *const io2, int16x8_t *const io3, - int16x8_t *const io4, int16x8_t *const io5, - int16x8_t *const io6, - int16x8_t *const io7) { - int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int16x4_t step1l[4], step1h[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - - transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s16(*io1); - input_1h = vget_high_s16(*io1); - input_3l = vget_low_s16(*io3); - input_3h = vget_high_s16(*io3); - input_5l = vget_low_s16(*io5); - input_5h = vget_high_s16(*io5); - input_7l = 
vget_low_s16(*io7); - input_7h = vget_high_s16(*io7); - step1l[0] = vget_low_s16(*io0); - step1h[0] = vget_high_s16(*io0); - step1l[1] = vget_low_s16(*io2); - step1h[1] = vget_high_s16(*io2); - step1l[2] = vget_low_s16(*io4); - step1h[2] = vget_high_s16(*io4); - step1l[3] = vget_low_s16(*io6); - step1h[3] = vget_high_s16(*io6); + input1l = vget_low_s16(io[1]); + input1h = vget_high_s16(io[1]); + input3l = vget_low_s16(io[3]); + input3h = vget_high_s16(io[3]); + input5l = vget_low_s16(io[5]); + input5h = vget_high_s16(io[5]); + input7l = vget_low_s16(io[7]); + input7h = vget_high_s16(io[7]); + step1l[0] = vget_low_s16(io[0]); + step1h[0] = vget_high_s16(io[0]); + step1l[1] = vget_low_s16(io[2]); + step1h[1] = vget_high_s16(io[2]); + step1l[2] = vget_low_s16(io[4]); + step1h[2] = vget_high_s16(io[4]); + step1l[3] = vget_low_s16(io[6]); + step1h[3] = vget_high_s16(io[6]); - t32[0] = vmull_lane_s16(input_1l, cospis1, 3); - t32[1] = vmull_lane_s16(input_1h, cospis1, 3); - t32[2] = vmull_lane_s16(input_3l, cospis1, 2); - t32[3] = vmull_lane_s16(input_3h, cospis1, 2); - t32[4] = vmull_lane_s16(input_3l, cospis1, 1); - t32[5] = vmull_lane_s16(input_3h, cospis1, 1); - t32[6] = vmull_lane_s16(input_1l, cospis1, 0); - t32[7] = vmull_lane_s16(input_1h, cospis1, 0); - t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); - t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); - t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); - t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); - t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); - t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); - t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); - t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step1[4] = vcombine_s16(t16[0], t16[1]); - step1[5] = vcombine_s16(t16[2], t16[3]); - step1[6] = vcombine_s16(t16[4], t16[5]); - step1[7] = vcombine_s16(t16[6], t16[7]); + t32[0] = vmull_lane_s16(input1l, cospis1, 3); + t32[1] = vmull_lane_s16(input1h, cospis1, 3); + t32[2] = vmull_lane_s16(input3l, cospis1, 2); + t32[3] = vmull_lane_s16(input3h, cospis1, 2); + t32[4] = vmull_lane_s16(input3l, cospis1, 1); + t32[5] = vmull_lane_s16(input3h, cospis1, 1); + t32[6] = vmull_lane_s16(input1l, cospis1, 0); + t32[7] = vmull_lane_s16(input1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]); + dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]); // stage 2 t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); @@ -503,18 +478,8 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); t32[7] = vmlal_lane_s16(t32[7], 
step1h[3], cospis0, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step2[0] = vcombine_s16(t16[0], t16[1]); - step2[1] = vcombine_s16(t16[2], t16[3]); - step2[2] = vcombine_s16(t16[4], t16[5]); - step2[3] = vcombine_s16(t16[6], t16[7]); + dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]); + dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]); step2[4] = vaddq_s16(step1[4], step1[5]); step2[5] = vsubq_s16(step1[4], step1[5]); @@ -533,35 +498,25 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *io0 = vaddq_s16(step1[0], step2[7]); - *io1 = vaddq_s16(step1[1], step1[6]); - *io2 = vaddq_s16(step1[2], step1[5]); - *io3 = vaddq_s16(step1[3], step2[4]); - *io4 = vsubq_s16(step1[3], step2[4]); - *io5 = vsubq_s16(step1[2], step1[5]); - *io6 = vsubq_s16(step1[1], step1[6]); - *io7 = vsubq_s16(step1[0], step2[7]); + io[0] = vaddq_s16(step1[0], step2[7]); + io[1] = vaddq_s16(step1[1], step1[6]); + io[2] = vaddq_s16(step1[2], step1[5]); + io[3] = vaddq_s16(step1[3], step2[4]); + io[4] = vsubq_s16(step1[3], step2[4]); + io[5] = vsubq_s16(step1[2], step1[5]); + io[6] = vsubq_s16(step1[1], step1[6]); + io[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, - int16x8_t *const d0, - int16x8_t *const d1) { - int16x4_t t16[4]; - - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - *d0 = vcombine_s16(t16[0], t16[1]); - *d1 = vcombine_s16(t16[2], t16[3]); +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); } static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, @@ -584,7 +539,7 @@ static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, int32x4_t t32[4]; idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, @@ -596,7 +551,7 @@ static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); t32[2] = vnegq_s32(t32[2]); t32[3] = vnegq_s32(t32[3]); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_16_16_q(const int16x8_t s0, 
const int16x8_t s1, @@ -611,7 +566,7 @@ static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, @@ -627,7 +582,7 @@ static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, @@ -643,7 +598,7 @@ static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, @@ -659,7 +614,7 @@ static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, @@ -675,7 +630,7 @@ static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, @@ -691,7 +646,7 @@ static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, @@ -707,7 +662,7 @@ static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, @@ -786,129 +741,153 @@ static INLINE void idct16x16_store_pass1(const int16x8_t *const out, vst1q_s16(output, out[15]); } -static INLINE 
void idct16x16_add8x1(int16x8_t res, uint8_t **dest, - const int stride) { - uint8x8_t d = vld1_u8(*dest); - uint16x8_t q; - - res = vrshrq_n_s16(res, 6); - q = vaddw_u8(vreinterpretq_u16_s16(res), d); - d = vqmovun_s16(vreinterpretq_s16_u16(q)); +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); vst1_u8(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, - uint16_t **dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} - res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); - res = vminq_s16(res, max); - d = vqshluq_n_s16(res, 0); +static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 6); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1(const int16x8_t a, + const int16x8_t max, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s)); + const int16x8_t res1 = vminq_s16(res0, max); + const uint16x8_t d = vqshluq_n_s16(res1, 0); vst1q_u16(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest, - const int stride) { - uint16x8_t d = vld1q_u16(*dest); +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 
6); + out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, &dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} - res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6); - d = vmovl_u8(vqmovun_s16(res)); +static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6); + const uint16x8_t d = vmovl_u8(vqmovun_s16(res)); vst1q_u16(*dest, d); *dest += stride; } static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, - uint16_t *out, const int b_stride) { - highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride); -} - -static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, - uint16_t *dest, const int stride, - const int bd) { - // Add the result to dest - const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - int16x8_t o[16]; - 
o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), - vrshrn_n_s32(out[0].val[1], 6)); - o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), - vrshrn_n_s32(out[1].val[1], 6)); - o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), - vrshrn_n_s32(out[2].val[1], 6)); - o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), - vrshrn_n_s32(out[3].val[1], 6)); - o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), - vrshrn_n_s32(out[4].val[1], 6)); - o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), - vrshrn_n_s32(out[5].val[1], 6)); - o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), - vrshrn_n_s32(out[6].val[1], 6)); - o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), - vrshrn_n_s32(out[7].val[1], 6)); - o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), - vrshrn_n_s32(out[8].val[1], 6)); - o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), - vrshrn_n_s32(out[9].val[1], 6)); - o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), - vrshrn_n_s32(out[10].val[1], 6)); - o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), - vrshrn_n_s32(out[11].val[1], 6)); - o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), - vrshrn_n_s32(out[12].val[1], 6)); - o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), - vrshrn_n_s32(out[13].val[1], 6)); - o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), - vrshrn_n_s32(out[14].val[1], 6)); - o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), - vrshrn_n_s32(out[15].val[1], 6)); - highbd_idct16x16_add8x1(o[0], max, &dest, stride); - highbd_idct16x16_add8x1(o[1], max, &dest, stride); - highbd_idct16x16_add8x1(o[2], max, &dest, stride); - highbd_idct16x16_add8x1(o[3], max, &dest, stride); - highbd_idct16x16_add8x1(o[4], max, &dest, stride); - highbd_idct16x16_add8x1(o[5], max, &dest, stride); - highbd_idct16x16_add8x1(o[6], max, &dest, stride); - highbd_idct16x16_add8x1(o[7], max, &dest, stride); - highbd_idct16x16_add8x1(o[8], max, &dest, stride); - highbd_idct16x16_add8x1(o[9], max, &dest, stride); - highbd_idct16x16_add8x1(o[10], max, &dest, stride); - highbd_idct16x16_add8x1(o[11], max, &dest, stride); - highbd_idct16x16_add8x1(o[12], max, &dest, stride); - highbd_idct16x16_add8x1(o[13], max, &dest, stride); - highbd_idct16x16_add8x1(o[14], max, &dest, stride); - highbd_idct16x16_add8x1(o[15], max, &dest, stride); + uint16_t *out, const int stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, stride); + highbd_idct16x16_add8x1_bd8(a[1], &out, stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, stride); + highbd_idct16x16_add8x1_bd8(a[23], 
&out, stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, stride); } void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, @@ -937,4 +916,4 @@ void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, const int highbd_flag); -#endif // VPX_DSP_ARM_IDCT_NEON_H_ +#endif // VPX_VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c index fb1fa6b681..38e275834b 100644 --- a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -667,8 +667,6 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } -// ----------------------------------------------------------------------------- - #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm index a042d40acb..a81a9d1013 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm @@ -201,7 +201,7 @@ str lr, [sp, #16] ; thresh1 add sp, #4 pop {r0-r1, lr} - add r0, r1, lsl #3 ; s + 8 * pitch + add r0, r0, r1, lsl #3 ; s + 8 * pitch b vpx_lpf_vertical_8_neon ENDP ; |vpx_lpf_vertical_8_dual_neon| diff --git a/libs/libvpx/vpx_dsp/arm/mem_neon.h b/libs/libvpx/vpx_dsp/arm/mem_neon.h index 4efad5333e..943865b3c2 100644 --- a/libs/libvpx/vpx_dsp/arm/mem_neon.h +++ b/libs/libvpx/vpx_dsp/arm/mem_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_ARM_MEM_NEON_H_ -#define VPX_DSP_ARM_MEM_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_VPX_DSP_ARM_MEM_NEON_H_ #include <arm_neon.h> #include <assert.h> @@ -19,6 +19,21 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | + ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48)); +} + +static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) { + return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32)); +} + +static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1, + const int32_t c2, const int32_t c3) { + return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3)); +} + // Helper functions used to load tran_low_t into int16, narrowing if necessary.
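A usage sketch for the create_*_neon helpers added to mem_neon.h above (the constant values here are hypothetical): they build a small constant vector by bit-packing immediates through vcreate_*, which typically avoids a load from a constant table.

/* Packs {1, -2, 3, -4} into lanes 0..3 of an int16x4_t. */
const int16x4_t k = create_s16x4_neon(1, -2, 3, -4);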
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH @@ -86,9 +101,9 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1_lane_u32(&a, a_u32, 0); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); - a_u32 = vld1_lane_u32(&a, a_u32, 1); + a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } @@ -112,16 +127,16 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 0); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 1); + a_u32 = vsetq_lane_u32(a, a_u32, 1); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 2); + a_u32 = vsetq_lane_u32(a, a_u32, 2); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 3); + a_u32 = vsetq_lane_u32(a, a_u32, 3); return vreinterpretq_u8_u32(a_u32); } @@ -166,4 +181,4 @@ static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { buf += stride; vst1_lane_u32((uint32_t *)buf, a_u32, 1); } -#endif // VPX_DSP_ARM_MEM_NEON_H_ +#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/quantize_neon.c b/libs/libvpx/vpx_dsp/arm/quantize_neon.c index a0a1e6dd5a..adef5f6e15 100644 --- a/libs/libvpx/vpx_dsp/arm/quantize_neon.c +++ b/libs/libvpx/vpx_dsp/arm/quantize_neon.c @@ -15,17 +15,33 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff) { + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + +#if CONFIG_VP9_HIGHBITDEPTH + vst1q_s32(dqcoeff, dqcoeff_0); + vst1q_s32(dqcoeff + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -38,8 +54,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -65,17 +81,15 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff = vandq_s16(qcoeff, zbin_mask); // Set non-zero elements to -1 and use that to extract values for eob. 
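A scalar sketch of the eob bookkeeping described in the comment above, per 8-coefficient chunk:

/* mask = (qcoeff[i] != 0) ? 0xFFFF : 0;   -- vtstq_s16(qcoeff, -1)
 * cand = mask & (iscan[i] + 1);           -- +1: eob is a count, not an index
 * eob  = max(eob, cand);                  -- vmaxq_u16 across chunks
 * The final horizontal max over the 8 lanes (vmaxvq_u16 on AArch64, else the
 * vmax_u16/vpmax_u16 ladder) produces the end-of-block value. */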
- eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - qcoeff = vmulq_s16(qcoeff, dequant); - - store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } @@ -90,8 +104,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, do { // Add one because the eob is not its index. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -118,23 +132,24 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Set non-zero elements to -1 and use that to extract values for eob. eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - qcoeff = vmulq_s16(qcoeff, dequant); - - store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; n_coeffs -= 8; } while (n_coeffs > 0); } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else { const uint16x4_t eob_max_0 = vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); @@ -142,25 +157,50 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } +#endif // __aarch64__ } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); } +static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff) { + int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff, dqcoeff_0); + vst1q_s32(dqcoeff + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff, + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. 
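The division by 2 must truncate toward zero to match the C reference, whereas a bare arithmetic right shift rounds toward negative infinity; a scalar sketch of the sign-bit correction performed by extract_sign_bit() in calculate_dqcoeff_and_store_32x32() above:

#include <stdint.h>

/* C computes dqcoeff = (qcoeff * dequant) / 2, truncating toward zero.
 * E.g. -5 / 2 == -2 but -5 >> 1 == -3; adding the sign bit first fixes it:
 * (-5 + 1) >> 1 == -2. */
static int32_t div2_toward_zero(int32_t x) {
  x += (int32_t)((uint32_t)x >> 31); /* +1 only when x is negative */
  return x >> 1;
}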
-void vpx_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan_ptr; + (void)scan; (void)n_coeffs; // Because we will always calculate 32*32. (void)skip_block; assert(!skip_block); @@ -174,8 +214,8 @@ void vpx_quantize_b_32x32_neon( const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -188,8 +228,6 @@ void vpx_quantize_b_32x32_neon( // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - int16x8_t dqcoeff; - int32x4_t dqcoeff_0, dqcoeff_1; qcoeff = vaddq_s16(qcoeff, rounded); @@ -203,25 +241,15 @@ void vpx_quantize_b_32x32_neon( qcoeff = vandq_s16(qcoeff, zbin_mask); // Set non-zero elements to -1 and use that to extract values for eob. - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); - dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); - - // Add 1 if negative to round towards zero because the C uses division. - dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); - dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); - - dqcoeff = - vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); - - store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } @@ -234,8 +262,8 @@ void vpx_quantize_b_32x32_neon( for (i = 1; i < 32 * 32 / 8; ++i) { // Add one because the eob is not its index. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -248,8 +276,6 @@ void vpx_quantize_b_32x32_neon( // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - int16x8_t dqcoeff; - int32x4_t dqcoeff_0, dqcoeff_1; qcoeff = vaddq_s16(qcoeff, rounded); @@ -264,28 +290,22 @@ void vpx_quantize_b_32x32_neon( // Set non-zero elements to -1 and use that to extract values for eob. 
      eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));

       coeff_ptr += 8;
-      iscan_ptr += 8;
+      iscan += 8;

       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;

-      dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
-      dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
-      dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
-      dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
-      dqcoeff =
-          vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
-      store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+      calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
     }
   }

+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -293,4 +313,5 @@ void vpx_quantize_b_32x32_neon(
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }
diff --git a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c
index b04de3aff2..06443c6995 100644
--- a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -10,233 +10,371 @@
 #include <arm_neon.h>
+#include <string.h>

 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"

-void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
-                        uint32_t *res) {
-  int i;
-  const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride);
-  for (i = 0; i < 4; ++i) {
-    const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride);
-    uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
-    abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
-    res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);
-  }
+static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
+                                                 const void *const buf1) {
+  uint32_t a;
+  uint32x2_t aa = vdup_n_u32(0);
+  memcpy(&a, buf0, 4);
+  aa = vset_lane_u32(a, aa, 0);
+  memcpy(&a, buf1, 4);
+  aa = vset_lane_u32(a, aa, 1);
+  return vreinterpret_u8_u32(aa);
 }

-void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
-                        uint32_t *res) {
+static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
+                            const uint8_t *const ref_array[4],
+                            const int ref_stride, const int height,
+                            uint32_t *const res) {
   int i;
-  const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride);
-  const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride);
-  for (i = 0; i < 4; ++i) {
-    const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride);
-    const uint8x16_t ref_1 =
-        load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride);
-    uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0));
-    abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0));
-    abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1));
-    abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1));
-    res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);
-  }
-}
+  uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+  uint16x4_t a[2];
+  uint32x4_t r;

-static INLINE void sad8x_4d(const uint8_t *a, int a_stride,
-                            const uint8_t *const b[4], int b_stride,
-                            uint32_t *result, const int height) {
-  int i, j;
-  uint16x8_t sum[4] = { vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + assert(!((intptr_t)src_ptr % sizeof(uint32_t))); + assert(!(src_stride % sizeof(uint32_t))); for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - a += a_stride; + const uint8x8_t s = vreinterpret_u8_u32( + vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); + const uint8x8_t ref01 = load_unaligned_2_buffers( + ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); + const uint8x8_t ref23 = load_unaligned_2_buffers( + ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); + abs[0] = vabal_u8(abs[0], s, ref01); + abs[1] = vabal_u8(abs[1], s, ref23); + } + + a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); + a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); + r = vpaddlq_u16(vcombine_u16(a[0], a[1])); + vst1q_u32(res, r); +} + +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res); +} + +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res); +} + +//////////////////////////////////////////////////////////////////////////////// + +// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) +static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); + vst1q_u32(res, r); +} + +// Can handle 1024 pixels' sad sum (such as 32x32) +static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); + const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); + const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) +static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); + const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); + const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); + const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); + const uint32x2_t c0 = vpadd_u32(b0, b1); + const uint32x2_t c1 = vpadd_u32(b2, b3); + 
vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 4096 pixels' sad sum (such as 64x64) +static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + vst1q_u32(res, vcombine_u32(d0, d1)); +} + +static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { + int i, j; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + for (i = 0; i < height; ++i) { + const uint8x8_t s = vld1_u8(src_ptr); + src_ptr += src_stride; for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], a_u8, b_u8); + const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); + ref_loop[j] += ref_stride; + sum[j] = vabal_u8(sum[j], s, b_u8); } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } -void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 4); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4); } -void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 8); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); } -void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 16); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); } -static INLINE void sad16x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, + uint16x8_t *const sum) { + const uint8x16_t r = vld1q_u8(ref_ptr); + *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r)); + *sum = vabal_u8(*sum, 
vget_high_u8(src_ptr), vget_high_u8(r)); +} + +static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - a += a_stride; + const uint8x16_t s = vld1q_u8(src_ptr); + src_ptr += src_stride; for (j = 0; j < 4; ++j) { - const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + sad16_neon(ref_loop[j], s, &sum[j]); + ref_loop[j] += ref_stride; } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } -void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 8); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); } -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 16); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); } -void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 32); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); } -static INLINE void sad32x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { - int i, j; +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + const int height, uint16x8_t *const sum) { + int i; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + + sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } +} + +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + uint16x8_t sum[4]; + 
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); + sad_512_pel_final_neon(sum, res); +} + +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + uint16x8_t sum[4]; + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); + sad_1024_pel_final_neon(sum, res); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + uint16x8_t sum[4]; + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); + sad_2048_pel_final_neon(sum, res); +} + +//////////////////////////////////////////////////////////////////////////////// + +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + int i; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - a += a_stride; - for (j = 0; j < 4; ++j) { - const uint8x16_t b_0 = vld1q_u8(b_loop[j]); - const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); - } + for (i = 0; i < 32; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_2048_pel_final_neon(sum, res); } -void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 16); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 32); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src, src_stride, 
ref, ref_stride, res, 64); -} - -static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, - const uint8x16_t b_0, const uint8x16_t b_1, - uint16x8_t *sum) { - *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); - *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); - *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); - *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); -} - -static INLINE void sad64x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { int i; - uint16x8_t sum_0 = vdupq_n_u16(0); - uint16x8_t sum_1 = vdupq_n_u16(0); - uint16x8_t sum_2 = vdupq_n_u16(0); - uint16x8_t sum_3 = vdupq_n_u16(0); - uint16x8_t sum_4 = vdupq_n_u16(0); - uint16x8_t sum_5 = vdupq_n_u16(0); - uint16x8_t sum_6 = vdupq_n_u16(0); - uint16x8_t sum_7 = vdupq_n_u16(0); - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - a += a_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); - sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), - &sum_1); - b_loop[0] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); - sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), - &sum_3); - b_loop[1] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); - sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), - &sum_5); - b_loop[2] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); - sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), - &sum_7); - b_loop[3] += b_stride; + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } - result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); - result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); - result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); - result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); -} - -void 
vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  sad64x_4d(src, src_stride, ref, ref_stride, res, 32);
-}
-
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  sad64x_4d(src, src_stride, ref, ref_stride, res, 64);
+  sad_4096_pel_final_neon(sum, res);
 }
diff --git a/libs/libvpx/vpx_dsp/arm/sad_neon.c b/libs/libvpx/vpx_dsp/arm/sad_neon.c
index 9518a166bb..c4a49e366d 100644
--- a/libs/libvpx/vpx_dsp/arm/sad_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/sad_neon.c
@@ -11,6 +11,7 @@
 #include <arm_neon.h>

 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/mem_neon.h"

@@ -73,128 +74,132 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
 }

-static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, const int height) {
+static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
   for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(a);
-    const uint8x8_t b_u8 = vld1_u8(b);
-    a += a_stride;
-    b += b_stride;
+    const uint8x8_t a_u8 = vld1_u8(src_ptr);
+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, a_u8, b_u8);
   }
   return abs;
 }

-static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
-                                   const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   const uint8_t *second_pred,
+                                   const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
   for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(a);
-    const uint8x8_t b_u8 = vld1_u8(b);
-    const uint8x8_t c_u8 = vld1_u8(c);
+    const uint8x8_t a_u8 = vld1_u8(src_ptr);
+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+    const uint8x8_t c_u8 = vld1_u8(second_pred);
     const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
-    a += a_stride;
-    b += b_stride;
-    c += 8;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
     abs = vabal_u8(abs, a_u8, avg);
   }
   return abs;
 }

-#define sad8xN(n)                                                        \
-  uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride,       \
-                               const uint8_t *ref, int ref_stride) {     \
-    const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);               \
-  }                                                                      \
-                                                                         \
-  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride,   \
-                                   const uint8_t *ref, int ref_stride,   \
-                                   const uint8_t *second_pred) {         \
-    const uint16x8_t abs =                                               \
-        sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n);     \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);               \
+#define sad8xN(n)                                                             \
+  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,        \
+                               const uint8_t *ref_ptr, int ref_stride) {      \
+    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,    \
+                                   const uint8_t *ref_ptr, int ref_stride,    \
+                                   const uint8_t *second_pred) {              \
+    const uint16x8_t abs =                                                    \
+        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);  \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }

sad8xN(4);
sad8xN(8);
sad8xN(16); -static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - const uint8x16_t b_u8 = vld1q_u8(b); - a += a_stride; - b += b_stride; + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); } return abs; } -static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - const uint8x16_t b_u8 = vld1q_u8(b); - const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - a += a_stride; - b += b_stride; - c += 16; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); } return abs; } -#define sad16xN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad16xN(8); sad16xN(16); sad16xN(32); -static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(a); - const uint8x16_t a_hi = vld1q_u8(a + 16); - const uint8x16_t b_lo = vld1q_u8(b); - const uint8x16_t b_hi = vld1q_u8(b + 16); - a += a_stride; - b += b_stride; + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = 
vld1q_u8(ref_ptr + 16); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); @@ -203,24 +208,25 @@ static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, return abs; } -static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(a); - const uint8x16_t a_hi = vld1q_u8(a + 16); - const uint8x16_t b_lo = vld1q_u8(b); - const uint8x16_t b_hi = vld1q_u8(b + 16); - const uint8x16_t c_lo = vld1q_u8(c); - const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const uint8x16_t c_hi = vld1q_u8(second_pred + 16); const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - a += a_stride; - b += b_stride; - c += 32; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); @@ -229,43 +235,44 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, return abs; } -#define sad32xN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad32xN(16); sad32xN(32); sad32xN(64); -static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs_0 = vdupq_n_u16(0); uint16x8_t abs_1 = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - const uint8x16_t b_0 = 
vld1q_u8(b); - const uint8x16_t b_1 = vld1q_u8(b + 16); - const uint8x16_t b_2 = vld1q_u8(b + 32); - const uint8x16_t b_3 = vld1q_u8(b + 48); - a += a_stride; - b += b_stride; + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + src_ptr += src_stride; + ref_ptr += ref_stride; abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); @@ -282,33 +289,34 @@ static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, } } -static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs_0 = vdupq_n_u16(0); uint16x8_t abs_1 = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - const uint8x16_t b_0 = vld1q_u8(b); - const uint8x16_t b_1 = vld1q_u8(b + 16); - const uint8x16_t b_2 = vld1q_u8(b + 32); - const uint8x16_t b_3 = vld1q_u8(b + 48); - const uint8x16_t c_0 = vld1q_u8(c); - const uint8x16_t c_1 = vld1q_u8(c + 16); - const uint8x16_t c_2 = vld1q_u8(c + 32); - const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - a += a_stride; - b += b_stride; - c += 64; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); @@ -325,19 +333,20 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, } } -#define sad64xN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 
0); \ +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t abs = \ + sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ } sad64xN(32); diff --git a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c index 4f58a7832a..37bfd1cd1f 100644 --- a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -97,30 +97,30 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_varianceNxM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ +#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else { \ + var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[y_offset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ } sub_pixel_varianceNxM(4, 4); @@ -139,34 +139,34 @@ sub_pixel_varianceNxM(64, 64); // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_avg_varianceNxM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \
-    uint8_t temp1[n * m];                                              \
-                                                                       \
-    if (n == 4) {                                                      \
-      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),        \
-                                bilinear_filters[xoffset]);            \
-      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,                 \
-                                bilinear_filters[yoffset]);            \
-    } else if (n == 8) {                                               \
-      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),        \
-                                bilinear_filters[xoffset]);            \
-      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,                 \
-                                bilinear_filters[yoffset]);            \
-    } else {                                                           \
-      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n,    \
-                                 bilinear_filters[xoffset]);           \
-      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,             \
-                                 bilinear_filters[yoffset]);           \
-    }                                                                  \
-                                                                       \
-    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);             \
-                                                                       \
-    return vpx_variance##n##x##m(temp0, n, b, b_stride, sse);          \
+#define sub_pixel_avg_varianceNxM(n, m)                                  \
+  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(                   \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,             \
+      const uint8_t *second_pred) {                                      \
+    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                           \
+    uint8_t temp1[n * m];                                                \
+                                                                         \
+    if (n == 4) {                                                        \
+      var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2),  \
+                                bilinear_filters[x_offset]);             \
+      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,                   \
+                                bilinear_filters[y_offset]);             \
+    } else if (n == 8) {                                                 \
+      var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1),  \
+                                bilinear_filters[x_offset]);             \
+      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,                   \
+                                bilinear_filters[y_offset]);             \
+    } else {                                                             \
+      var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
+                                 bilinear_filters[x_offset]);            \
+      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,               \
+                                 bilinear_filters[y_offset]);            \
+    }                                                                    \
+                                                                         \
+    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);               \
+                                                                         \
+    return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse);    \
  }

sub_pixel_avg_varianceNxM(4, 4);
diff --git a/libs/libvpx/vpx_dsp/arm/subtract_neon.c b/libs/libvpx/vpx_dsp/arm/subtract_neon.c
index ce81fb630f..612897e247 100644
--- a/libs/libvpx/vpx_dsp/arm/subtract_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -9,71 +9,73 @@
  */

 #include <arm_neon.h>
+#include <assert.h>

 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"

 void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
                              ptrdiff_t diff_stride, const uint8_t *src,
                              ptrdiff_t src_stride, const uint8_t *pred,
                              ptrdiff_t pred_stride) {
-  int r, c;
+  int r = rows, c;

   if (cols > 16) {
-    for (r = 0; r < rows; ++r) {
+    do {
       for (c = 0; c < cols; c += 32) {
-        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
-        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
-        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
-        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
-        const uint16x8_t v_diff_lo_00 =
-            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
-        const uint16x8_t v_diff_hi_00 =
-            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
-        const uint16x8_t v_diff_lo_16 =
-            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
-        const uint16x8_t v_diff_hi_16 =
-            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
-        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
-        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
-        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
-        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+        const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
+        const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
+
const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, (int)src_stride); + const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } diff --git a/libs/libvpx/vpx_dsp/arm/sum_neon.h b/libs/libvpx/vpx_dsp/arm/sum_neon.h index d74fe0cde4..9e6833aad3 100644 --- a/libs/libvpx/vpx_dsp/arm/sum_neon.h +++ b/libs/libvpx/vpx_dsp/arm/sum_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
 */

-#ifndef VPX_DSP_ARM_SUM_NEON_H_
-#define VPX_DSP_ARM_SUM_NEON_H_
+#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_VPX_DSP_ARM_SUM_NEON_H_

 #include <arm_neon.h>

@@ -30,18 +30,9 @@ static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
                   vreinterpret_u32_u64(vget_high_u64(c)));
 }

-static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a,
-                                                      const uint16x8_t b) {
-  const uint32x4_t c = vpaddlq_u16(a);
-  const uint32x4_t d = vpadalq_u16(c, b);
-  const uint64x2_t e = vpaddlq_u32(d);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)),
-                  vreinterpret_u32_u64(vget_high_u64(e)));
-}
-
 static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
   const uint64x2_t b = vpaddlq_u32(a);
   return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                   vreinterpret_u32_u64(vget_high_u64(b)));
 }
-#endif  // VPX_DSP_ARM_SUM_NEON_H_
+#endif  // VPX_VPX_DSP_ARM_SUM_NEON_H_
diff --git a/libs/libvpx/vpx_dsp/arm/sum_squares_neon.c b/libs/libvpx/vpx_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..cfefad9938
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include
+#include "./vpx_dsp_rtcd.h"
+
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) {
+  uint64x1_t s2;
+
+  if (size == 4) {
+    int16x4_t s[4];
+    int32x4_t s0;
+    uint32x2_t s1;
+
+    s[0] = vld1_s16(src + 0 * stride);
+    s[1] = vld1_s16(src + 1 * stride);
+    s[2] = vld1_s16(src + 2 * stride);
+    s[3] = vld1_s16(src + 3 * stride);
+    s0 = vmull_s16(s[0], s[0]);
+    s0 = vmlal_s16(s0, s[1], s[1]);
+    s0 = vmlal_s16(s0, s[2], s[2]);
+    s0 = vmlal_s16(s0, s[3], s[3]);
+    s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)),
+                   vget_high_u32(vreinterpretq_u32_s32(s0)));
+    s2 = vpaddl_u32(s1);
+  } else {
+    int r = size;
+    uint64x2_t s1 = vdupq_n_u64(0);
+
+    do {
+      int c = size;
+      int32x4_t s0 = vdupq_n_s32(0);
+      const int16_t *src_t = src;
+
+      do {
+        int16x8_t s[8];
+
+        s[0] = vld1q_s16(src_t + 0 * stride);
+        s[1] = vld1q_s16(src_t + 1 * stride);
+        s[2] = vld1q_s16(src_t + 2 * stride);
+        s[3] = vld1q_s16(src_t + 3 * stride);
+        s[4] = vld1q_s16(src_t + 4 * stride);
+        s[5] = vld1q_s16(src_t + 5 * stride);
+        s[6] = vld1q_s16(src_t + 6 * stride);
+        s[7] = vld1q_s16(src_t + 7 * stride);
+        s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6]));
+        s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6]));
+        s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7]));
+        src_t += 8;
+        c -= 8;
+      } while (c);
+
+      s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0)));
+      s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0)));
+      src += 8 * stride;
+      r -= 8;
+    } while (r);
+
+    s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1));
+  }
+
+  return vget_lane_u64(s2, 0);
+}
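Note on the new file above: the inner loop keeps squared terms in the 32-bit
lane accumulator s0 and drains it into the 64-bit accumulator s1 after every
eight-row strip, which limits how much the 32-bit lanes ever have to hold.
Functionally the kernel matches this scalar model; sum_squares_2d_i16_model is
a hypothetical name for illustration, not the upstream C reference:

uint64_t sum_squares_2d_i16_model(const int16_t *src, int stride, int size) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int32_t v = src[c]; /* |v| <= 32767, so v * v fits in 32 bits */
      ss += (uint64_t)(v * v);
    }
    src += stride;
  }
  return ss;
}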
diff --git a/libs/libvpx/vpx_dsp/arm/transpose_neon.h b/libs/libvpx/vpx_dsp/arm/transpose_neon.h
index d85cbcee46..43340e48d9 100644
--- a/libs/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libs/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_
-#define VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_

 #include <arm_neon.h>

@@ -1313,4 +1313,4 @@ static INLINE void load_and_transpose_s32_8x8(
   transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
 }

-#endif  // VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#endif  // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/libs/libvpx/vpx_dsp/arm/variance_neon.c b/libs/libvpx/vpx_dsp/arm/variance_neon.c
index 61c2c16a72..77b1015b74 100644
--- a/libs/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/variance_neon.c
@@ -27,8 +27,9 @@
 // this limit.

 // Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int h, uint32_t *sse, int *sum) {
+static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride, int h,
+                               uint32_t *sse, int *sum) {
   int i;
   int16x8_t sum_s16 = vdupq_n_s16(0);
   int32x4_t sse_lo_s32 = vdupq_n_s32(0);
@@ -38,8 +39,8 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
   assert(h <= 256);

   for (i = 0; i < h; i += 4) {
-    const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
-    const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
+    const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride);
+    const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
     const uint16x8_t diff_lo_u16 =
         vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
     const uint16x8_t diff_hi_u16 =
@@ -61,8 +62,8 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
     sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
                            vget_high_s16(diff_hi_s16));

-    a += 4 * a_stride;
-    b += 4 * b_stride;
+    src_ptr += 4 * src_stride;
+    ref_ptr += 4 * ref_stride;
   }

   *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
@@ -72,9 +73,9 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
 }

 // Process a block of any size where the width is divisible by 16.
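Note: the width-specialized helpers in this file all accumulate the same two
quantities, the running sum of differences and the running sum of squared
differences; the variance wrappers then form
variance = sse - sum * sum / (w * h), with the division done as a shift
because w * h is a power of two. A scalar model of one helper for reference;
variance_model is a hypothetical name, not part of the patch:

static void variance_model(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int w, int h,
                           uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = src[j] - ref[j]; /* in [-255, 255] */
      *sum += d;
      *sse += (uint32_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
}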
-static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { int i, j; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -86,8 +87,8 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(a + j); - const uint8x16_t b_u8 = vld1q_u8(b + j); + const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); const uint16x8_t diff_lo_u16 = vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); @@ -110,8 +111,8 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), vget_high_s16(diff_hi_s16)); } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); @@ -121,8 +122,9 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, } // Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int h, uint32_t *sse, int *sum) { +static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { int i = 0; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -132,10 +134,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(a); - const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); - const uint8x8_t b_0_u8 = vld1_u8(b); - const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint8x8_t a_0_u8 = vld1_u8(src_ptr); + const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); + const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); + const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); @@ -150,8 +152,8 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, vget_high_s16(diff_0_s16)); sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), vget_high_s16(diff_1_s16)); - a += a_stride + a_stride; - b += b_stride + b_stride; + src_ptr += src_stride + src_stride; + ref_ptr += ref_stride + ref_stride; i += 2; } while (i < h); @@ -161,31 +163,36 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, 0); } -void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } -void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int 
*sum) { + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -#define varianceNxM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \ - else if (n == 8) \ - variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ - else \ - variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, unsigned int *sse) { \ + int sum; \ + if (n == 4) \ + variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ + &sum); \ + else if (n == 8) \ + variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ + &sum); \ + else \ + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ + &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } varianceNxM(4, 4, 4); @@ -199,58 +206,66 @@ varianceNxM(16, 32, 9); varianceNxM(32, 16, 9); varianceNxM(32, 32, 10); -unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), - b_stride, 32, 32, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, + &sum1); + variance_neon_w16(src_ptr + (32 * src_stride), src_stride, + ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } -unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, + &sum1); + variance_neon_w16(src_ptr + (16 * src_stride), src_stride, + ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } -unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, + &sum1); + variance_neon_w16(src_ptr + (16 * src_stride), 
src_stride, + ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, + ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, + ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; @@ -267,13 +282,13 @@ unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, for (i = 0; i < 8; i++) { // mse16x16_neon_loop q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); @@ -312,10 +327,9 @@ unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); } -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, - int source_stride, +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int recon_stride) { + int ref_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; @@ -324,21 +338,21 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int64x2_t q1s64; d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q11u16 = vsubl_u8(d0u8, d4u8); q12u16 = vsubl_u8(d1u8, d5u8); diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm new file mode 100644 index 0000000000..d8e4bcc3a7 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm @@ -0,0 +1,438 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers***************************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + 
vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + 
vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! 
;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm new file mode 100644 index 0000000000..7a77747fec --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm @@ -0,0 +1,439 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
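The two filter_type variants implement the same operation: an 8-tap
horizontal convolution whose result is averaged into the destination.
They differ only in which taps are subtracted; the coefficients are
loaded as absolute values (vabs.s8) and the sign pattern is baked into
the vmlal.u8/vmlsl.u8 schedule (type1 negates taps 0, 1, 6 and 7;
type2 negates taps 0, 2, 5 and 7). A minimal C sketch of the
arithmetic both variants compute, assuming the conventions spelled out
in the file this patch deletes further down (8 taps,
VP9_FILTER_WEIGHT == 128, VP9_FILTER_SHIFT == 7); the function and
helper names here are illustrative, not from libvpx:

    #include <stdint.h>

    #define FILTER_BITS 7 /* VP9_FILTER_SHIFT */

    static uint8_t clip_u8(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* One row of convolve8_avg_horiz-style filtering, scalar form. */
    static void avg_horiz_row_c(const uint8_t *src, uint8_t *dst,
                                const int16_t *filter /* 8 taps */, int w) {
      src -= 3; /* taps reach 3 pixels left of the output position */
      for (int x = 0; x < w; ++x) {
        int sum = 0;
        for (int k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
        /* rounding shift: the "+= 64 >> 7" of the deleted kernels */
        const uint8_t res = clip_u8((sum + 64) >> FILTER_BITS);
        /* averaging store: vrhadd.u8 of the result and existing dst */
        dst[x] = (uint8_t)((dst[x] + res + 1) >> 1);
      }
    }

The NEON code reaches the same value through a signed halving add
against 0x4000 followed by vqrshrun.s16 #6, which keeps the 16-bit
accumulator in range; see the note at the top of the vertical type1
file below.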
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = 
vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, 
d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! 
;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm deleted file mode 100644 index 1c2ee50630..0000000000 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
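For contrast with the filter_type kernels added above: the pair being
deleted here keeps the filter taps signed and widens to 32 bits per
lane (its MULTIPLY_BY_Q0 macro is a chain of vmull.s16/vmlal.s16),
then narrows with a rounding shift of 7 and saturates. Roughly, per
output value (helper names are mine):

    #include <stdint.h>

    /* MULTIPLY_BY_Q0: 8-tap dot product kept in 32-bit precision. */
    static int32_t multiply_by_q0(const int16_t s[8], const int16_t f[8]) {
      int32_t acc = 0;
      for (int k = 0; k < 8; ++k) acc += (int32_t)s[k] * f[k];
      return acc;
    }

    /* vqrshrun.s32 #7 then vqmovn.u16: the "+= 64 >> 7" comments. */
    static uint8_t narrow_round7(int32_t acc) {
      const int32_t v = (acc + 64) >> 7;
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

The replacement kernels avoid the 32-bit stage entirely by multiplying
unsigned 8-bit absolute taps, at the cost of one specialized file per
tap-sign pattern.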
-; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; VP9_FILTER_WEIGHT == 128 - ; VP9_FILTER_SHIFT == 7 - - EXPORT |vpx_convolve8_avg_horiz_neon| - EXPORT |vpx_convolve8_avg_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter -; sp[]int x0_q4 -; sp[]int x_step_q4 ; unused -; sp[]int y0_q4 -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|vpx_convolve8_avg_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldrd r4, r5, [sp, #32] ; filter, x0_q4 - add r4, r5, lsl #4 - ldrd r6, r7, [sp, #52] ; w, h - - vld1.s16 {q0}, [r4] ; filter - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -vpx_convolve8_avg_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -vpx_convolve8_avg_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; slightly out of order load to match the existing data - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 - - sub r2, r2, r3, lsl #2 ; reset for store - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_avg_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt vpx_convolve8_avg_loop_horiz_v - - pop {r4-r10, pc} - - 
ENDP - -|vpx_convolve8_avg_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #24] ; filter - ldr r5, [sp, #36] ; y0_q4 - add r4, r5, lsl #4 - ldr r6, [sp, #44] ; w - ldr lr, [sp, #48] ; h - - vld1.s16 {q0}, [r4] ; filter - - lsl r1, r1, #1 - lsl r3, r3, #1 - -vpx_convolve8_avg_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -vpx_convolve8_avg_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - vld1.u32 {d6[0]}, [r5@32], r3 - vld1.u32 {d6[1]}, [r8@32], r3 - vld1.u32 {d7[0]}, [r5@32], r3 - vld1.u32 {d7[1]}, [r8@32], r3 - - pld [r7] - pld [r4] - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r5] - pld [r8] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - sub r5, r5, r3, lsl #1 ; reset for store - sub r8, r8, r3, lsl #1 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt vpx_convolve8_avg_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_avg_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm new file mode 100644 index 0000000000..d310a83dad --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm @@ -0,0 +1,486 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
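The q15 = 0x4000 and r11 = 0xc000 constants set up just below carry
the rounding for these kernels: the accumulator is seeded with 0xc000,
the unsigned 8-bit multiply-accumulates wrap modulo 2^16, a signed
halving add against 0x4000 recovers sum >> 1 without overflowing
16 bits, and vqrshrun.s16 #6 applies the final rounding and
saturation. As far as the arithmetic goes this equals the plain
(sum + 64) >> 7 rounding; a self-checking C sketch of that equivalence
(my reading of the code, names mine):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t sat_u8(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static uint8_t round7_direct(int sum) { return sat_u8((sum + 64) >> 7); }

    static uint8_t round7_biased(int sum) {
      int acc = (uint16_t)(sum + 0xc000); /* seed + wrapping u8 MACs */
      if (acc > 0x7fff) acc -= 0x10000;   /* reinterpret as s16      */
      const int half = (acc + 0x4000) >> 1; /* vhadd.s16 with q15    */
      return sat_u8((half + 32) >> 6);    /* vqrshrun.s16 #6         */
    }

    int main(void) {
      /* the range a biased 16-bit accumulator can represent */
      for (int sum = -16384; sum <= 49151; ++sum)
        assert(round7_direct(sum) == round7_biased(sum));
      return 0;
    }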
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = 
vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + add r14, r1, r6 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, 
d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm new file mode 100644 index 0000000000..c5695fbda8 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm @@ -0,0 +1,487 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
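Like the horizontal pair, the vertical type1/type2 files differ only
in their vmlal/vmlsl sign schedule. Their core_loop_wd_4 paths pack
two 4-pixel output rows into the two 32-bit lanes of each result
register (the "i iteration"/"ii iteration" noted in the comments). A
hypothetical scalar model of one result register's worth, assuming src
has already been backed up by 3 * src_stride as in the pointer setup
at the top of the file (names mine):

    #include <stdint.h>

    /* Two rows of 4-wide vertical filter-and-average. */
    static void avg_vert_w4_two_rows(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride,
                                     const int16_t *filter /* 8 taps */) {
      for (int row = 0; row < 2; ++row) {   /* the "i" and "ii" halves */
        for (int x = 0; x < 4; ++x) {
          int sum = 0;
          for (int k = 0; k < 8; ++k)
            sum += src[(row + k) * src_stride + x] * filter[k];
          int res = (sum + 64) >> 7;        /* FILTER_BITS == 7 */
          if (res < 0) res = 0;
          if (res > 255) res = 255;
          uint8_t *d = dst + row * dst_stride;
          d[x] = (uint8_t)((d[x] + res + 1) >> 1); /* vrhadd.u8 */
        }
      }
    }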
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlal.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = 
vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + add r14, r1, r6 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, 
d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm new file mode 100644 index 0000000000..fa1b732466 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
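All four convolve kernels added below, like the avg variants above, share one fixed-point rounding idiom: each 16-bit accumulator lane is seeded with 0xc000 (-16384), a vhadd.s16 against lanes of 0x4000 cancels that bias while halving, and vqrshrun.s16 #6 rounds, narrows, and saturates to u8. A scalar model of that sequence, written only for exposition (the helper name and the precomputed sum argument are illustrative, not part of the patch), collapses to clip_u8((sum + 64) >> 7), i.e. the usual VP9_FILTER_WEIGHT == 128 / VP9_FILTER_SHIFT == 7 rounding:

#include <stdint.h>

/* Scalar sketch of the NEON rounding sequence used in these kernels:
 *   vdup.16  qN, #0xc000            ; accumulator lane starts at -16384
 *   vmlal/vmlsl.u8 ...              ; accumulator += dot(src, filter)
 *   vhadd.s16 qN, qN, q(0x4000)     ; (acc + 16384) >> 1 == sum >> 1
 *   vqrshrun.s16 dM, qN, #6         ; saturating, rounding (x + 32) >> 6
 */
static uint8_t convolve_round(int32_t sum) { /* sum = dot(src, filter) */
  const int16_t acc = (int16_t)(sum - 16384);          /* biased MAC result */
  const int16_t half = (int16_t)((acc + 16384) >> 1);  /* vhadd.s16 step */
  const int32_t out = (half + 32) >> 6;                /* vqrshrun.s16 #6 */
  return (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out)); /* saturate */
}

The bias keeps the intermediate dot product inside the signed 16-bit range; the avg kernels above additionally vrhadd the narrowed result against the bytes already present in dst before storing.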
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = 
vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! 
;store the result pu1_dst + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! 
;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm new file mode 100644 index 0000000000..90b2c8fef7 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 
q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! 
+ add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! 
;store the result pu1_dst + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! 
;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h index c1634ed55f..4f27da9d2f 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ + + #include <arm_neon.h> #include "./vpx_config.h" @@ -131,3 +134,5 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], filters, filter3, filter4); } + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm deleted file mode 100644 index 5eee15664d..0000000000 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree.
-; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; VP9_FILTER_WEIGHT == 128 - ; VP9_FILTER_SHIFT == 7 - - EXPORT |vpx_convolve8_horiz_neon| - EXPORT |vpx_convolve8_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter -; sp[]int x0_q4 -; sp[]int x_step_q4 ; unused -; sp[]int y0_q4 -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|vpx_convolve8_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldrd r4, r5, [sp, #32] ; filter, x0_q4 - add r4, r5, lsl #4 - ldrd r6, r7, [sp, #52] ; w, h - - vld1.s16 {q0}, [r4] ; filter - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -vpx_convolve8_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -vpx_convolve8_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt vpx_convolve8_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|vpx_convolve8_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #24] ; filter - ldr r5, [sp, #36] ; y0_q4 - add r4, r5, lsl #4 - ldr r6, [sp, #44] ; w - ldr lr, [sp, #48] ; h - - vld1.s16 {q0}, [r4] ; filter - - lsl r1, r1, #1 - lsl r3, r3, #1 
- -vpx_convolve8_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -vpx_convolve8_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - pld [r5] - pld [r8] - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r7] - pld [r4] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt vpx_convolve8_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c new file mode 100644 index 0000000000..4470b28b88 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h" + +/* Type1 and Type2 functions are called depending on the position of the + * negative and positive coefficients in the filter. In type1, the filter kernel + * used is sub_pel_filters_8lp, in which only the first two and the last two + * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 & + * 7. 
+ */ + +#define DEFINE_FILTER(dir) \ + void vpx_convolve8_##dir##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + if (filter == vp9_filter_kernels[1]) { \ + vpx_convolve8_##dir##_filter_type1_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + vpx_convolve8_##dir##_filter_type2_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } \ + } + +DEFINE_FILTER(horiz); +DEFINE_FILTER(avg_horiz); +DEFINE_FILTER(vert); +DEFINE_FILTER(avg_vert); diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h new file mode 100644 index 0000000000..b123d1cb08 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ + +#define DECLARE_FILTER(dir, type) \ + void vpx_convolve8_##dir##_filter_##type##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +DECLARE_FILTER(horiz, type1); +DECLARE_FILTER(avg_horiz, type1); +DECLARE_FILTER(horiz, type2); +DECLARE_FILTER(avg_horiz, type2); +DECLARE_FILTER(vert, type1); +DECLARE_FILTER(avg_vert, type1); +DECLARE_FILTER(vert, type2); +DECLARE_FILTER(avg_vert, type2); + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm new file mode 100644 index 0000000000..2666d4253e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm @@ -0,0 +1,457 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
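The dispatch wrapper in vpx_convolve8_neon_asm.c above sends the smooth kernel (vp9_filter_kernels[1], i.e. sub_pel_filters_8lp) to the *_filter_type1_neon entry points and every other kernel to *_filter_type2_neon. A minimal caller sketch, for illustration only: the block size, strides, and x0_q4 phase below are made-up values, and x_step_q4/y_step_q4 are fixed at 16 (no scaling), matching the constraint stated for the removed vpx_convolve8_neon_asm.asm and preserved by these kernels:

#include <stddef.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_filter.h"

/* Hypothetical caller; because filter == vp9_filter_kernels[1], the
 * wrapper takes the vpx_convolve8_horiz_filter_type1_neon path. src is
 * assumed to already include the 3-pixel left apron the 8-tap filter
 * reads. */
static void demo_smooth_horiz(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride) {
  vpx_convolve8_horiz_neon(src, src_stride, dst, dst_stride,
                           vp9_filter_kernels[1] /* EIGHTTAP_SMOOTH */,
                           8 /* x0_q4 */, 16 /* x_step_q4 */,
                           0 /* y0_q4 */, 16 /* y_step_q4 */,
                           64 /* w */, 64 /* h */);
}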
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;r12 = -3 * src_strd + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;pu1_src -= 3 * src_strd + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]!
;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlsl.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, 
d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from + ; sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = 
vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm new file mode 100644 index 0000000000..cb5d6d3fe5 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm @@ -0,0 +1,455 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! 
+ bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlal.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = 
vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); 
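;
; [Editor's annotation -- not part of the upstream patch] In this
; width-4 path each 64-bit d-register packs two vertically adjacent
; 4-pixel groups: lane 1 is loaded from the next source row and lane 0
; is duplicated from the previous register's lane 1. Every multiply
; therefore feeds two output rows at once, and one pass of the inner
; loop emits a 4x4 block via the paired vst1.32 lane stores.
;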
+ vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 2bf2d890be..830f3176d7 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -24,7 +24,8 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, uint8_t temp[64 * 72]; // Account for the vertical phase needing 3 lines prior and 4 lines post - const int intermediate_height = h + 7; + // (+ 1 to make it divisible by 4). + const int intermediate_height = h + 8; assert(y_step_q4 == 16); assert(x_step_q4 == 16); @@ -48,7 +49,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { uint8_t temp[64 * 72]; - const int intermediate_height = h + 7; + const int intermediate_height = h + 8; assert(y_step_q4 == 16); assert(x_step_q4 == 16); diff --git a/libs/libvpx/vpx_dsp/avg.c b/libs/libvpx/vpx_dsp/avg.c index a7ac6d9538..1c45e8a73d 100644 --- a/libs/libvpx/vpx_dsp/avg.c +++ b/libs/libvpx/vpx_dsp/avg.c @@ -32,6 +32,166 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { return (sum + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH +// src_diff: 13 bit, dynamic range [-4095, 4095] +// coeff: 16 bit +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t 
src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // 
src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, @@ -123,6 +283,50 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, } } +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +// coeff: dynamic range 20 bit. +// length: value range {16, 64, 256, 1024}. +int vpx_highbd_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 30 bits + return satd; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. int vpx_satd_c(const tran_low_t *coeff, int length) { diff --git a/libs/libvpx/vpx_dsp/bitreader.h b/libs/libvpx/vpx_dsp/bitreader.h index 6ee2a58632..a5927ea2ad 100644 --- a/libs/libvpx/vpx_dsp/bitreader.h +++ b/libs/libvpx/vpx_dsp/bitreader.h @@ -8,10 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_BITREADER_H_ -#define VPX_DSP_BITREADER_H_ +#ifndef VPX_VPX_DSP_BITREADER_H_ +#define VPX_VPX_DSP_BITREADER_H_ #include +#include #include #include "./vpx_config.h" @@ -19,6 +20,9 @@ #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { @@ -94,7 +98,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { } { - register int shift = vpx_norm[range]; + const unsigned char shift = vpx_norm[(unsigned char)range]; range <<= shift; value <<= shift; count -= shift; @@ -103,6 +107,31 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { r->count = count; r->range = range; +#if CONFIG_BITSTREAM_DEBUG + { + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if ((int)bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } + } +#endif + return bit; } @@ -131,4 +160,4 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VPX_DSP_BITREADER_H_ +#endif // VPX_VPX_DSP_BITREADER_H_ diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.c b/libs/libvpx/vpx_dsp/bitreader_buffer.c index 3e16bfa38c..f59f1f7cb9 100644 --- a/libs/libvpx/vpx_dsp/bitreader_buffer.c +++ b/libs/libvpx/vpx_dsp/bitreader_buffer.c @@ -23,7 +23,7 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { rb->bit_offset = off + 1; return bit; } else { - rb->error_handler(rb->error_handler_data); + if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data); return 0; } } diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.h b/libs/libvpx/vpx_dsp/bitreader_buffer.h index 8a48a95ed1..b27703a4db 100644 --- a/libs/libvpx/vpx_dsp/bitreader_buffer.h +++ b/libs/libvpx/vpx_dsp/bitreader_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_BITREADER_BUFFER_H_ -#define VPX_DSP_BITREADER_BUFFER_H_ +#ifndef VPX_VPX_DSP_BITREADER_BUFFER_H_ +#define VPX_VPX_DSP_BITREADER_BUFFER_H_ #include @@ -44,4 +44,4 @@ int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits); } // extern "C" #endif -#endif // VPX_DSP_BITREADER_BUFFER_H_ +#endif // VPX_VPX_DSP_BITREADER_BUFFER_H_ diff --git a/libs/libvpx/vpx_dsp/bitwriter.c b/libs/libvpx/vpx_dsp/bitwriter.c index 81e28b309f..5b41aa54dd 100644 --- a/libs/libvpx/vpx_dsp/bitwriter.c +++ b/libs/libvpx/vpx_dsp/bitwriter.c @@ -12,6 +12,10 @@ #include "./bitwriter.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + void vpx_start_encode(vpx_writer *br, uint8_t *source) { br->lowvalue = 0; br->range = 255; @@ -24,8 +28,15 @@ void vpx_start_encode(vpx_writer *br, uint8_t *source) { void vpx_stop_encode(vpx_writer *br) { int i; +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(1); +#endif for (i = 0; i < 32; i++) vpx_write_bit(br, 0); // Ensure there's no ambiguous collision with any index marker bytes if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(0); +#endif } diff --git a/libs/libvpx/vpx_dsp/bitwriter.h b/libs/libvpx/vpx_dsp/bitwriter.h index 41040cf935..f276feefb1 100644 --- a/libs/libvpx/vpx_dsp/bitwriter.h +++ b/libs/libvpx/vpx_dsp/bitwriter.h @@ -8,12 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_BITWRITER_H_ -#define VPX_DSP_BITWRITER_H_ +#ifndef VPX_VPX_DSP_BITWRITER_H_ +#define VPX_VPX_DSP_BITWRITER_H_ + +#include #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { @@ -27,15 +32,30 @@ typedef struct vpx_writer { uint8_t *buffer; } vpx_writer; -void vpx_start_encode(vpx_writer *bc, uint8_t *buffer); -void vpx_stop_encode(vpx_writer *bc); +void vpx_start_encode(vpx_writer *br, uint8_t *source); +void vpx_stop_encode(vpx_writer *br); static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; + +#if CONFIG_BITSTREAM_DEBUG + /* + int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + assert(0); + } + */ + bitstream_queue_push(bit, probability); +#endif split = 1 + (((range - 1) * probability) >> 8); @@ -94,4 +114,4 @@ static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { } // extern "C" #endif -#endif // VPX_DSP_BITWRITER_H_ +#endif // VPX_VPX_DSP_BITWRITER_H_ diff --git a/libs/libvpx/vpx_dsp/bitwriter_buffer.h b/libs/libvpx/vpx_dsp/bitwriter_buffer.h index a123a2fe8c..3662cb64df 100644 --- a/libs/libvpx/vpx_dsp/bitwriter_buffer.h +++ b/libs/libvpx/vpx_dsp/bitwriter_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_BITWRITER_BUFFER_H_ -#define VPX_DSP_BITWRITER_BUFFER_H_ +#ifndef VPX_VPX_DSP_BITWRITER_BUFFER_H_ +#define VPX_VPX_DSP_BITWRITER_BUFFER_H_ #include "vpx/vpx_integer.h" @@ -35,4 +35,4 @@ void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, } // extern "C" #endif -#endif // VPX_DSP_BITWRITER_BUFFER_H_ +#endif // VPX_VPX_DSP_BITWRITER_BUFFER_H_ diff --git a/libs/libvpx/vpx_dsp/deblock.c b/libs/libvpx/vpx_dsp/deblock.c index 94acbb3919..455b73bbce 100644 --- a/libs/libvpx/vpx_dsp/deblock.c +++ b/libs/libvpx/vpx_dsp/deblock.c @@ -39,11 +39,10 @@ const int16_t vpx_rv[] = { 9, 10, 13, }; -void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, int cols, - unsigned char *f, int size) { +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, + unsigned char *dst, int src_pitch, + int dst_pitch, int cols, + unsigned char *flimits, int size) { unsigned char *p_src, *p_dst; int row; int col; @@ -55,19 +54,21 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (row = 0; row < size; row++) { /* post_proc_down for one row */ - p_src = src_ptr; - p_dst = dst_ptr; + p_src = src; + p_dst = dst; for (col = 0; col < cols; col++) { - unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line]; - unsigned char p_above1 = p_src[col - src_pixels_per_line]; - unsigned char p_below1 = p_src[col + src_pixels_per_line]; - unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line]; + unsigned char p_above2 = p_src[col - 2 * src_pitch]; + unsigned char p_above1 = p_src[col - src_pitch]; + unsigned char p_below1 = p_src[col + src_pitch]; + unsigned char p_below2 = p_src[col + 2 * src_pitch]; v = p_src[col]; - if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) && - (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { + if ((abs(v - p_above2) < flimits[col]) && + (abs(v - p_above1) < flimits[col]) && + (abs(v - p_below1) < flimits[col]) && + (abs(v - p_below2) < flimits[col])) { unsigned char k1, k2, k3; k1 = (p_above2 + p_above1 + 1) >> 1; k2 = (p_below2 + p_below1 + 1) >> 1; @@ -79,8 +80,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, } /* now post_proc_across */ - p_src = dst_ptr; - p_dst = dst_ptr; + p_src = dst; + p_dst = dst; p_src[-2] = p_src[-1] = p_src[0]; p_src[cols] = p_src[cols + 1] = p_src[cols - 1]; @@ -88,10 +89,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (col = 0; col < cols; col++) { v = p_src[col]; - if ((abs(v - p_src[col - 2]) < f[col]) && - (abs(v - p_src[col - 1]) < f[col]) && - (abs(v - p_src[col + 1]) < f[col]) && - (abs(v - p_src[col + 2]) < f[col])) { + if ((abs(v - p_src[col - 2]) < flimits[col]) && + (abs(v - p_src[col - 1]) < flimits[col]) && + (abs(v - p_src[col + 1]) < flimits[col]) && + (abs(v - p_src[col + 2]) < flimits[col])) { unsigned char k1, k2, k3; k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; @@ -109,8 +110,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, p_dst[col - 1] = d[(col - 1) & 3]; /* next row */ - src_ptr += src_pixels_per_line; - dst_ptr += dst_pixels_per_line; + src += src_pitch; + dst += dst_pitch; } } diff --git a/libs/libvpx/vpx_dsp/fastssim.c b/libs/libvpx/vpx_dsp/fastssim.c index 0469071a17..6ab6f557e2 100644 --- a/libs/libvpx/vpx_dsp/fastssim.c +++ b/libs/libvpx/vpx_dsp/fastssim.c @@ -128,10 +128,12 @@ static void 
fs_downsample_level(fs_ctx *_ctx, int _l) { int i1; i0 = 2 * i; i1 = FS_MINI(i0 + 1, w2); - dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + - src1[j1offs + i0] + src1[j1offs + i1]; - dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + - src2[j1offs + i0] + src2[j1offs + i1]; + dst1[j * w + i] = + (uint32_t)((int64_t)src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]); + dst2[j * w + i] = + (uint32_t)((int64_t)src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]); } } } @@ -220,12 +222,12 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { ssim = _ctx->level[_l].ssim; c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); for (j = 0; j < h; j++) { - unsigned mux; - unsigned muy; + int64_t mux; + int64_t muy; int i0; int i1; - mux = 5 * col_sums_x[0]; - muy = 5 * col_sums_y[0]; + mux = (int64_t)5 * col_sums_x[0]; + muy = (int64_t)5 * col_sums_y[0]; for (i = 1; i < 4; i++) { i1 = FS_MINI(i, w - 1); mux += col_sums_x[i1]; @@ -237,8 +239,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { if (i + 1 < w) { i0 = FS_MAXI(0, i - 4); i1 = FS_MINI(i + 4, w - 1); - mux += col_sums_x[i1] - col_sums_x[i0]; - muy += col_sums_x[i1] - col_sums_x[i0]; + mux += (int)col_sums_x[i1] - (int)col_sums_x[i0]; + muy += (int)col_sums_x[i1] - (int)col_sums_x[i0]; } } if (j + 1 < h) { @@ -246,8 +248,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; j1offs = FS_MINI(j + 4, h - 1) * w; - for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) + col_sums_x[i] = (uint32_t)((int64_t)col_sums_x[i] + im1[j1offs + i]); + for (i = 0; i < w; i++) + col_sums_y[i] = (uint32_t)((int64_t)col_sums_y[i] + im2[j1offs + i]); } } } @@ -343,18 +347,18 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { for (j = 0; j < h + 4; j++) { if (j < h - 1) { for (i = 0; i < w - 1; i++) { - unsigned g1; - unsigned g2; - unsigned gx; - unsigned gy; - g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); - g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + int64_t g1; + int64_t g2; + int64_t gx; + int64_t gy; + g1 = labs((int64_t)im1[(j + 1) * w + i + 1] - (int64_t)im1[j * w + i]); + g2 = labs((int64_t)im1[(j + 1) * w + i] - (int64_t)im1[j * w + i + 1]); gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); - g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); - gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - gx_buf[(j & 7) * stride + i + 4] = gx; - gy_buf[(j & 7) * stride + i + 4] = gy; + g1 = labs((int64_t)im2[(j + 1) * w + i + 1] - (int64_t)im2[j * w + i]); + g2 = labs((int64_t)im2[(j + 1) * w + i] - (int64_t)im2[j * w + i + 1]); + gy = ((int64_t)4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2)); + gx_buf[(j & 7) * stride + i + 4] = (uint32_t)gx; + gy_buf[(j & 7) * stride + i + 4] = (uint32_t)gy; } } else { memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.c b/libs/libvpx/vpx_dsp/fwd_txfm.c index 6dcb3ba668..ef66de0247 100644 --- a/libs/libvpx/vpx_dsp/fwd_txfm.c +++ b/libs/libvpx/vpx_dsp/fwd_txfm.c @@ -87,11 +87,11 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { output[0] = sum * 2; } -void vpx_fdct8x8_c(const int16_t *input, 
tran_low_t *final_output, int stride) { +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; tran_low_t intermediate[64]; int pass; - tran_low_t *output = intermediate; + tran_low_t *out = intermediate; const tran_low_t *in = NULL; // Transform columns @@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = (x0 - x1) * cospi_16_64; t2 = x2 * cospi_24_64 + x3 * cospi_8_64; t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0] = (tran_low_t)fdct_round_shift(t0); - output[2] = (tran_low_t)fdct_round_shift(t2); - output[4] = (tran_low_t)fdct_round_shift(t1); - output[6] = (tran_low_t)fdct_round_shift(t3); + out[0] = (tran_low_t)fdct_round_shift(t0); + out[2] = (tran_low_t)fdct_round_shift(t2); + out[4] = (tran_low_t)fdct_round_shift(t1); + out[6] = (tran_low_t)fdct_round_shift(t3); // Stage 2 t0 = (s6 - s5) * cospi_16_64; @@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1] = (tran_low_t)fdct_round_shift(t0); - output[3] = (tran_low_t)fdct_round_shift(t2); - output[5] = (tran_low_t)fdct_round_shift(t1); - output[7] = (tran_low_t)fdct_round_shift(t3); - output += 8; + out[1] = (tran_low_t)fdct_round_shift(t0); + out[3] = (tran_low_t)fdct_round_shift(t2); + out[5] = (tran_low_t)fdct_round_shift(t1); + out[7] = (tran_low_t)fdct_round_shift(t3); + out += 8; } in = intermediate; - output = final_output; + out = output; } // Rows for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + for (j = 0; j < 8; ++j) output[j + i * 8] /= 2; } } @@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - out[j + i * 32] = + output[j + i * 32] = (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); } } @@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Note that although we use dct_32_round in dct32 computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
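[Editor's note] The "(x + 1 + (x > 0)) >> 2" idiom in the column passes above is a divide-by-4 that rounds halves away from zero, avoiding the bias toward negative infinity that a plain arithmetic shift would introduce. A minimal standalone sketch (not part of the patch; the helper name is illustrative):

  #include <stdio.h>

  /* Same rounding the fdct32x32 column pass applies to temp_out[]:
     divide by 4, rounding halves away from zero. */
  static int round_shift2(int x) { return (x + 1 + (x > 0)) >> 2; }

  int main(void) {
    /* 5/4 -> 1, 6/4 -> 2, -5/4 -> -1, -6/4 -> -2 */
    printf("%d %d %d %d\n", round_shift2(5), round_shift2(6),
           round_shift2(-5), round_shift2(-6));
    return 0;
  }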
-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // TODO(cd): see quality impact of only doing // output[j * 32 + i] = (temp_out[j] + 1) >> 2; // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j]; } } @@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, vpx_fdct4x4_c(input, output, stride); } -void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_c(input, final_output, stride); + vpx_fdct8x8_c(input, output, stride); } -void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_1_c(input, final_output, stride); + vpx_fdct8x8_1_c(input, output, stride); } void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, @@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, vpx_fdct16x16_1_c(input, output, stride); } -void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - vpx_fdct32x32_c(input, out, stride); +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_c(input, output, stride); } -void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_rd_c(input, out, stride); + vpx_fdct32x32_rd_c(input, output, stride); } -void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_1_c(input, out, stride); + vpx_fdct32x32_1_c(input, output, stride); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.h b/libs/libvpx/vpx_dsp/fwd_txfm.h index 29e139c73b..a43c8ea7f7 100644 --- a/libs/libvpx/vpx_dsp/fwd_txfm.h +++ b/libs/libvpx/vpx_dsp/fwd_txfm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_FWD_TXFM_H_ -#define VPX_DSP_FWD_TXFM_H_ +#ifndef VPX_VPX_DSP_FWD_TXFM_H_ +#define VPX_VPX_DSP_FWD_TXFM_H_ #include "vpx_dsp/txfm_common.h" @@ -22,4 +22,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { } void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round); -#endif // VPX_DSP_FWD_TXFM_H_ +#endif // VPX_VPX_DSP_FWD_TXFM_H_ diff --git a/libs/libvpx/vpx_dsp/inv_txfm.c b/libs/libvpx/vpx_dsp/inv_txfm.c index 0194aa1e18..69de05e718 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.c +++ b/libs/libvpx/vpx_dsp/inv_txfm.c @@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1346,12 +1346,12 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; (void)bd; diff --git a/libs/libvpx/vpx_dsp/inv_txfm.h b/libs/libvpx/vpx_dsp/inv_txfm.h index 13137659fa..6eedbeac35 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.h +++ b/libs/libvpx/vpx_dsp/inv_txfm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_INV_TXFM_H_ -#define VPX_DSP_INV_TXFM_H_ +#ifndef VPX_VPX_DSP_INV_TXFM_H_ +#define VPX_VPX_DSP_INV_TXFM_H_ #include @@ -76,7 +76,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits - #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ @@ -123,4 +122,4 @@ static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { } // extern "C" #endif -#endif // VPX_DSP_INV_TXFM_H_ +#endif // VPX_VPX_DSP_INV_TXFM_H_ diff --git a/libs/libvpx/vpx_dsp/loopfilter.c b/libs/libvpx/vpx_dsp/loopfilter.c index 9866ea37d6..47f30c96af 100644 --- a/libs/libvpx/vpx_dsp/loopfilter.c +++ b/libs/libvpx/vpx_dsp/loopfilter.c @@ -109,29 +109,30 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } -void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
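/* [Editor's annotation -- not part of the upstream patch] p3..p0 are
   the four pixels on one side of the edge and q0..q3 the four on the
   other, read at offsets -4*pitch .. +3*pitch from s. The rename of
   the stride parameter from "p" to "pitch" throughout this file also
   removes the visual clash with the pixel names p0..p3. */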
for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch); ++s; } } -void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch, + s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch); ++s; } } -void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
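/* [Editor's annotation -- not part of the upstream patch] Per column,
   "mask" gates whether the edge is filtered at all, "flat" switches
   from the 4-tap filter to the 8-pixel smoothing, and "flat2" widens
   that to the full p7..q7 span; count is 1 for a single 8-column edge
   and 2 for the _dual variants. */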
for (i = 0; i < 8 * count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = - flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p]); + const int8_t flat2 = flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]); - filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, - s + 7 * p); + filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, + s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch, + s + 7 * pitch); ++s; } } -void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2); } -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); - s += p; + s += pitch; } } -void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8); } -void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16); } #if CONFIG_VP9_HIGHBITDEPTH @@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); } -void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; @@ -448,27 +455,28 @@ void 
vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, + s + 1 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_4_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
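/* [Editor's annotation -- not part of the upstream patch] The
   high-bit-depth variants repeat the 8-bit logic on uint16_t samples;
   bd is 8, 10 or 12, and sign-bias constants are scaled accordingly,
   e.g. the "0x80 << shift" in highbd_filter4 above. */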
for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, - s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_8_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
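/* [Editor's annotation -- not part of the upstream patch] As in the
   8-bit version, count selects one or two adjacent 8-pixel columns:
   the _16 wrapper passes 1 and the _16_dual wrapper passes 2. */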
for (i = 0; i < 8 * count; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat2 = - highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + const int8_t flat2 = highbd_flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd); - highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, - s + 6 * p, s + 7 * p, bd); + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, + s + 6 * pitch, s + 7 * pitch, bd); ++s; } } -void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); } -void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd); } -static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, bd); - s += p; + s += pitch; } } -void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd); } -void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, 
bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c index 43d2c1146e..97541411e4 100644 --- a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c +++ b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c @@ -9,7 +9,9 @@ */ #include <stdlib.h> -#include "./macros_msa.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, int blackclamp, int whiteclamp, int width, diff --git a/libs/libvpx/vpx_dsp/mips/avg_msa.c b/libs/libvpx/vpx_dsp/mips/avg_msa.c index d0ac7b8e29..3fd18dec56 100644 --- a/libs/libvpx/vpx_dsp/mips/avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/avg_msa.c @@ -9,6 +9,7 @@ */ #include <stdlib.h> +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -56,6 +57,7 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } +#if !CONFIG_VP9_HIGHBITDEPTH void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, int16_t *dst) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; @@ -391,6 +393,7 @@ int vpx_satd_msa(const int16_t *data, int length) { return satd; } +#endif // !CONFIG_VP9_HIGHBITDEPTH void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { diff --git a/libs/libvpx/vpx_dsp/mips/common_dspr2.h b/libs/libvpx/vpx_dsp/mips/common_dspr2.h index 0a42f5cec2..87a5bbab56 100644 --- a/libs/libvpx/vpx_dsp/mips/common_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_COMMON_MIPS_DSPR2_H_ -#define VPX_COMMON_MIPS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ #include <assert.h> #include "./vpx_config.h" @@ -45,4 +45,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) { } // extern "C" #endif -#endif // VPX_COMMON_MIPS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index d9c2bef69e..cc458c8618 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/mips/convolve_common_dspr2.h" #include "vpx_dsp/vpx_convolve.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 @@ -341,7 +342,7 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index fb68ad8813..7a9aa49d8a 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/mips/convolve_common_dspr2.h" #include "vpx_dsp/vpx_convolve.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 @@ -945,7 +946,7 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); - if (((const int32_t *)filter_x)[0] == 0)
{ + if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c index 89f0f41962..1e7052f6c5 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1322,7 +1322,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, if (filter_x[3] == 0x80) { copy_horiz_transposed(src - src_stride * 3, src_stride, temp, intermediate_height, w, intermediate_height); - } else if (((const int32_t *)filter_x)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp, intermediate_height, filter_x, w, intermediate_height); } else { @@ -1365,7 +1365,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, /* copy the src to dst */ if (filter_y[3] == 0x80) { copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); - } else if (((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, filter_y, h, w); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 77e95c8444..09d6f36e56 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -825,7 +825,7 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index c329f71ccf..fd977b5336 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -325,7 +325,7 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 48e440d73c..14b65bc650 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ -#define VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ #include <assert.h> @@ -55,4 +55,4 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/deblock_msa.c b/libs/libvpx/vpx_dsp/mips/deblock_msa.c index aafa272fbd..4e93ff594d 100644 --- a/libs/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/libs/libvpx/vpx_dsp/mips/deblock_msa.c @@ -10,42 +10,42 @@ #include <stdlib.h> -#include "./macros_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, \ - out15) \ - { \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ +#define VPX_TRANSPOSE8x16_UB_UB( \ + in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2,
(v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ } #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ @@ -509,11 +509,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst, } } -void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, - int32_t rows, int32_t cols, int32_t flimit) { +void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { int32_t row, col, cnt; - uint8_t *src_dup = src_ptr; - v16u8 src0, src, tmp_orig; + uint8_t *src_dup = src; + v16u8 src0, src1, tmp_orig; v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; @@ -532,13 +532,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src_dup[cols + 16] = src_dup[cols - 1]; tmp_orig = (v16u8)__msa_ldi_b(0); tmp_orig[15] = tmp[15]; - src = LD_UB(src_dup - 8); - src[15] = 0; - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup - 8); + src1[15] = 0; + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); sum_sq = HADD_SW_S32(src_r_w) + 16; - sum_h = __msa_hadd_u_h(src, src); + sum_h = __msa_hadd_u_h(src1, src1); sum = HADD_UH_U32(sum_h); { v16u8 src7, src8, src_r, src_l; @@ -567,8 +567,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; } sum = sum_l[7]; - src = LD_UB(src_dup + 16 * col); - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup + 16 * col); + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); @@ -614,7 +614,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8)mask); + tmp = __msa_bmz_v(tmp, src1, (v16u8)mask); if (col == 0) { uint64_t src_d; diff --git a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c index 06fdc951e7..36583e2d24 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" static void fdct8x32_1d_column_load_butterfly(const int16_t *input, diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h index fd589224d3..c0be56b819 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_ -#define VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" @@ -361,4 +361,4 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); void fdct16x8_1d_row(int16_t *input, int16_t *output); -#endif // VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c b/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c index 2a211c5677..7ca61a28ec 100644 --- a/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { diff --git a/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c index 2ea6136f9b..053948183a 100644 --- a/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" static void idct32x8_row_transpose_store(const int16_t *input, diff --git a/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c b/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c index 0a85742f10..56ffec3cba 100644 --- a/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, diff --git a/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c b/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c index 7f77d20191..a383ff2066 100644 --- a/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index 27881f0db6..cbea22f20f 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ -#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ #include <assert.h> @@ -25,7 +25,6 @@ extern "C" { #if HAVE_DSPR2 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ ({ \ - \ int32_t tmp, out; \ int dct_cost_rounding = DCT_CONST_ROUNDING; \ int in = input; \ @@ -73,4 +72,4 @@ void iadst16_dspr2(const int16_t *input, int16_t *output); } // extern "C" #endif -#endif // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h index 1fe9b28e8a..3b66249ef2 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_ -#define VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h" @@ -408,4 +408,4 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output); void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, int32_t dst_stride); void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); -#endif // VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h index 5b0c73345b..ec339be868 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ #include <stdlib.h> @@ -731,4 +731,4 @@ static INLINE void wide_mbfilter_dspr2( } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h index 38ed0b2a63..9af0b42360 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ #include <stdlib.h> @@ -432,4 +432,4 @@ extern "C" { } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h index ee11142266..24c492bea0 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ #include <stdlib.h> @@ -352,4 +352,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h index 49fd74c25a..1ea05e0b0b 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_LOOPFILTER_MSA_H_ -#define VPX_DSP_LOOPFILTER_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" @@ -174,4 +174,4 @@ mask_out = limit_in < (v16u8)mask_out; \ mask_out = __msa_xori_b(mask_out, 0xff); \ } -#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/macros_msa.h b/libs/libvpx/vpx_dsp/mips/macros_msa.h index f9a446e7bc..a3a5a4dfee 100644 --- a/libs/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_MACROS_MSA_H_ -#define VPX_DSP_MIPS_MACROS_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_MACROS_MSA_H_ #include <msa.h> @@ -1966,4 +1966,4 @@ \ tmp1_m; \ }) -#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_MACROS_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/sad_mmi.c b/libs/libvpx/vpx_dsp/mips/sad_mmi.c index 33bd3fe7f9..4368db5fdb 100644 --- a/libs/libvpx/vpx_dsp/mips/sad_mmi.c +++ b/libs/libvpx/vpx_dsp/mips/sad_mmi.c @@ -341,7 +341,7 @@ const uint8_t *ref_array, int ref_stride, \ uint32_t *sad_array) { \ int i; \ - for (i = 0; i < k; ++i) \ + for (i = 0; i < (k); ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ } diff --git a/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c index 313e06f92d..572fcabfc0 100644 --- a/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c +++ b/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -27,13 +27,14 @@ static const uint8_t bilinear_filters_msa[8][2] = { HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ \ - sub += res_l0_m + res_l1_m; \ + (sub) += res_l0_m + res_l1_m; \ } -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -1619,16 +1620,16 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( #define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ } else { \ @@ -1638,7 +1639,7 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( \ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ \
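/* A minimal standalone illustration (not libvpx code, and not part of this
 * patch) of the precedence bug that the parenthesization hunks above guard
 * against -- e.g. "(k)" in sad_mmi.c and "(sse)"/"(diff)"/"(shift)" in
 * VARIANCE_WxH: an unparenthesized macro parameter binds incorrectly when the
 * caller passes an expression. The macro names here are illustrative only. */
#include <stdio.h>

#define SQUARE_UNSAFE(x) x * x     /* expands textually, no grouping */
#define SQUARE_SAFE(x) ((x) * (x)) /* parameter and result parenthesized */

int main(void) {
  printf("%d\n", SQUARE_UNSAFE(1 + 2)); /* expands to 1 + 2 * 1 + 2 == 5 */
  printf("%d\n", SQUARE_SAFE(1 + 2));   /* ((1 + 2) * (1 + 2)) == 9 */
  return 0;
}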
@@ -1672,15 +1673,15 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1690,7 +1691,7 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ @@ -1719,16 +1720,16 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - int32_t xoffset, int32_t yoffset, + int32_t x_offset, int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; - if (yoffset) { - if (xoffset) { + if (y_offset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_hv_msa( src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, v_filter, 64, &diff); @@ -1738,7 +1739,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, v_filter, 64, &diff); } } else { - if (xoffset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, 64, &diff); @@ -1753,15 +1754,15 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1771,7 +1772,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ diff --git 
a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h index f077fa4814..f27504a207 100644 --- a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ -#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" @@ -98,4 +98,4 @@ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ } -#endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/variance_mmi.c b/libs/libvpx/vpx_dsp/mips/variance_mmi.c index 4af60d3634..c1780c33af 100644 --- a/libs/libvpx/vpx_dsp/mips/variance_mmi.c +++ b/libs/libvpx/vpx_dsp/mips/variance_mmi.c @@ -87,10 +87,10 @@ static const uint8_t bilinear_filters[8][2] = { "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" #define VARIANCE_SSE_8 \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ @@ -101,10 +101,10 @@ static const uint8_t bilinear_filters[8][2] = { #define VARIANCE_SSE_16 \ VARIANCE_SSE_8 \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ @@ -115,11 +115,11 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ @@ -129,11 +129,11 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp4], %[ftmp4], 
%[filter_x0] \n\t" \ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ @@ -169,12 +169,12 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ @@ -190,12 +190,12 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ @@ -258,12 +258,12 @@ static const uint8_t bilinear_filters[8][2] = { VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ \ /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ @@ -282,12 +282,12 @@ static const uint8_t bilinear_filters[8][2] = { VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ \ /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ @@ -357,24 +357,23 @@ static const uint8_t bilinear_filters[8][2] = { // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. 
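/* A minimal runnable model (not libvpx code) of the two-tap pass the comment
 * above describes and that var_filter_block2d_bil_first_pass below implements:
 * each output is the rounded weighted average of a pixel and its neighbor
 * pixel_step entries away, with the two taps summing to 128 (FILTER_BITS == 7,
 * as in the bilinear_filters table used by these kernels). Function and
 * variable names here are illustrative, not libvpx API. */
#include <stdint.h>
#include <stdio.h>

#define BITS 7 /* taps sum to 1 << BITS == 128 */

static void two_tap_pass(const uint8_t *in, int pixel_step, int count,
                         const uint8_t taps[2], uint16_t *out) {
  int j;
  for (j = 0; j < count; ++j) {
    const int acc = (int)in[j] * taps[0] + (int)in[j + pixel_step] * taps[1];
    out[j] = (uint16_t)((acc + (1 << (BITS - 1))) >> BITS); /* round */
  }
}

int main(void) {
  const uint8_t row[5] = { 10, 20, 30, 40, 50 };
  const uint8_t half_pel[2] = { 64, 64 }; /* offset 4 of the 8-entry table */
  uint16_t out[4];
  /* pixel_step == 1 filters horizontally; pass the stride for vertical. */
  two_tap_pass(row, 1, 4, half_pel, out);
  printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 15 25 35 45 */
  return 0;
}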
-static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -387,28 +386,27 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[12]; @@ -424,57 +422,57 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 
0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "mfc1 %[tmp1], %[ftmp9] \n\t" @@ -491,9 +489,10 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), [tmp2]"=&r"(tmp[2]), - [a]"+&r"(a), [b]"+&r"(b), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), [sum]"=&r"(sum) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -501,18 +500,19 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (64 * high)); } -#define VPX_VARIANCE64XN(n) \ - uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE64XN(64) VPX_VARIANCE64XN(32) -uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, uint32_t *sse) { +uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, + const uint8_t 
*ref_ptr, int ref_stride, + uint32_t *sse) { int sum; double ftmp[12]; uint32_t tmp[3]; @@ -527,33 +527,33 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "mfc1 %[tmp1], %[ftmp9] \n\t" @@ -570,9 +570,10 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), [tmp2]"=&r"(tmp[2]), - [a]"+&r"(a), [b]"+&r"(b), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), [sum]"=&r"(sum) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [sse]"r"(sse) : "memory" ); @@ -580,8 +581,8 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, return *sse - (((int64_t)sum * sum) / 2048); } -static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -598,30 +599,30 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 
%[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -646,8 +647,9 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -655,18 +657,18 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (32 * high)); } -#define VPX_VARIANCE32XN(n) \ - uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE32XN(32) VPX_VARIANCE32XN(16) -static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -683,20 +685,20 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu 
%[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -721,8 +723,9 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -730,19 +733,19 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (16 * high)); } -#define VPX_VARIANCE16XN(n) \ - uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE16XN(32) VPX_VARIANCE16XN(16) VPX_VARIANCE16XN(8) -static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -759,15 +762,15 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -792,8 +795,9 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -801,19 +805,19 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (8 * high)); } -#define VPX_VARIANCE8XN(n) \ - uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int 
src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE8XN(16) VPX_VARIANCE8XN(8) VPX_VARIANCE8XN(4) -static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[12]; @@ -830,15 +834,15 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_4 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" @@ -862,8 +866,9 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -871,19 +876,19 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (4 * high)); } -#define VPX_VARIANCE4XN(n) \ - uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE4XN(8) VPX_VARIANCE4XN(4) -static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, uint32_t *sse, - uint64_t high) { +static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { double ftmp[12]; uint32_t tmp[1]; @@ -900,8 +905,8 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, VARIANCE_SSE_16 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -914,8 +919,9 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + 
[ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -923,19 +929,19 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, return *sse; } -#define vpx_mse16xN(n) \ - uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } vpx_mse16xN(16); vpx_mse16xN(8); -static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, uint32_t *sse, - uint64_t high) { +static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { double ftmp[12]; uint32_t tmp[1]; @@ -952,8 +958,8 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, VARIANCE_SSE_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -966,8 +972,9 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -975,28 +982,29 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, return *sse; } -#define vpx_mse8xN(n) \ - uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } vpx_mse8xN(16); vpx_mse8xN(8); -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR(64, 64) @@ -1006,9 
+1014,10 @@ SUBPIX_VAR(32, 32) SUBPIX_VAR(32, 16) SUBPIX_VAR(16, 32) -static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; @@ -1016,8 +1025,8 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1031,26 +1040,26 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[15] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B // temp2: temp2[0] ~ temp2[15] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A // temp2+16*1: temp2[0] ~ temp2[15] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B @@ -1062,43 +1071,44 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), - [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR16XN(H) \ - uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[16 * H]; \ - var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + 
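The sub-pixel variance paths are two-pass: a horizontal bilinear filter produces an (H + 1)-row intermediate, a vertical pass reduces it to H rows, and the plain full-pel variance kernel finishes. A sketch of one bilinear pass, assuming the libvpx convention that the two taps sum to 1 << FILTER_BITS = 128 (the real second pass reads the 16-bit intermediate rather than uint8_t input; this is illustrative only):

    /* One bilinear pass: out[j] = round((src[j] * f[0] + src[j + step] * f[1])
     * / 128). The horizontal pass uses step = 1; the vertical pass steps by
     * the intermediate row width. */
    static void bilinear_pass_ref(const uint8_t *src, uint16_t *out, int stride,
                                  int step, int h, int w, const uint8_t *f) {
      int i, j;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j)
          out[j] = (uint16_t)((src[j] * f[0] + src[j + step] * f[1] + 64) >> 7);
        src += stride;
        out += w;
      }
    }
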
uint8_t temp2[16 * (H)]; \ + var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR16XN(16) SUBPIX_VAR16XN(8) -static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; mips_reg tmp[2]; DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1112,26 +1122,26 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[7] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B // temp2: temp2[0] ~ temp2[7] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A // temp2+8*1: temp2[0] ~ temp2[7] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B @@ -1143,44 +1153,45 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), - [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR8XN(H) \ - uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[8 * H]; \ - var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[8 * (H)]; \ + var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR8XN(16) SUBPIX_VAR8XN(8) SUBPIX_VAR8XN(4) -static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[7]; mips_reg tmp[2]; DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1193,26 +1204,26 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[3] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B // temp2: temp2[0] ~ temp2[7] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A // temp2+4*1: temp2[0] ~ temp2[7] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B @@ -1220,49 +1231,49 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, "bnez %[counter], 1b \n\t" : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), - [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR4XN(H) \ - uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[4 * H]; \ - var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ +#define SUBPIX_VAR4XN(H) \ + uint32_t 
vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[4 * (H)]; \ + var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR4XN(8) SUBPIX_VAR4XN(4) -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \ } SUBPIX_AVG_VAR(64, 64) diff --git a/libs/libvpx/vpx_dsp/mips/variance_msa.c b/libs/libvpx/vpx_dsp/mips/variance_msa.c index 49b2f99230..444b086a6e 100644 --- a/libs/libvpx/vpx_dsp/mips/variance_msa.c +++ b/libs/libvpx/vpx_dsp/mips/variance_msa.c @@ -33,10 +33,11 @@ sub += res_l0_m + res_l1_m; \ } -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index 187a013421..5b5a1cbc3a 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -658,7 +658,7 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor[cnt] = filter_x[cnt]; } - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { switch (w) { case 4: common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 5187cea21c..ba816192a1 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -538,8 +538,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t 
*)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, @@ -571,8 +571,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, y0_q4, y_step_q4, w, h); break; } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index ef8c901140..e6a790dfc6 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -625,7 +625,7 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 152dc26104..792c0f709c 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -634,7 +634,7 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor[cnt] = filter_x[cnt]; } - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { switch (w) { case 4: common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c new file mode 100644 index 0000000000..ba9ceb8665 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx_ports/mem.h" + +#define GET_DATA_H_MMI \ + "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ + "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \ + "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \ + "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ + "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srch], %[srch], %[filter10] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" + +/* clang-format off */ +#define ROUND_POWER_OF_TWO_MMI \ + /* Add para[0] */ \ + "lw %[tmp0], 0x00(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp6]) \ + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \ + "paddw %[srch], %[srch], %[ftmp6] \n\t" \ + /* Arithmetic right shift para[1] bits */ \ + "lw %[tmp0], 0x04(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp5]) \ + "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \ + "psraw %[srch], %[srch], %[ftmp5] \n\t" +/* clang-format on */ + +#define CLIP_PIXEL_MMI \ + /* Saturated operation */ \ + "packsswh %[srcl], %[srcl], %[srch] \n\t" \ + "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t" + +static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[5]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; +
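ROUND_POWER_OF_TWO_MMI above fetches the offset and shift stored in para[] and applies them lane-wise; in scalar terms it is libvpx's usual rounding right shift. A one-line equivalent, for orientation only:

    /* With para[0] = 1 << (FILTER_BITS - 1) and para[1] = FILTER_BITS:
     * add half the divisor, then shift, i.e. divide by 2^n rounding to
     * nearest. */
    static int round_power_of_two_ref(int value, int n) {
      return (value + (1 << (n - 1))) >> n;
    }
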
dst_stride -= w; + (void)x_step_q4; + + /* clang-format off */ + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + 
"gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], 
%[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] 
\n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), 
[tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } +} + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. 
+ // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w & 0x03) { + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + 64, filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } else { + convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } +} + +void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); + else + convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); + else + convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + else + convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + // Fixed size 
intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index d35a5a7a63..c942167587 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -558,8 +558,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, @@ -591,8 +591,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, x_step_q4, y0_q4, y_step_q4, w, h); break; } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 13fce0077c..195228689e 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -641,7 +641,7 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index d53244596b..a0280c5434 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ -#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/vpx_filter.h" @@ -119,4 +119,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } -#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/postproc.h b/libs/libvpx/vpx_dsp/postproc.h index 43cb5c8e8d..37f993f814 100644 --- a/libs/libvpx/vpx_dsp/postproc.h +++ b/libs/libvpx/vpx_dsp/postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
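The MSA dispatch changes in these hunks replace a type-punned peek at the first taps, ((const int32_t *)filter_x)[0] == 0, with vpx_get_filter_taps(): an 8-tap kernel whose outer taps are all zero is effectively the 2-tap bilinear kernel, so the cheaper 2-tap path can be taken without aliasing through int32_t. A plausible shape of the helper — the actual definition lives in vpx_dsp/vpx_filter.h, so treat this as a sketch:

    static int vpx_get_filter_taps_sketch(const int16_t *const filter) {
      /* In libvpx's filter tables only the bilinear family has zero
       * leading taps. */
      if (!filter[0] && !filter[1] && !filter[2]) return 2;
      return 8;
    }
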
*/ -#ifndef VPX_DSP_POSTPROC_H_ -#define VPX_DSP_POSTPROC_H_ +#ifndef VPX_VPX_DSP_POSTPROC_H_ +#define VPX_VPX_DSP_POSTPROC_H_ #ifdef __cplusplus extern "C" { @@ -22,4 +22,4 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size); } #endif -#endif // VPX_DSP_POSTPROC_H_ +#endif // VPX_VPX_DSP_POSTPROC_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h index 2c5d9a4f6a..7ac873f9fc 100644 --- a/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h +++ b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ -#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#ifndef VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#define VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -44,4 +44,4 @@ static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) { #endif } -#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#endif // VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/deblock_vsx.c b/libs/libvpx/vpx_dsp/ppc/deblock_vsx.c new file mode 100644 index 0000000000..2129911696 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/deblock_vsx.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +extern const int16_t vpx_rv[]; + +static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, + 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v, + uint8x16_t filter) { + const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]); + const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]); + const uint8x16_t k3 = vec_avg(k1, k2); + const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1])); + const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3])); + const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter); + return vec_sel(v, vec_avg(k3, v), mask); +} + +static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src, + int stride) { + ctx[0] = vec_vsx_ld(col - 2 * stride, src); + ctx[1] = vec_vsx_ld(col - stride, src); + ctx[2] = vec_vsx_ld(col + stride, src); + ctx[3] = vec_vsx_ld(col + 2 * stride, src); +} + +static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx, + uint8x16_t v, uint8x16_t right_ctx) { + static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1A, 0x1B, 0x1C, 0x1D }; + + static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E }; + + static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10 }; + + static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11 }; + ctx[0] = vec_perm(left_ctx, v, l2_perm); + ctx[1] = vec_perm(left_ctx, v, l1_perm); + ctx[2] = vec_perm(v, right_ctx, r1_perm); + ctx[3] = vec_perm(v, right_ctx, r2_perm); +} +void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int cols, + unsigned char *f, int size) { + int row, col; + uint8x16_t ctx[4], out, v, left_ctx; + + for (row = 0; row < size; row++) { + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + /* now post_proc_across */ + left_ctx = vec_splats(dst_ptr[0]); + v = vec_vsx_ld(0, dst_ptr); + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = (col + 16 == cols) + ? vec_splats(dst_ptr[cols - 1]) + : vec_vsx_ld(col, dst_ptr + 16); + horz_ctx(ctx, left_ctx, v, right_ctx); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + left_ctx = v; + v = right_ctx; + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]); + horz_ctx(ctx, left_ctx, v, right_ctx); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +// C: s[c + 7] +static INLINE int16x8_t next7l_s16(uint8x16_t c) { + static const uint8x16_t next7_perm = { + 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13, + 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17, + }; + return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm); +} + +// Slide across window and add. +static INLINE int16x8_t slide_sum_s16(int16x8_t x) { + // x = A B C D E F G H + // + // 0 A B C D E F G + const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); + // 0 0 A B C D E F + const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), + // 0 0 0 A B C D E + vec_slo(x, vec_splats((int8_t)(6 << 3)))); + // 0 0 0 0 A B C D + const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), + // 0 0 0 0 0 A B C + vec_slo(x, vec_splats((int8_t)(10 << 3)))); + // 0 0 0 0 0 0 A B + const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), + // 0 0 0 0 0 0 0 A + vec_slo(x, vec_splats((int8_t)(14 << 3)))); + return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4)); +} + +// Slide across window and add. 
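slide_sum_s16 above (and slide_sumsq_s32 just below) accumulate byte-shifted copies of their input so that lane i of the result holds the inclusive prefix sum x[0] + ... + x[i]; adding that to the previous window sum advances all eight running sums in one step. The scalar picture, for reference:

    /* Lane i of the output is x[0] + x[1] + ... + x[i], matching the
     * 0 A B C ... shift patterns sketched in the comments above. */
    static void prefix_sum_ref(const int16_t x[8], int16_t out[8]) {
      int32_t acc = 0;
      int i;
      for (i = 0; i < 8; ++i) {
        acc += x[i];
        out[i] = (int16_t)acc;
      }
    }
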
+static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { + // 0 A C E + // + 0 B D F + int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); + // 0 0 A C + // + 0 0 B D + int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); + // 0 0 0 A + // + 0 0 0 B + int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3)))); + sumsq_1 = vec_add(sumsq_1, xsq_even); + sumsq_2 = vec_add(sumsq_2, sumsq_3); + return vec_add(sumsq_1, sumsq_2); +} + +// C: (b + sum + val) >> 4 +static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) { + return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4)); +} + +// C: sumsq * 15 - sum * sum +static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd, + int16x8_t sum, int32x4_t lim) { + static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, + 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + const int32x4_t sumsq_odd_scaled = + vec_mul(sumsq_odd, vec_splats((int32_t)15)); + const int32x4_t sumsq_even_scaled = + vec_mul(sumsq_even, vec_splats((int32_t)15)); + const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum)); + const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum)); + + const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim); + const bool32x4_t mask_even = vec_cmplt(thres_even, lim); + return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge); +} + +void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int row, col; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + + for (row = 0; row < rows; row++) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + // Fill left context with first col. + int16x8_t left_ctx = vec_splats((int16_t)src[0]); + int16_t s = src[0] * 9; + int32_t ssq = src[0] * src[0] * 9 + 16; + + // Fill the next 6 columns of the sliding window with cols 2 to 7. + for (col = 1; col <= 6; ++col) { + s += src[col]; + ssq += src[col] * src[col]; + } + // Set this sum to every element in the window. + sum = vec_splats(s); + sumsq_even = vec_splats(ssq); + sumsq_odd = vec_splats(ssq); + + for (col = 0; col < cols; col += 8) { + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const uint8x16_t val = vec_vsx_ld(0, src + col); + const int16x8_t val_high = unpack_to_s16_h(val); + + // C: s[c + 7] + const int16x8_t right_ctx = (col + 8 == cols) + ? 
vec_splats((int16_t)src[col + 7]) + : next7l_s16(val); + + // C: x = s[c + 7] - s[c - 8]; + const int16x8_t x = vec_sub(right_ctx, left_ctx); + const int32x4_t xsq_even = + vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx)); + const int32x4_t xsq_odd = + vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx)); + + const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_even = vec_add(sumsq_even, sumsq_tmp); + // B D F G + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd)); + + sum = vec_add(sum, slide_sum_s16(x)); + + // C: (8 + sum + s[c]) >> 4 + filtered = filter_s16(vec_splats((int16_t)8), sum, val_high); + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(val_high, filtered, mask); + + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge); + vec_vsx_st(out, 0, src + col); + + // Update window sum and square sum + sum = vec_splat(sum, 7); + sumsq_even = vec_splat(sumsq_odd, 3); + sumsq_odd = vec_splat(sumsq_odd, 3); + + // C: s[c - 8] (for next iteration) + left_ctx = val_high; + } + src += pitch; + } +} + +void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int col, row, i; + int16x8_t window[16]; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + // If rows is less than 8 the bottom border extension fails. + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t r1, sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); + // Fill sliding window with first row. + for (i = 0; i <= 8; i++) { + window[i] = r1; + } + // First 9 rows of the sliding window are the same. + // sum = r1 * 9 + sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16); + + // sumsq = r1 * r1 * 9 + sumsq_even = vec_mule(sum, r1); + sumsq_odd = vec_mulo(sum, r1); + + // Fill the next 6 rows of the sliding window with rows 2 to 7. + for (i = 1; i <= 6; ++i) { + const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst)); + window[i + 8] = next_row; + sum = vec_add(sum, next_row); + sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row)); + sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row)); + } + + for (row = 0; row < rows; row++) { + int32x4_t d15_even, d15_odd, d0_even, d0_odd; + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127)); + + // Move the sliding window + if (row + 7 < rows) { + window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst)); + } else { + window[15] = window[14]; + } + + // C: sum += s[7 * pitch] - s[-8 * pitch]; + sum = vec_add(sum, vec_sub(window[15], window[0])); + + // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * + // pitch]; + // Optimization Note: Caching a squared-window for odd and even is + // slower than just repeating the multiplies. 
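Both mbpost filters keep 15-tap running sums so each step costs one add and one subtract per lane rather than re-summing the window. Stripped of the vectorization, and of the delayed write-back the real code uses so filtered pixels never feed back into the window, the per-pixel recurrence matching the C: comments above is:

    /* Minimal scalar sketch: 15-tap window centered on c, covering
     * s[c - 7] .. s[c + 7]. Assumes c is far enough from the borders. */
    static void across_step_ref(unsigned char *s, int c, int *sum, int *sumsq,
                                int flimit) {
      *sum += s[c + 7] - s[c - 8];
      *sumsq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
      /* Filter only where the window variance test passes. */
      if (*sumsq * 15 - *sum * *sum < flimit)
        s[c] = (unsigned char)((8 + *sum + s[c]) >> 4);
    }
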
+      d15_odd = vec_mulo(window[15], window[15]);
+      d15_even = vec_mule(window[15], window[15]);
+      d0_odd = vec_mulo(window[0], window[0]);
+      d0_even = vec_mule(window[0], window[0]);
+      sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd));
+      sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even));
+
+      // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4
+      filtered = filter_s16(rv, sum, window[8]);
+
+      // C: sumsq * 15 - sum * sum
+      mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
+      masked = vec_sel(window[8], filtered, mask);
+
+      // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per
+      // iteration
+      out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch),
+                     load_merge);
+      vec_vsx_st(out, 0, dst + row * pitch);
+
+      // Optimization Note: Turns out that the following loop is faster than
+      // using pointers to manage the sliding window.
+      for (i = 1; i < 16; i++) {
+        window[i - 1] = window[i];
+      }
+    }
+    dst += 8;
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c b/libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
new file mode 100644
index 0000000000..328b0e3130
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/txfm_common_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Returns ((a +/- b) * cospi16 + (1 << 13)) >> 14.
+static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
+                                    int16x8_t *sub) {
+  // Since a + b can overflow 16 bits, the multiplication is distributed
+  // (a * c +/- b * c).
+  const int32x4_t ac_e = vec_mule(a, cospi16_v);
+  const int32x4_t ac_o = vec_mulo(a, cospi16_v);
+  const int32x4_t bc_e = vec_mule(b, cospi16_v);
+  const int32x4_t bc_o = vec_mulo(b, cospi16_v);
+
+  // Reuse the same multiplies for sum and difference.
+  const int32x4_t sum_e = vec_add(ac_e, bc_e);
+  const int32x4_t sum_o = vec_add(ac_o, bc_o);
+  const int32x4_t diff_e = vec_sub(ac_e, bc_e);
+  const int32x4_t diff_o = vec_sub(ac_o, bc_o);
+
+  // Add rounding offset
+  const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
+  const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
+  const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
+  const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
+
+  const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
+  const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
+  const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
+  const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
+
+  // There's no pack operation for even and odd, so we need to permute.
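+  // Note: vec_perm_odd_even_pack (assumed to be defined in
+  // txfm_common_vsx.h) narrows each 32-bit lane back to 16 bits while
+  // restoring the original element order.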
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
+}
+
+// Returns (a * c1 +/- b * c2 + (1 << 13)) >> 14
+static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b,
+                                    int16x8_t c2, int16x8_t *add,
+                                    int16x8_t *sub) {
+  const int32x4_t ac1_o = vec_mulo(a, c1);
+  const int32x4_t ac1_e = vec_mule(a, c1);
+  const int32x4_t ac2_o = vec_mulo(a, c2);
+  const int32x4_t ac2_e = vec_mule(a, c2);
+
+  const int32x4_t bc1_o = vec_mulo(b, c1);
+  const int32x4_t bc1_e = vec_mule(b, c1);
+  const int32x4_t bc2_o = vec_mulo(b, c2);
+  const int32x4_t bc2_e = vec_mule(b, c2);
+
+  const int32x4_t sum_o = vec_add(ac1_o, bc2_o);
+  const int32x4_t sum_e = vec_add(ac1_e, bc2_e);
+  const int32x4_t diff_o = vec_sub(ac2_o, bc1_o);
+  const int32x4_t diff_e = vec_sub(ac2_e, bc1_e);
+
+  // Add rounding offset
+  const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
+  const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
+  const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
+  const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
+
+  const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
+  const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
+  const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
+  const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
+
+  // There's no pack operation for even and odd, so we need to permute.
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
+}
+
+// While other architectures combine the load and the stage 1 operations,
+// Power9 benchmarking shows no benefit to such an approach.
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
+  // Several combinations of load and shift instructions were tried; this is
+  // the fastest one.
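+  // The shifts below apply the first-pass input scaling of the C fdct32x32
+  // (input * 4); this assumes vec_dct_scale_log2 is a splat of 2, i.e.
+  //   b[i] = vec_vsx_ld(0, a + i * stride) << 2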
+ { + const int16x8_t l0 = vec_vsx_ld(0, a); + const int16x8_t l1 = vec_vsx_ld(0, a + stride); + const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride); + const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride); + const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride); + const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride); + const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride); + const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride); + + const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride); + const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride); + const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride); + const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride); + const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride); + const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride); + const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride); + const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride); + + b[0] = vec_sl(l0, vec_dct_scale_log2); + b[1] = vec_sl(l1, vec_dct_scale_log2); + b[2] = vec_sl(l2, vec_dct_scale_log2); + b[3] = vec_sl(l3, vec_dct_scale_log2); + b[4] = vec_sl(l4, vec_dct_scale_log2); + b[5] = vec_sl(l5, vec_dct_scale_log2); + b[6] = vec_sl(l6, vec_dct_scale_log2); + b[7] = vec_sl(l7, vec_dct_scale_log2); + + b[8] = vec_sl(l8, vec_dct_scale_log2); + b[9] = vec_sl(l9, vec_dct_scale_log2); + b[10] = vec_sl(l10, vec_dct_scale_log2); + b[11] = vec_sl(l11, vec_dct_scale_log2); + b[12] = vec_sl(l12, vec_dct_scale_log2); + b[13] = vec_sl(l13, vec_dct_scale_log2); + b[14] = vec_sl(l14, vec_dct_scale_log2); + b[15] = vec_sl(l15, vec_dct_scale_log2); + } + { + const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride); + const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride); + const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride); + const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride); + const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride); + const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride); + const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride); + const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride); + + const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride); + const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride); + const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride); + const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride); + const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride); + const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride); + const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride); + const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride); + + b[16] = vec_sl(l16, vec_dct_scale_log2); + b[17] = vec_sl(l17, vec_dct_scale_log2); + b[18] = vec_sl(l18, vec_dct_scale_log2); + b[19] = vec_sl(l19, vec_dct_scale_log2); + b[20] = vec_sl(l20, vec_dct_scale_log2); + b[21] = vec_sl(l21, vec_dct_scale_log2); + b[22] = vec_sl(l22, vec_dct_scale_log2); + b[23] = vec_sl(l23, vec_dct_scale_log2); + + b[24] = vec_sl(l24, vec_dct_scale_log2); + b[25] = vec_sl(l25, vec_dct_scale_log2); + b[26] = vec_sl(l26, vec_dct_scale_log2); + b[27] = vec_sl(l27, vec_dct_scale_log2); + b[28] = vec_sl(l28, vec_dct_scale_log2); + b[29] = vec_sl(l29, vec_dct_scale_log2); + b[30] = vec_sl(l30, vec_dct_scale_log2); + b[31] = vec_sl(l31, vec_dct_scale_log2); + } +} + +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + vec_vsx_st(b[0], 0, a); + vec_vsx_st(b[8], 0, a + 8); + vec_vsx_st(b[16], 0, a + 16); + vec_vsx_st(b[24], 0, a + 24); + + vec_vsx_st(b[1], 0, a + 32); + vec_vsx_st(b[9], 0, a + 40); + vec_vsx_st(b[17], 0, a + 48); + vec_vsx_st(b[25], 0, a + 56); + + vec_vsx_st(b[2], 0, a + 64); + vec_vsx_st(b[10], 0, a + 72); + vec_vsx_st(b[18], 0, a + 80); + 
vec_vsx_st(b[26], 0, a + 88);
+
+  vec_vsx_st(b[3], 0, a + 96);
+  vec_vsx_st(b[11], 0, a + 104);
+  vec_vsx_st(b[19], 0, a + 112);
+  vec_vsx_st(b[27], 0, a + 120);
+
+  vec_vsx_st(b[4], 0, a + 128);
+  vec_vsx_st(b[12], 0, a + 136);
+  vec_vsx_st(b[20], 0, a + 144);
+  vec_vsx_st(b[28], 0, a + 152);
+
+  vec_vsx_st(b[5], 0, a + 160);
+  vec_vsx_st(b[13], 0, a + 168);
+  vec_vsx_st(b[21], 0, a + 176);
+  vec_vsx_st(b[29], 0, a + 184);
+
+  vec_vsx_st(b[6], 0, a + 192);
+  vec_vsx_st(b[14], 0, a + 200);
+  vec_vsx_st(b[22], 0, a + 208);
+  vec_vsx_st(b[30], 0, a + 216);
+
+  vec_vsx_st(b[7], 0, a + 224);
+  vec_vsx_st(b[15], 0, a + 232);
+  vec_vsx_st(b[23], 0, a + 240);
+  vec_vsx_st(b[31], 0, a + 248);
+}
+
+// Returns 1 if negative, 0 if positive.
+static INLINE int16x8_t vec_sign_s16(int16x8_t a) {
+  return vec_sr(a, vec_shift_sign_s16);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
+  const int16x8_t sign = vec_sign_s16(a);
+  return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+  const int16x8_t sign = vec_sign_s16(a);
+  return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
+}
+
+static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
+  int16x8_t temp0[32];  // Hold stages: 1, 4, 7
+  int16x8_t temp1[32];  // Hold stages: 2, 5
+  int16x8_t temp2[32];  // Hold stages: 3, 6
+  int i;
+
+  // Stage 1
+  // Unrolling this loop actually slows down Power9 benchmarks.
+  for (i = 0; i < 16; i++) {
+    temp0[i] = vec_add(in[i], in[31 - i]);
+    // pass through to stage 3.
+    temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
+  }
+
+  // Stage 2
+  // Unrolling this loop actually slows down Power9 benchmarks.
+  for (i = 0; i < 8; i++) {
+    temp1[i] = vec_add(temp0[i], temp0[15 - i]);
+    temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
+  }
+
+  // Apply butterflies (in place) on pass through to stage 3.
+  single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]);
+  single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]);
+  single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]);
+  single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]);
+
+  // Damp the magnitude by 4 so that the intermediate values stay within
+  // the range of 16 bits.
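+  // Per-element sketch of add_round_shift_s16 as used below:
+  //   out = (in + 1 + (in < 0 ? 1 : 0)) >> 2
+  // e.g. (-5 + 1 + 1) >> 2 = -1 and (5 + 1 + 0) >> 2 = 1.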
+ if (pass) { + temp1[0] = add_round_shift_s16(temp1[0]); + temp1[1] = add_round_shift_s16(temp1[1]); + temp1[2] = add_round_shift_s16(temp1[2]); + temp1[3] = add_round_shift_s16(temp1[3]); + temp1[4] = add_round_shift_s16(temp1[4]); + temp1[5] = add_round_shift_s16(temp1[5]); + temp1[6] = add_round_shift_s16(temp1[6]); + temp1[7] = add_round_shift_s16(temp1[7]); + temp1[8] = add_round_shift_s16(temp1[8]); + temp1[9] = add_round_shift_s16(temp1[9]); + temp1[10] = add_round_shift_s16(temp1[10]); + temp1[11] = add_round_shift_s16(temp1[11]); + temp1[12] = add_round_shift_s16(temp1[12]); + temp1[13] = add_round_shift_s16(temp1[13]); + temp1[14] = add_round_shift_s16(temp1[14]); + temp1[15] = add_round_shift_s16(temp1[15]); + + temp1[16] = add_round_shift_s16(temp1[16]); + temp1[17] = add_round_shift_s16(temp1[17]); + temp1[18] = add_round_shift_s16(temp1[18]); + temp1[19] = add_round_shift_s16(temp1[19]); + temp1[20] = add_round_shift_s16(temp1[20]); + temp1[21] = add_round_shift_s16(temp1[21]); + temp1[22] = add_round_shift_s16(temp1[22]); + temp1[23] = add_round_shift_s16(temp1[23]); + temp1[24] = add_round_shift_s16(temp1[24]); + temp1[25] = add_round_shift_s16(temp1[25]); + temp1[26] = add_round_shift_s16(temp1[26]); + temp1[27] = add_round_shift_s16(temp1[27]); + temp1[28] = add_round_shift_s16(temp1[28]); + temp1[29] = add_round_shift_s16(temp1[29]); + temp1[30] = add_round_shift_s16(temp1[30]); + temp1[31] = add_round_shift_s16(temp1[31]); + } + + // Stage 3 + temp2[0] = vec_add(temp1[0], temp1[7]); + temp2[1] = vec_add(temp1[1], temp1[6]); + temp2[2] = vec_add(temp1[2], temp1[5]); + temp2[3] = vec_add(temp1[3], temp1[4]); + temp2[5] = vec_sub(temp1[2], temp1[5]); + temp2[6] = vec_sub(temp1[1], temp1[6]); + temp2[8] = temp1[8]; + temp2[9] = temp1[9]; + + single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]); + single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]); + temp2[14] = temp1[14]; + temp2[15] = temp1[15]; + + temp2[18] = vec_add(temp1[18], temp1[21]); + temp2[19] = vec_add(temp1[19], temp1[20]); + + temp2[20] = vec_sub(temp1[19], temp1[20]); + temp2[21] = vec_sub(temp1[18], temp1[21]); + + temp2[26] = vec_sub(temp1[29], temp1[26]); + temp2[27] = vec_sub(temp1[28], temp1[27]); + + temp2[28] = vec_add(temp1[28], temp1[27]); + temp2[29] = vec_add(temp1[29], temp1[26]); + + // Pass through Stage 4 + temp0[7] = vec_sub(temp1[0], temp1[7]); + temp0[4] = vec_sub(temp1[3], temp1[4]); + temp0[16] = vec_add(temp1[16], temp1[23]); + temp0[17] = vec_add(temp1[17], temp1[22]); + temp0[22] = vec_sub(temp1[17], temp1[22]); + temp0[23] = vec_sub(temp1[16], temp1[23]); + temp0[24] = vec_sub(temp1[31], temp1[24]); + temp0[25] = vec_sub(temp1[30], temp1[25]); + temp0[30] = vec_add(temp1[30], temp1[25]); + temp0[31] = vec_add(temp1[31], temp1[24]); + + // Stage 4 + temp0[0] = vec_add(temp2[0], temp2[3]); + temp0[1] = vec_add(temp2[1], temp2[2]); + temp0[2] = vec_sub(temp2[1], temp2[2]); + temp0[3] = vec_sub(temp2[0], temp2[3]); + single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]); + + temp0[9] = vec_add(temp2[9], temp2[10]); + temp0[10] = vec_sub(temp2[9], temp2[10]); + temp0[13] = vec_sub(temp2[14], temp2[13]); + temp0[14] = vec_add(temp2[14], temp2[13]); + + double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29], + &temp0[18]); + double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28], + &temp0[19]); + double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27], + &temp0[20]); + double_butterfly(temp2[26], cospi24_v, temp2[21], 
cospi8m_v, &temp0[26], + &temp0[21]); + + // Pass through Stage 5 + temp1[8] = vec_add(temp2[8], temp2[11]); + temp1[11] = vec_sub(temp2[8], temp2[11]); + temp1[12] = vec_sub(temp2[15], temp2[12]); + temp1[15] = vec_add(temp2[15], temp2[12]); + + // Stage 5 + // 0 and 1 pass through to 0 and 16 at the end + single_butterfly(temp0[0], temp0[1], &out[0], &out[16]); + + // 2 and 3 pass through to 8 and 24 at the end + double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]); + + temp1[4] = vec_add(temp0[4], temp0[5]); + temp1[5] = vec_sub(temp0[4], temp0[5]); + temp1[6] = vec_sub(temp0[7], temp0[6]); + temp1[7] = vec_add(temp0[7], temp0[6]); + + double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14], + &temp1[9]); + double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13], + &temp1[10]); + + temp1[17] = vec_add(temp0[17], temp0[18]); + temp1[18] = vec_sub(temp0[17], temp0[18]); + + temp1[21] = vec_sub(temp0[22], temp0[21]); + temp1[22] = vec_add(temp0[22], temp0[21]); + + temp1[25] = vec_add(temp0[25], temp0[26]); + temp1[26] = vec_sub(temp0[25], temp0[26]); + + temp1[29] = vec_sub(temp0[30], temp0[29]); + temp1[30] = vec_add(temp0[30], temp0[29]); + + // Pass through Stage 6 + temp2[16] = vec_add(temp0[16], temp0[19]); + temp2[19] = vec_sub(temp0[16], temp0[19]); + temp2[20] = vec_sub(temp0[23], temp0[20]); + temp2[23] = vec_add(temp0[23], temp0[20]); + temp2[24] = vec_add(temp0[24], temp0[27]); + temp2[27] = vec_sub(temp0[24], temp0[27]); + temp2[28] = vec_sub(temp0[31], temp0[28]); + temp2[31] = vec_add(temp0[31], temp0[28]); + + // Stage 6 + // 4 and 7 pass through to 4 and 28 at the end + double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]); + // 5 and 6 pass through to 20 and 12 at the end + double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20], + &out[12]); + temp2[8] = vec_add(temp1[8], temp1[9]); + temp2[9] = vec_sub(temp1[8], temp1[9]); + temp2[10] = vec_sub(temp1[11], temp1[10]); + temp2[11] = vec_add(temp1[11], temp1[10]); + temp2[12] = vec_add(temp1[12], temp1[13]); + temp2[13] = vec_sub(temp1[12], temp1[13]); + temp2[14] = vec_sub(temp1[15], temp1[14]); + temp2[15] = vec_add(temp1[15], temp1[14]); + + double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30], + &temp2[17]); + double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29], + &temp2[18]); + double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26], + &temp2[21]); + double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25], + &temp2[22]); + + // Stage 7 + double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]); + double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18], + &out[14]); + double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10], + &out[22]); + double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26], + &out[6]); + + temp0[16] = vec_add(temp2[16], temp2[17]); + temp0[17] = vec_sub(temp2[16], temp2[17]); + temp0[18] = vec_sub(temp2[19], temp2[18]); + temp0[19] = vec_add(temp2[19], temp2[18]); + temp0[20] = vec_add(temp2[20], temp2[21]); + temp0[21] = vec_sub(temp2[20], temp2[21]); + temp0[22] = vec_sub(temp2[23], temp2[22]); + temp0[23] = vec_add(temp2[23], temp2[22]); + temp0[24] = vec_add(temp2[24], temp2[25]); + temp0[25] = vec_sub(temp2[24], temp2[25]); + temp0[26] = vec_sub(temp2[27], temp2[26]); + temp0[27] = vec_add(temp2[27], temp2[26]); + temp0[28] = vec_add(temp2[28], temp2[29]); + temp0[29] = 
vec_sub(temp2[28], temp2[29]);
+  temp0[30] = vec_sub(temp2[31], temp2[30]);
+  temp0[31] = vec_add(temp2[31], temp2[30]);
+
+  // Final stage: output indices are bit-reversed.
+  double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1],
+                   &out[31]);
+  double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17],
+                   &out[15]);
+  double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9],
+                   &out[23]);
+  double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25],
+                   &out[7]);
+  double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5],
+                   &out[27]);
+  double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21],
+                   &out[11]);
+  double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13],
+                   &out[19]);
+  double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29],
+                   &out[3]);
+
+  if (pass == 0) {
+    for (i = 0; i < 32; i++) {
+      out[i] = sub_round_shift(out[i]);
+    }
+  }
+}
+
+void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) {
+  int16x8_t temp0[32];
+  int16x8_t temp1[32];
+  int16x8_t temp2[32];
+  int16x8_t temp3[32];
+  int16x8_t temp4[32];
+  int16x8_t temp5[32];
+  int16x8_t temp6[32];
+
+  // Process in 8x32 columns.
+  load(input, stride, temp0);
+  fdct32_vsx(temp0, temp1, 0);
+
+  load(input + 8, stride, temp0);
+  fdct32_vsx(temp0, temp2, 0);
+
+  load(input + 16, stride, temp0);
+  fdct32_vsx(temp0, temp3, 0);
+
+  load(input + 24, stride, temp0);
+  fdct32_vsx(temp0, temp4, 0);
+
+  // Generate the top row by transposing the first set of 8 vectors from
+  // each column pass together.
+  transpose_8x8(&temp1[0], &temp0[0]);
+  transpose_8x8(&temp2[0], &temp0[8]);
+  transpose_8x8(&temp3[0], &temp0[16]);
+  transpose_8x8(&temp4[0], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out, temp6);
+
+  // Second row of 8x32.
+  transpose_8x8(&temp1[8], &temp0[0]);
+  transpose_8x8(&temp2[8], &temp0[8]);
+  transpose_8x8(&temp3[8], &temp0[16]);
+  transpose_8x8(&temp4[8], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out + 8 * 32, temp6);
+
+  // Third row of 8x32.
+  transpose_8x8(&temp1[16], &temp0[0]);
+  transpose_8x8(&temp2[16], &temp0[8]);
+  transpose_8x8(&temp3[16], &temp0[16]);
+  transpose_8x8(&temp4[16], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out + 16 * 32, temp6);
+
+  // Final row of 8x32.
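+  // Same pattern as the previous strips: transpose the last set of 8 vectors
+  // from each column pass into row order, run the rounded second pass, then
+  // transpose back and store.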
+ transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 24 * 32, temp6); +} diff --git a/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c index 6273460f19..a4c8322ff2 100644 --- a/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -35,6 +35,8 @@ void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, } } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, @@ -87,6 +89,7 @@ void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, dst += stride; vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); } +#endif void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -233,6 +236,8 @@ void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, H_PREDICTOR_32(v15_1); } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); @@ -311,6 +316,7 @@ void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, val = vec_sub(vec_add(vec_splat(l, 7), a), tl); vec_vsx_st(vec_packsu(val, tmp), 0, dst); } +#endif static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, int16x8_t ah, int16x8_t al, int16x8_t tl) { @@ -547,6 +553,8 @@ void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, dc_fill_predictor_32x32(dst, stride, avg32(above)); } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { const uint8x16_t a0 = vec_vsx_ld(0, above); const uint8x16_t l0 = vec_vsx_ld(0, left); @@ -559,6 +567,7 @@ static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), 3); } +#endif static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { const uint8x16_t a0 = vec_vsx_ld(0, above); @@ -573,10 +582,13 @@ static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { 3); } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); } +#endif void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -615,6 +627,8 @@ static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; +// TODO(crbug.com/webm/1522): Fix test failures. 
+#if 0 void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t af = vec_vsx_ld(0, above); @@ -633,6 +647,7 @@ void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, row = vec_perm(row, above_right, sl1); } } +#endif void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -674,6 +689,8 @@ void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, } } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t af = vec_vsx_ld(0, above); @@ -696,6 +713,7 @@ void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, row1 = vec_perm(row1, above_right, sl1); } } +#endif void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { diff --git a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c index d43a9fd184..e99412ecab 100644 --- a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -14,67 +14,129 @@ #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" #include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, - 16364, 16364, 16364, 16364 }; -static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, - 16305, 16305, 16305, 16305 }; -static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, - 16207, 16207, 16207, 16207 }; -static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, - 16069, 16069, 16069, 16069 }; -static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, - -16069, -16069, -16069, -16069 }; -static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, - 15893, 15893, 15893, 15893 }; -static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, - 15679, 15679, 15679, 15679 }; -static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, - 15426, 15426, 15426, 15426 }; -static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, - 15137, 15137, 15137, 15137 }; -static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, - -15137, -15137, -15137, -15137 }; -static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, - 14811, 14811, 14811, 14811 }; -static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, - 14449, 14449, 14449, 14449 }; -static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, - 14053, 14053, 14053, 14053 }; -static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, - 13623, 13623, 13623, 13623 }; -static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, - 13160, 13160, 13160, 13160 }; -static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, - 12665, 12665, 12665, 12665 }; -static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, - 12140, 12140, 12140, 12140 }; -static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, - 11585, 11585, 11585, 11585 }; -static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, - 11003, 11003, 11003, 11003 }; -static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, - 10394, 10394, 10394, 10394 }; -static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; -static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; -static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, - -9102, -9102, -9102, -9102 }; -static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; 
-static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; -static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; -static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; -static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, - -6270, -6270, -6270, -6270 }; -static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; -static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; -static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; -static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; -static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; -static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; -static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364, + -16364, -16364, -16364, -16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305, + -16305, -16305, -16305, -16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893, + -15893, -15893, -15893, -15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811, + -14811, -14811, -14811, -14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449, + -14449, -14449, -14449, -14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623, + -13623, -13623, -13623, -13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160, + -13160, -13160, -13160, -13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585, + -11585, -11585, -11585, -11585 }; +static const int16x8_t 
cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003, + -11003, -11003, -11003, -11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394, + -10394, -10394, -10394, -10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423, + -8423, -8423, -8423, -8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520, + -5520, -5520, -5520, -5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756, + -4756, -4756, -4756, -4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196, + -3196, -3196, -3196, -3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404, + -2404, -2404, -2404, -2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283, + 5283, 5283, 5283, 5283 }; +static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929, + 9929, 9929, 9929, 9929 }; +static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377, + 13377, 13377, 13377, 13377 }; +static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212, + 15212, 15212, 15212, 15212 }; + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; + +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ @@ -107,19 +169,18 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); -void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { - int32x4_t temp1, temp2, temp3, temp4; - int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; - uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; - 
int16x8_t v0 = load_tran_low(0, input); - int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); - int16x8_t t0 = vec_mergeh(v0, v1); - int16x8_t t1 = vec_mergel(v0, v1); +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride) { + int i, j; uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -129,31 +190,45 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t tmp16_0, tmp16_1; uint8x16_t output_v; uint8_t tmp_dest[16]; - ROUND_SHIFT_INIT PIXEL_ADD_INIT; - v0 = vec_mergeh(t0, t1); - v1 = vec_mergel(t0, t1); + PIXEL_ADD4(out[0], in[0]); + PIXEL_ADD4(out[1], in[1]); - IDCT4(v0, v1, t_out0, t_out1); - // transpose - t0 = vec_mergeh(t_out0, t_out1); - t1 = vec_mergel(t_out0, t_out1); - v0 = vec_mergeh(t0, t1); - v1 = vec_mergel(t0, t1); - IDCT4(v0, v1, t_out0, t_out1); + PACK_STORE(out[0], out[1]); +} - PIXEL_ADD4(v0, t_out0); - PIXEL_ADD4(v1, t_out1); - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); - tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); - output_v = vec_packsu(tmp16_0, tmp16_1); +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + int16x8_t t0 = vec_mergeh(in[0], in[1]); + int16x8_t t1 = vec_mergel(in[0], in[1]); + ROUND_SHIFT_INIT - vec_vsx_st(output_v, 0, tmp_dest); - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + in[0] = vec_mergeh(t0, t1); + in[1] = vec_mergel(t0, t1); + + IDCT4(in[0], in[1], out[0], out[1]); +} + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + // Rows + vpx_idct4_vsx(in, out); + + // Columns + vpx_idct4_vsx(out, in); + + vpx_round_store4x4_vsx(in, out, dest, stride); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -255,28 +330,20 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, #define PIXEL_ADD(in, out, add, shiftx) \ out = vec_add(vec_sra(vec_add(in, add), shiftx), out); -static uint8x16_t tr8_mask0 = { - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 -}; -static uint8x16_t tr8_mask1 = { - 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, - 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F -}; -void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { - int32x4_t temp10, temp11; +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) { int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; - int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, - tmp16_2, tmp16_3; - int16x8_t src0 = load_tran_low(0, input); - int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); - int16x8_t src2 = 
load_tran_low(16 * sizeof(*input), input); - int16x8_t src3 = load_tran_low(24 * sizeof(*input), input); - int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); - int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); - int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); - int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3; + int32x4_t temp10, temp11; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); +} + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) { + uint8x16_t zerov = vec_splat_u8(0); uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -285,7 +352,6 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); - uint8x16_t zerov = vec_splat_u8(0); int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); @@ -297,23 +363,15 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); uint16x8_t shift5 = vec_splat_u16(5); uint8x16_t output0, output1, output2, output3; - ROUND_SHIFT_INIT; - TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, tmp6, tmp7); - - IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, - src3, src4, src5, src6, src7); - IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); - PIXEL_ADD(src0, d_u0, add, shift5); - PIXEL_ADD(src1, d_u1, add, shift5); - PIXEL_ADD(src2, d_u2, add, shift5); - PIXEL_ADD(src3, d_u3, add, shift5); - PIXEL_ADD(src4, d_u4, add, shift5); - PIXEL_ADD(src5, d_u5, add, shift5); - PIXEL_ADD(src6, d_u6, add, shift5); - PIXEL_ADD(src7, d_u7, add, shift5); + PIXEL_ADD(in[0], d_u0, add, shift5); + PIXEL_ADD(in[1], d_u1, add, shift5); + PIXEL_ADD(in[2], d_u2, add, shift5); + PIXEL_ADD(in[3], d_u3, add, shift5); + PIXEL_ADD(in[4], d_u4, add, shift5); + PIXEL_ADD(in[5], d_u5, add, shift5); + PIXEL_ADD(in[6], d_u6, add, shift5); + PIXEL_ADD(in[7], d_u7, add, shift5); output0 = vec_packsu(d_u0, d_u1); output1 = vec_packsu(d_u2, d_u3); output2 = vec_packsu(d_u4, d_u5); @@ -329,24 +387,24 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); } -#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ - in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ - in0 = load(offset, source); \ - in1 = load((step) + (offset), source); \ - in2 = load(2 * (step) + (offset), source); \ - in3 = load(3 * (step) + (offset), source); \ - in4 = load(4 * (step) + (offset), source); \ - in5 = load(5 * (step) + (offset), source); \ - in6 = load(6 * (step) + (offset), source); \ - in7 = load(7 * (step) + (offset), source); \ - in8 = load(8 * (step) + (offset), source); \ - in9 = load(9 * (step) + (offset), source); \ - inA = load(10 * (step) + (offset), source); \ - inB = load(11 * (step) + (offset), source); \ - inC = load(12 * (step) + (offset), source); \ - inD 
= load(13 * (step) + (offset), source); \ - inE = load(14 * (step) + (offset), source); \ - inF = load(15 * (step) + (offset), source); +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src[8], tmp[8]; + + src[0] = load_tran_low(0, input); + src[1] = load_tran_low(8 * sizeof(*input), input); + src[2] = load_tran_low(16 * sizeof(*input), input); + src[3] = load_tran_low(24 * sizeof(*input), input); + src[4] = load_tran_low(32 * sizeof(*input), input); + src[5] = load_tran_low(40 * sizeof(*input), input); + src[6] = load_tran_low(48 * sizeof(*input), input); + src[7] = load_tran_low(56 * sizeof(*input), input); + + vpx_idct8_vsx(src, tmp); + vpx_idct8_vsx(tmp, src); + + vpx_round_store8x8_vsx(src, dest, stride); +} #define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ tmp16_0 = vec_mergeh(inpt0, inpt1); \ @@ -446,9 +504,9 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, tmp16_0 = vec_mergeh(outA, outD); \ tmp16_1 = vec_mergel(outA, outD); \ temp10 = \ - vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v)); \ temp11 = \ - vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v)); \ DCT_CONST_ROUND_SHIFT(temp10); \ DCT_CONST_ROUND_SHIFT(temp11); \ inA = vec_packs(temp10, temp11); \ @@ -520,95 +578,131 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD(in1, d_ul, add, shift6); \ vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); -void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { +static void half_idct16x8_vsx(int16x8_t *src) { + int16x8_t tmp0[8], tmp1[8]; int32x4_t temp10, temp11, temp20, temp21, temp30; - int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, - src11, src12, src13, src14, src15, src16, src17; - int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, - src31, src32, src33, src34, src35, src36, src37; - int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; - int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; - uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, - dest9, destA, destB, destC, destD, destE, destF; - int16x8_t d_uh, d_ul; - int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); - uint16x8_t shift6 = vec_splat_u16(6); - uint8x16_t zerov = vec_splat_u8(0); + int16x8_t tmp16_0, tmp16_1; ROUND_SHIFT_INIT; - // transform rows - // load and transform the upper half of 16x16 matrix - LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, - src11, src02, src12, src03, src13, src04, src14, src05, src15, - src06, src16, src07, src17); - TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, - tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); - TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); - IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, - tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, - src04, src05, src06, src07, src10, src11, src12, src13, src14, src15, - src16, src17); - TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, - tmp01, tmp02, tmp03, tmp04, 
tmp05, tmp06, tmp07); - TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12], + src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13], + src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14], + src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]); +} - // load and transform the lower half of 16x16 matrix +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + + IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); +} + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride) { + uint8x16_t destv[16]; + int16x8_t d_uh, d_ul; + uint8x16_t zerov = vec_splat_u8(0); + uint16x8_t shift6 = vec_splat_u16(6); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv); + + PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0); + PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride); + PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride); + PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride); + PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride); + PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride); + PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride); + PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride); + + PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride); + PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride); + PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride); + PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride); + PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride); + PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride); + 
PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride); + PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride); +} +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[16], src1[16]; + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0); LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), - 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, - src23, src33, src24, src34, src25, src35, src26, src36, src27, - src37); - TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, - tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); - TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); - IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, - tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, - src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, - src36, src37); - TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, - tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); - TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + 8 * sizeof(*input), src1); + + // transform rows + // transform the upper half of 16x16 matrix + half_idct16x8_vsx(src0); + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + + // transform the lower half of 16x16 matrix + half_idct16x8_vsx(src1); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); // transform columns // left half first - IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, - tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, - src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, - src26, src27); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); // right half - IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, - tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, - src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, - src36, src37); + IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); - // load dest - LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, - 
dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD, - destE, destF); - - PIXEL_ADD_STORE16(src00, src10, dest0, 0); - PIXEL_ADD_STORE16(src01, src11, dest1, stride); - PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); - PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); - PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); - PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); - PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); - PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); - - PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); - PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); - PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); - PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); - PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); - PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); - PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); - PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); + vpx_round_store16x16_vsx(src0, src1, dest, stride); } #define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ @@ -980,15 +1074,15 @@ void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD(in3, d_ul, add, shift6); \ vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); -#define ADD_STORE_BLOCK(in, offset) \ - PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \ - PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \ - PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \ - PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \ - PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \ - PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \ - PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \ - PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7); +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], (offset) + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], (offset) + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], (offset) + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], (offset) + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], (offset) + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], (offset) + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], (offset) + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], (offset) + 7); void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { @@ -1061,3 +1155,674 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, ADD_STORE_BLOCK(src2, 16); ADD_STORE_BLOCK(src3, 24); } + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + 
uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} + +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v; + int32x4_t v_v[5], u_v[4]; + int32x4_t zerov = vec_splat_s32(0); + int16x8_t tmp0, tmp1; + int16x8_t zero16v = vec_splat_s16(0); + uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1)); + ROUND_SHIFT_INIT; + + sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v); + sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v); + sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v); + sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v); + sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v), + vec_sub(zero16v, sinpi_3_9_v)); + + tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]); + tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]); + in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1); + in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1); + + v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov); + v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov); + v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov); + v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov); + v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov); + + in[0] = vec_sub(in[0], in[1]); + in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16); + in[0] = vec_add(in[0], in[1]); + in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16); + + u_v[0] = vec_add(v_v[0], v_v[1]); + u_v[1] = vec_sub(v_v[2], v_v[3]); + u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov); + u_v[3] = vec_sub(v_v[1], v_v[3]); + u_v[3] = vec_add(u_v[3], v_v[4]); + + DCT_CONST_ROUND_SHIFT(u_v[0]); + DCT_CONST_ROUND_SHIFT(u_v[1]); + DCT_CONST_ROUND_SHIFT(u_v[2]); + DCT_CONST_ROUND_SHIFT(u_v[3]); + + out[0] = vec_packs(u_v[0], u_v[1]); + out[1] = vec_packs(u_v[2], u_v[3]); +} + +#define MSUM_ROUND_SHIFT(a, b, cospi) \ + b = vec_msums(a, cospi, zerov); \ + DCT_CONST_ROUND_SHIFT(b); + +#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \ + MSUM_ROUND_SHIFT(in0, tmp0, cospi); \ + MSUM_ROUND_SHIFT(in1, tmp1, cospi); \ + out = vec_packs(tmp0, tmp1); + +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[16], tmp1[16]; + + int32x4_t zerov = vec_splat_s32(0); + int16x8_t zero16v = vec_splat_s16(0); + int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v); + int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v); + int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v); + int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v); + int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v); + 
int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v); + int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v); + int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v); + int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v); + int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + // stage 1 + // interleave and multiply/add into 32-bit integer + in[0] = vec_mergeh(out[7], out[0]); + in[1] = vec_mergel(out[7], out[0]); + in[2] = vec_mergeh(out[5], out[2]); + in[3] = vec_mergel(out[5], out[2]); + in[4] = vec_mergeh(out[3], out[4]); + in[5] = vec_mergel(out[3], out[4]); + in[6] = vec_mergeh(out[1], out[6]); + in[7] = vec_mergel(out[1], out[6]); + + tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov); + tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov); + tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov); + tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov); + tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov); + tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov); + tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov); + tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov); + tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov); + tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov); + tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov); + tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov); + tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov); + tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov); + tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov); + tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[8]); + tmp0[1] = vec_add(tmp1[1], tmp1[9]); + tmp0[2] = vec_add(tmp1[2], tmp1[10]); + tmp0[3] = vec_add(tmp1[3], tmp1[11]); + tmp0[4] = vec_add(tmp1[4], tmp1[12]); + tmp0[5] = vec_add(tmp1[5], tmp1[13]); + tmp0[6] = vec_add(tmp1[6], tmp1[14]); + tmp0[7] = vec_add(tmp1[7], tmp1[15]); + tmp0[8] = vec_sub(tmp1[0], tmp1[8]); + tmp0[9] = vec_sub(tmp1[1], tmp1[9]); + tmp0[10] = vec_sub(tmp1[2], tmp1[10]); + tmp0[11] = vec_sub(tmp1[3], tmp1[11]); + tmp0[12] = vec_sub(tmp1[4], tmp1[12]); + tmp0[13] = vec_sub(tmp1[5], tmp1[13]); + tmp0[14] = vec_sub(tmp1[6], tmp1[14]); + tmp0[15] = vec_sub(tmp1[7], tmp1[15]); + + // shift and rounding + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + DCT_CONST_ROUND_SHIFT(tmp0[8]); + DCT_CONST_ROUND_SHIFT(tmp0[9]); + DCT_CONST_ROUND_SHIFT(tmp0[10]); + DCT_CONST_ROUND_SHIFT(tmp0[11]); + DCT_CONST_ROUND_SHIFT(tmp0[12]); + DCT_CONST_ROUND_SHIFT(tmp0[13]); + DCT_CONST_ROUND_SHIFT(tmp0[14]); + DCT_CONST_ROUND_SHIFT(tmp0[15]); + + // back to 16-bit + out[0] = vec_packs(tmp0[0], tmp0[1]); + out[1] = vec_packs(tmp0[2], tmp0[3]); + out[2] = vec_packs(tmp0[4], tmp0[5]); + out[3] = vec_packs(tmp0[6], tmp0[7]); + out[4] = vec_packs(tmp0[8], tmp0[9]); + out[5] = vec_packs(tmp0[10], tmp0[11]); + out[6] = vec_packs(tmp0[12], tmp0[13]); + out[7] = vec_packs(tmp0[14], tmp0[15]); + + // stage 2 + in[0] = vec_add(out[0], out[2]); + in[1] = vec_add(out[1], out[3]); + in[2] = vec_sub(out[0], out[2]); + in[3] = vec_sub(out[1], 
out[3]); + in[4] = vec_mergeh(out[4], out[5]); + in[5] = vec_mergel(out[4], out[5]); + in[6] = vec_mergeh(out[6], out[7]); + in[7] = vec_mergel(out[6], out[7]); + + tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov); + tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov); + tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov); + tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov); + tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov); + tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov); + tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov); + tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[4]); + tmp0[1] = vec_add(tmp1[1], tmp1[5]); + tmp0[2] = vec_add(tmp1[2], tmp1[6]); + tmp0[3] = vec_add(tmp1[3], tmp1[7]); + tmp0[4] = vec_sub(tmp1[0], tmp1[4]); + tmp0[5] = vec_sub(tmp1[1], tmp1[5]); + tmp0[6] = vec_sub(tmp1[2], tmp1[6]); + tmp0[7] = vec_sub(tmp1[3], tmp1[7]); + + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + + in[4] = vec_packs(tmp0[0], tmp0[1]); + in[5] = vec_packs(tmp0[2], tmp0[3]); + in[6] = vec_packs(tmp0[4], tmp0[5]); + in[7] = vec_packs(tmp0[6], tmp0[7]); + + // stage 3 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v); + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v); + + out[0] = in[0]; + out[2] = in[6]; + out[4] = in[3]; + out[6] = in[5]; + + out[1] = vec_sub(zero16v, in[4]); + out[3] = vec_sub(zero16v, in[2]); + out[5] = vec_sub(zero16v, in[7]); + out[7] = vec_sub(zero16v, in[1]); +} + +static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[32], tmp1[32]; + int16x8_t tmp16_0[8]; + int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v); + int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v); + int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v); + int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v); + int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v); + int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v); + int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v); + int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v); + int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v); + int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v); + int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v); + int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v); + int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v); + int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v); + int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v); + int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v); + int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v); + int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v); + int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v); + int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v); + int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v); + int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v); + int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v); + 
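+  // DCT_CONST_ROUND_SHIFT() below is the vector form of the scalar rounding
+  // shift (x + 8192) >> 14, assuming ROUND_SHIFT_INIT binds the usual
+  // DCT_CONST_BITS == 14 rounding constants; it is applied to every 32-bit
+  // product sum before the results are packed back to 16 bits.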
int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v); + int32x4_t zerov = vec_splat_s32(0); + ROUND_SHIFT_INIT; + + tmp16_0[0] = vec_mergeh(in[15], in[0]); + tmp16_0[1] = vec_mergel(in[15], in[0]); + tmp16_0[2] = vec_mergeh(in[13], in[2]); + tmp16_0[3] = vec_mergel(in[13], in[2]); + tmp16_0[4] = vec_mergeh(in[11], in[4]); + tmp16_0[5] = vec_mergel(in[11], in[4]); + tmp16_0[6] = vec_mergeh(in[9], in[6]); + tmp16_0[7] = vec_mergel(in[9], in[6]); + tmp16_0[8] = vec_mergeh(in[7], in[8]); + tmp16_0[9] = vec_mergel(in[7], in[8]); + tmp16_0[10] = vec_mergeh(in[5], in[10]); + tmp16_0[11] = vec_mergel(in[5], in[10]); + tmp16_0[12] = vec_mergeh(in[3], in[12]); + tmp16_0[13] = vec_mergel(in[3], in[12]); + tmp16_0[14] = vec_mergeh(in[1], in[14]); + tmp16_0[15] = vec_mergel(in[1], in[14]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov); + tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov); + tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov); + tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov); + tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov); + tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov); + tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov); + tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov); + tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov); + tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov); + tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov); + tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov); + tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov); + tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov); + tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov); + tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov); + tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[16]); + tmp1[1] = vec_add(tmp0[1], tmp0[17]); + tmp1[2] = vec_add(tmp0[2], tmp0[18]); + tmp1[3] = vec_add(tmp0[3], tmp0[19]); + tmp1[4] = vec_add(tmp0[4], tmp0[20]); + tmp1[5] = vec_add(tmp0[5], tmp0[21]); + tmp1[6] = vec_add(tmp0[6], tmp0[22]); + tmp1[7] = vec_add(tmp0[7], tmp0[23]); + tmp1[8] = vec_add(tmp0[8], tmp0[24]); + tmp1[9] = vec_add(tmp0[9], tmp0[25]); + tmp1[10] = vec_add(tmp0[10], tmp0[26]); + tmp1[11] = vec_add(tmp0[11], tmp0[27]); + tmp1[12] = vec_add(tmp0[12], tmp0[28]); + tmp1[13] = vec_add(tmp0[13], tmp0[29]); + tmp1[14] = vec_add(tmp0[14], tmp0[30]); + tmp1[15] = vec_add(tmp0[15], tmp0[31]); + tmp1[16] = vec_sub(tmp0[0], tmp0[16]); + tmp1[17] = vec_sub(tmp0[1], tmp0[17]); + tmp1[18] = vec_sub(tmp0[2], tmp0[18]); + tmp1[19] = vec_sub(tmp0[3], tmp0[19]); + tmp1[20] = vec_sub(tmp0[4], tmp0[20]); + tmp1[21] 
= vec_sub(tmp0[5], tmp0[21]); + tmp1[22] = vec_sub(tmp0[6], tmp0[22]); + tmp1[23] = vec_sub(tmp0[7], tmp0[23]); + tmp1[24] = vec_sub(tmp0[8], tmp0[24]); + tmp1[25] = vec_sub(tmp0[9], tmp0[25]); + tmp1[26] = vec_sub(tmp0[10], tmp0[26]); + tmp1[27] = vec_sub(tmp0[11], tmp0[27]); + tmp1[28] = vec_sub(tmp0[12], tmp0[28]); + tmp1[29] = vec_sub(tmp0[13], tmp0[29]); + tmp1[30] = vec_sub(tmp0[14], tmp0[30]); + tmp1[31] = vec_sub(tmp0[15], tmp0[31]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + DCT_CONST_ROUND_SHIFT(tmp1[16]); + DCT_CONST_ROUND_SHIFT(tmp1[17]); + DCT_CONST_ROUND_SHIFT(tmp1[18]); + DCT_CONST_ROUND_SHIFT(tmp1[19]); + DCT_CONST_ROUND_SHIFT(tmp1[20]); + DCT_CONST_ROUND_SHIFT(tmp1[21]); + DCT_CONST_ROUND_SHIFT(tmp1[22]); + DCT_CONST_ROUND_SHIFT(tmp1[23]); + DCT_CONST_ROUND_SHIFT(tmp1[24]); + DCT_CONST_ROUND_SHIFT(tmp1[25]); + DCT_CONST_ROUND_SHIFT(tmp1[26]); + DCT_CONST_ROUND_SHIFT(tmp1[27]); + DCT_CONST_ROUND_SHIFT(tmp1[28]); + DCT_CONST_ROUND_SHIFT(tmp1[29]); + DCT_CONST_ROUND_SHIFT(tmp1[30]); + DCT_CONST_ROUND_SHIFT(tmp1[31]); + + in[0] = vec_packs(tmp1[0], tmp1[1]); + in[1] = vec_packs(tmp1[2], tmp1[3]); + in[2] = vec_packs(tmp1[4], tmp1[5]); + in[3] = vec_packs(tmp1[6], tmp1[7]); + in[4] = vec_packs(tmp1[8], tmp1[9]); + in[5] = vec_packs(tmp1[10], tmp1[11]); + in[6] = vec_packs(tmp1[12], tmp1[13]); + in[7] = vec_packs(tmp1[14], tmp1[15]); + in[8] = vec_packs(tmp1[16], tmp1[17]); + in[9] = vec_packs(tmp1[18], tmp1[19]); + in[10] = vec_packs(tmp1[20], tmp1[21]); + in[11] = vec_packs(tmp1[22], tmp1[23]); + in[12] = vec_packs(tmp1[24], tmp1[25]); + in[13] = vec_packs(tmp1[26], tmp1[27]); + in[14] = vec_packs(tmp1[28], tmp1[29]); + in[15] = vec_packs(tmp1[30], tmp1[31]); + + // stage 2 + tmp16_0[0] = vec_mergeh(in[8], in[9]); + tmp16_0[1] = vec_mergel(in[8], in[9]); + tmp16_0[2] = vec_mergeh(in[10], in[11]); + tmp16_0[3] = vec_mergel(in[10], in[11]); + tmp16_0[4] = vec_mergeh(in[12], in[13]); + tmp16_0[5] = vec_mergel(in[12], in[13]); + tmp16_0[6] = vec_mergeh(in[14], in[15]); + tmp16_0[7] = vec_mergel(in[14], in[15]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[8]); + tmp1[1] = vec_add(tmp0[1], tmp0[9]); + 
tmp1[2] = vec_add(tmp0[2], tmp0[10]); + tmp1[3] = vec_add(tmp0[3], tmp0[11]); + tmp1[4] = vec_add(tmp0[4], tmp0[12]); + tmp1[5] = vec_add(tmp0[5], tmp0[13]); + tmp1[6] = vec_add(tmp0[6], tmp0[14]); + tmp1[7] = vec_add(tmp0[7], tmp0[15]); + tmp1[8] = vec_sub(tmp0[0], tmp0[8]); + tmp1[9] = vec_sub(tmp0[1], tmp0[9]); + tmp1[10] = vec_sub(tmp0[2], tmp0[10]); + tmp1[11] = vec_sub(tmp0[3], tmp0[11]); + tmp1[12] = vec_sub(tmp0[4], tmp0[12]); + tmp1[13] = vec_sub(tmp0[5], tmp0[13]); + tmp1[14] = vec_sub(tmp0[6], tmp0[14]); + tmp1[15] = vec_sub(tmp0[7], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + tmp16_0[0] = vec_add(in[0], in[4]); + tmp16_0[1] = vec_add(in[1], in[5]); + tmp16_0[2] = vec_add(in[2], in[6]); + tmp16_0[3] = vec_add(in[3], in[7]); + tmp16_0[4] = vec_sub(in[0], in[4]); + tmp16_0[5] = vec_sub(in[1], in[5]); + tmp16_0[6] = vec_sub(in[2], in[6]); + tmp16_0[7] = vec_sub(in[3], in[7]); + tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]); + tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]); + tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]); + tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]); + tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]); + tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]); + tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]); + tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 3 + in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]); + in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]); + in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]); + in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]); + in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]); + in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]); + in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]); + in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]); + + tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov); + tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov); + tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov); + tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov); + tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov); + tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov); + tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov); + tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov); + tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov); + tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov); + tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov); + tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov); + tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov); + tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov); + tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov); + tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[4]); + tmp1[1] = vec_add(tmp0[1], tmp0[5]); + tmp1[2] = vec_add(tmp0[2], tmp0[6]); + tmp1[3] = vec_add(tmp0[3], tmp0[7]); + tmp1[4] = vec_sub(tmp0[0], tmp0[4]); + tmp1[5] = vec_sub(tmp0[1], tmp0[5]); + tmp1[6] = vec_sub(tmp0[2], tmp0[6]); + tmp1[7] = vec_sub(tmp0[3], tmp0[7]); + tmp1[8] = vec_add(tmp0[8], tmp0[12]); + tmp1[9] = vec_add(tmp0[9], tmp0[13]); + tmp1[10] = vec_add(tmp0[10], tmp0[14]); + tmp1[11] = vec_add(tmp0[11], tmp0[15]); + tmp1[12] = vec_sub(tmp0[8], tmp0[12]); + tmp1[13] = vec_sub(tmp0[9], 
tmp0[13]); + tmp1[14] = vec_sub(tmp0[10], tmp0[14]); + tmp1[15] = vec_sub(tmp0[11], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + in[0] = vec_add(tmp16_0[0], tmp16_0[2]); + in[1] = vec_add(tmp16_0[1], tmp16_0[3]); + in[2] = vec_sub(tmp16_0[0], tmp16_0[2]); + in[3] = vec_sub(tmp16_0[1], tmp16_0[3]); + in[4] = vec_packs(tmp1[0], tmp1[1]); + in[5] = vec_packs(tmp1[2], tmp1[3]); + in[6] = vec_packs(tmp1[4], tmp1[5]); + in[7] = vec_packs(tmp1[6], tmp1[7]); + in[8] = vec_add(tmp16_0[8], tmp16_0[10]); + in[9] = vec_add(tmp16_0[9], tmp16_0[11]); + in[10] = vec_sub(tmp16_0[8], tmp16_0[10]); + in[11] = vec_sub(tmp16_0[9], tmp16_0[11]); + in[12] = vec_packs(tmp1[8], tmp1[9]); + in[13] = vec_packs(tmp1[10], tmp1[11]); + in[14] = vec_packs(tmp1[12], tmp1[13]); + in[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 4 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + out[4] = vec_mergeh(in[10], in[11]); + out[5] = vec_mergel(in[10], in[11]); + out[6] = vec_mergeh(in[14], in[15]); + out[7] = vec_mergel(in[14], in[15]); +} + +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[16], tmp1[16], tmp2[8]; + int32x4_t tmp3, tmp4; + int16x8_t zero16v = vec_splat_s16(0); + int32x4_t zerov = vec_splat_s32(0); + int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v); + int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12], + tmp0[13], tmp0[14], tmp0[15]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12], + tmp1[13], tmp1[14], tmp1[15]); + + iadst16x8_vsx(tmp0, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16); + + src0[0] = tmp0[0]; + src0[2] = vec_sub(zero16v, tmp0[8]); + src0[4] = tmp0[12]; + src0[6] = vec_sub(zero16v, tmp0[4]); + src1[8] = tmp0[5]; + src1[10] = vec_sub(zero16v, tmp0[13]); + src1[12] = tmp0[9]; + src1[14] = vec_sub(zero16v, tmp0[1]); + + iadst16x8_vsx(tmp1, tmp2); + 
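+  // Same final stage as for the even half above: IADST_WRAPLOW scales the
+  // middle outputs by +/-cospi16 (a 1/sqrt(2) rotation) while the remaining
+  // outputs are copied or sign-flipped, roughly matching the alternating
+  // negation at the tail of the scalar iadst16.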
IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16); + + src0[1] = tmp1[0]; + src0[3] = vec_sub(zero16v, tmp1[8]); + src0[5] = tmp1[12]; + src0[7] = vec_sub(zero16v, tmp1[4]); + src1[9] = tmp1[5]; + src1[11] = vec_sub(zero16v, tmp1[13]); + src1[13] = tmp1[9]; + src1[15] = vec_sub(zero16v, tmp1[1]); +} diff --git a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h new file mode 100644 index 0000000000..7031742c1c --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ +#define VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride); +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out); + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride); +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out); + +#define LOAD_INPUT16(load, source, offset, step, in) \ + in[0] = load(offset, source); \ + in[1] = load((step) + (offset), source); \ + in[2] = load(2 * (step) + (offset), source); \ + in[3] = load(3 * (step) + (offset), source); \ + in[4] = load(4 * (step) + (offset), source); \ + in[5] = load(5 * (step) + (offset), source); \ + in[6] = load(6 * (step) + (offset), source); \ + in[7] = load(7 * (step) + (offset), source); \ + in[8] = load(8 * (step) + (offset), source); \ + in[9] = load(9 * (step) + (offset), source); \ + in[10] = load(10 * (step) + (offset), source); \ + in[11] = load(11 * (step) + (offset), source); \ + in[12] = load(12 * (step) + (offset), source); \ + in[13] = load(13 * (step) + (offset), source); \ + in[14] = load(14 * (step) + (offset), source); \ + in[15] = load(15 * (step) + (offset), source); + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride); +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1); +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1); + +#endif // VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libs/libvpx/vpx_dsp/ppc/quantize_vsx.c new file mode 100644 index 0000000000..d85e63bd14 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/quantize_vsx.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+  return vec_xor(vec_add(a, mask), mask);
+}
+
+// Set each 32-bit integer to 1 when the corresponding value in a is
+// negative.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+  return vec_sr(a, vec_shift_sign_s32);
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and return the high 16 bits of the intermediate integers.
+// (a * b) >> 16
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds does ((A * B) >> 15) + C; we need >> 16, so we perform an extra
+  // right shift.
+  return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
+
+// Quantization function used for 4x4, 8x8 and 16x16 blocks.
+static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
+                                       int16x8_t round, int16x8_t quant,
+                                       int16x8_t quant_shift, bool16x8_t mask) {
+  const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+  int16x8_t qcoeff = vec_mulhi(rounded, quant);
+  qcoeff = vec_add(qcoeff, rounded);
+  qcoeff = vec_mulhi(qcoeff, quant_shift);
+  qcoeff = vec_sign(qcoeff, coeff);
+  return vec_and(qcoeff, mask);
+}
+
+// Quantization function used for 32x32 blocks.
+static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs,
+                                          int16x8_t round, int16x8_t quant,
+                                          int16x8_t quant_shift,
+                                          bool16x8_t mask) {
+  const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+  int16x8_t qcoeff = vec_mulhi(rounded, quant);
+  qcoeff = vec_add(qcoeff, rounded);
+  // 32x32 blocks require an extra multiplication by 2. This compensates for
+  // the extra right shift added in vec_mulhi, so vec_madds can be used
+  // directly instead of vec_mulhi:
+  //   (((a * b) >> 15) >> 1) << 1 == (a * b) >> 15
+  qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16);
+  qcoeff = vec_sign(qcoeff, coeff);
+  return vec_and(qcoeff, mask);
+}
+
+// Dequantization function used for 32x32 blocks. Quantized coeffs of 32x32
+// blocks are twice as big as for other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+                                            int16x8_t dequant) {
+  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+  // Add 1 if negative to round towards zero because the C code uses division.
+  dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe));
+  dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
+  dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
+  dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
+  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
+}
+
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+                                          const int16_t *iscan_ptr, int index) {
+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+  scan = vec_sub(scan, mask);
+  return vec_andc(scan, zero_coeff);
+}
+
+// Compare packed 16-bit integers across a, and return the maximum value in
+// every element. Returns a vector containing the biggest value across vector
+// a.
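+// The reduction takes log2(8) = 3 steps: the vector is max'd against copies
+// of itself rotated by 64, 32 and then 16 bits (the vec_perm64/32/16
+// constants from types_vsx.h), after which every 16-bit lane, in particular
+// lane 0 read by the callers, holds the overall maximum.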
+static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + // First set of 8 coeff starts with DC + 7 AC + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + qcoeff0 = + quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + quant_shift = vec_splat(quant_shift, 1); + qcoeff1 = + quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + + if (n_coeffs > 16) { + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2; + bool16x8_t zero_mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = + vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + 
off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} + +void vpx_quantize_b_32x32_vsx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + (void)scan_ptr; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + // 32x32 quantization requires that zbin and round be divided by 2 + zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); // remove DC from zbin + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + round = vec_splat(round, 1); // remove DC from round + quant = vec_splat(quant, 1); // remove DC from quant + quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); // remove DC from dequant + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; + bool16x8_t zero_mask2; + + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff0, 
zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + // 24 int16_t is 48 bytes + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} diff --git a/libs/libvpx/vpx_dsp/ppc/sad_vsx.c b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c index bb49addae1..a08ae12413 100644 --- a/libs/libvpx/vpx_dsp/ppc/sad_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -17,71 +17,75 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#define PROCESS16(offset) \ - v_a = vec_vsx_ld(offset, a); \ - v_b = vec_vsx_ld(offset, b); \ - v_ah = unpack_to_s16_h(v_a); \ - v_al = unpack_to_s16_l(v_a); \ - v_bh = unpack_to_s16_h(v_b); \ - v_bl = unpack_to_s16_l(v_b); \ - v_subh = vec_sub(v_ah, v_bh); \ - v_subl = vec_sub(v_al, v_bl); \ - v_absh = vec_abs(v_subh); \ - v_absl = vec_abs(v_subl); \ - v_sad = vec_sum4s(v_absh, v_sad); \ - v_sad = vec_sum4s(v_absl, v_sad); +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_abs = vec_absd(v_a, v_b); \ + v_sad = vec_sum4s(v_abs, v_sad); + +#define SAD8(height) \ + unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0) \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[1] + v_sad[0]; \ + } #define SAD16(height) \ unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } #define SAD32(height) \ unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ PROCESS16(16); \ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } #define SAD64(height) \ unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ PROCESS16(16); \ PROCESS16(32); \ @@ -89,12 +93,15 @@ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; 
\ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } +SAD8(4); +SAD8(8); +SAD8(16); SAD16(8); SAD16(16); SAD16(32); @@ -108,7 +115,7 @@ SAD64(64); unsigned int vpx_sad16x##height##_avg_vsx( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]); \ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ ref_stride); \ \ @@ -119,7 +126,7 @@ SAD64(64); unsigned int vpx_sad32x##height##_avg_vsx( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]); \ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ ref_stride); \ \ @@ -130,7 +137,7 @@ SAD64(64); unsigned int vpx_sad64x##height##_avg_vsx( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]); \ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ ref_stride); \ return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ diff --git a/libs/libvpx/vpx_dsp/ppc/subtract_vsx.c b/libs/libvpx/vpx_dsp/ppc/subtract_vsx.c new file mode 100644 index 0000000000..76ad302da6 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/subtract_vsx.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static VPX_FORCE_INLINE void subtract_block4x4(
+    int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
+    ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
+  int16_t *diff1 = diff + 2 * diff_stride;
+  const uint8_t *src1 = src + 2 * src_stride;
+  const uint8_t *pred1 = pred + 2 * pred_stride;
+
+  const int16x8_t d0 = vec_vsx_ld(0, diff);
+  const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
+  const int16x8_t d2 = vec_vsx_ld(0, diff1);
+  const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
+
+  const uint8x16_t s0 = read4x2(src, (int)src_stride);
+  const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
+  const uint8x16_t s1 = read4x2(src1, (int)src_stride);
+  const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
+
+  const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+  const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+
+  vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
+  vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
+  vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
+  vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
+}
+
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
+                            ptrdiff_t diff_stride, const uint8_t *src,
+                            ptrdiff_t src_stride, const uint8_t *pred,
+                            ptrdiff_t pred_stride) {
+  int r = rows, c;
+
+  switch (cols) {
+    case 64:
+    case 32:
+      do {
+        for (c = 0; c < cols; c += 32) {
+          const uint8x16_t s0 = vec_vsx_ld(0, src + c);
+          const uint8x16_t s1 = vec_vsx_ld(16, src + c);
+          const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
+          const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
+          const int16x8_t d0l =
+              vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+          const int16x8_t d0h =
+              vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+          const int16x8_t d1l =
+              vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
+          const int16x8_t d1h =
+              vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+          vec_vsx_st(d0h, 0, diff + c);
+          vec_vsx_st(d0l, 16, diff + c);
+          vec_vsx_st(d1h, 0, diff + c + 16);
+          vec_vsx_st(d1l, 16, diff + c + 16);
+        }
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 16:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        vec_vsx_st(d0l, 16, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 8:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 4:
+      subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
+      if (r > 4) {
+        diff += 4 * diff_stride;
+        pred += 4 * pred_stride;
+        src += 4 * src_stride;
+
+        subtract_block4x4(diff, diff_stride, src, src_stride, pred,
+                          pred_stride);
+      }
+      break;
+    default: assert(0);  // unreachable
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h
index f02556d522..4883b734ad 100644
--- a/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h
+++ b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_ -#define VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#ifndef VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#define VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ #include "./vpx_config.h" #include "vpx_dsp/ppc/types_vsx.h" @@ -98,4 +98,36 @@ static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) { // v[7]: 07 17 27 37 47 57 67 77 } -#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_ +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Stage 1 + const int16x8_t s1_0 = vec_mergeh(a[0], a[4]); + const int16x8_t s1_1 = vec_mergel(a[0], a[4]); + const int16x8_t s1_2 = vec_mergeh(a[1], a[5]); + const int16x8_t s1_3 = vec_mergel(a[1], a[5]); + const int16x8_t s1_4 = vec_mergeh(a[2], a[6]); + const int16x8_t s1_5 = vec_mergel(a[2], a[6]); + const int16x8_t s1_6 = vec_mergeh(a[3], a[7]); + const int16x8_t s1_7 = vec_mergel(a[3], a[7]); + + // Stage 2 + const int16x8_t s2_0 = vec_mergeh(s1_0, s1_4); + const int16x8_t s2_1 = vec_mergel(s1_0, s1_4); + const int16x8_t s2_2 = vec_mergeh(s1_1, s1_5); + const int16x8_t s2_3 = vec_mergel(s1_1, s1_5); + const int16x8_t s2_4 = vec_mergeh(s1_2, s1_6); + const int16x8_t s2_5 = vec_mergel(s1_2, s1_6); + const int16x8_t s2_6 = vec_mergeh(s1_3, s1_7); + const int16x8_t s2_7 = vec_mergel(s1_3, s1_7); + + // Stage 2 + b[0] = vec_mergeh(s2_0, s2_4); + b[1] = vec_mergel(s2_0, s2_4); + b[2] = vec_mergeh(s2_1, s2_5); + b[3] = vec_mergel(s2_1, s2_5); + b[4] = vec_mergeh(s2_2, s2_6); + b[5] = vec_mergel(s2_2, s2_6); + b[6] = vec_mergeh(s2_3, s2_7); + b[7] = vec_mergel(s2_3, s2_7); +} + +#endif // VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h b/libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h new file mode 100644 index 0000000000..2907a1fe40 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ +#define VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +static const int32x4_t vec_dct_const_rounding = { 8192, 8192, 8192, 8192 }; + +static const uint32x4_t vec_dct_const_bits = { 14, 14, 14, 14 }; + +static const uint16x8_t vec_dct_scale_log2 = { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 
804, 804, 804, 804, 804, 804 };
+
+#endif  // VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
diff --git a/libs/libvpx/vpx_dsp/ppc/types_vsx.h b/libs/libvpx/vpx_dsp/ppc/types_vsx.h
index f611d02d2d..b891169245 100644
--- a/libs/libvpx/vpx_dsp/ppc/types_vsx.h
+++ b/libs/libvpx/vpx_dsp/ppc/types_vsx.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef VPX_DSP_PPC_TYPES_VSX_H_
-#define VPX_DSP_PPC_TYPES_VSX_H_
+#ifndef VPX_VPX_DSP_PPC_TYPES_VSX_H_
+#define VPX_VPX_DSP_PPC_TYPES_VSX_H_

 #include <altivec.h>

@@ -19,8 +19,11 @@ typedef vector signed short int16x8_t;
 typedef vector unsigned short uint16x8_t;
 typedef vector signed int int32x4_t;
 typedef vector unsigned int uint32x4_t;
+typedef vector bool char bool8x16_t;
+typedef vector bool short bool16x8_t;
+typedef vector bool int bool32x4_t;

-#ifdef __clang__
+#if defined(__clang__) && __clang_major__ < 6
 static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
                                            0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                            0x14, 0x15, 0x16, 0x17 };
@@ -61,8 +64,45 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
 #define unpack_to_s16_l(v) \
   (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
 #ifndef xxpermdi
-#define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3)
+#define xxpermdi(a, b, c) vec_xxpermdi(b, a, (((c) >> 1) | ((c)&1) << 1) ^ 3)
 #endif
 #endif

-#endif  // VPX_DSP_PPC_TYPES_VSX_H_
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
+  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+  return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
+#ifndef __POWER9_VECTOR__
+#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
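+// Note: on POWER9, vec_absd() maps to the vabsdu[bhw] absolute-difference
+// instructions; the fallback above relies on the unsigned identity
+//   |a - b| == max(a, b) - min(a, b)
+// which is what the SAD macros (PROCESS16) accumulate with vec_sum4s().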
+
+static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                         0, 0, 0, 0, 0, 0, 0, 0 };
+static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const int16x8_t vec_twos_s16 = { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 };
+static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 };
+static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 };
+static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 };
+static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 };
+static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                       0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03,
+                                       0x04, 0x05, 0x06, 0x07 };
+static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+                                       0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                       0x00, 0x01, 0x02, 0x03 };
+static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                       0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D,
+                                       0x0E, 0x0F, 0x00, 0x01 };
+
+static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11,
+                                                   0x04, 0x05, 0x14, 0x15,
+                                                   0x08, 0x09, 0x18, 0x19,
+                                                   0x0C, 0x0D, 0x1C, 0x1D };
+
+#endif  // VPX_VPX_DSP_PPC_TYPES_VSX_H_
diff --git a/libs/libvpx/vpx_dsp/ppc/variance_vsx.c b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c
index 1efe2f0056..be9614a358 100644
--- a/libs/libvpx/vpx_dsp/ppc/variance_vsx.c
+++ b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c
@@ -10,24 +10,20 @@
 #include <assert.h>
+
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ppc/types_vsx.h"

-static inline uint8x16_t read4x2(const uint8_t *a, int stride) {
-  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
-  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
-
-  return (uint8x16_t)vec_mergeh(a0, a1);
-}
-
-uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride) {
+uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   int distortion;

-  const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride));
-  const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride));
-  const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride));
-  const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride));
+  const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+  const int16x8_t a1 =
+      unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
+  const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+  const int16x8_t b1 =
+      unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
   const int16x8_t d0 = vec_sub(a0, b0);
   const int16x8_t d1 = vec_sub(a1, b1);
   const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
@@ -39,12 +35,12 @@ uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b,
 }

 // TODO(lu_zero): Unroll
-uint32_t vpx_get_mb_ss_vsx(const int16_t *a) {
+uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
   int32x4_t s = vec_splat_s32(0);

   for (i = 0; i < 256; i += 8) {
-    const int16x8_t v = vec_vsx_ld(0, a + i);
+    const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
     s = vec_msum(v, v, s);
   }

@@ -101,3 +97,175 @@ void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
     }
   }
 }
+
+static INLINE void variance_inner_32(const uint8_t *src_ptr,
+                                     const uint8_t *ref_ptr,
+                                     int32x4_t *sum_squared, int32x4_t *sum) {
+  int32x4_t s = *sum;
+  int32x4_t ss = *sum_squared;
+
+  const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
+  const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
+  const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
+  const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);
+
+  const int16x8_t a0 = unpack_to_s16_h(va0);
+  const int16x8_t b0 = unpack_to_s16_h(vb0);
+  const int16x8_t a1 = unpack_to_s16_l(va0);
+  const int16x8_t b1 = unpack_to_s16_l(vb0);
+  const int16x8_t a2 = unpack_to_s16_h(va1);
+  const int16x8_t b2 = unpack_to_s16_h(vb1);
+  const int16x8_t a3 = unpack_to_s16_l(va1);
+  const int16x8_t b3 = unpack_to_s16_l(vb1);
+  const int16x8_t d0 = vec_sub(a0, b0);
+  const int16x8_t d1 = vec_sub(a1, b1);
+  const int16x8_t d2 = vec_sub(a2, b2);
+  const int16x8_t d3 = vec_sub(a3, b3);
+
+  s = vec_sum4s(d0, s);
+  ss = vec_msum(d0, d0, ss);
+  s = vec_sum4s(d1, s);
+  ss = vec_msum(d1, d1, ss);
+  s = vec_sum4s(d2, s);
+  ss = vec_msum(d2, d2, ss);
+  s = vec_sum4s(d3, s);
+  ss = vec_msum(d3, d3, ss);
+  *sum = s;
+  *sum_squared = ss;
+}
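+// Accumulation scheme shared by every block width below: vec_sum4s() folds
+// the signed pixel differences into a running sum while vec_msum() folds
+// their squares into a running sum of squares, so a single pass yields both
+// values that variance() reports through *sum and *sse.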
vec_sub(a0, b0); + + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 16: + for (i = 0; i < h; ++i) { + const uint8x16_t va = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb = vec_vsx_ld(0, ref_ptr); + const int16x8_t a0 = unpack_to_s16_h(va); + const int16x8_t b0 = unpack_to_s16_h(vb); + const int16x8_t a1 = unpack_to_s16_l(va); + const int16x8_t b1 = unpack_to_s16_l(vb); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 32: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 64: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste(s, 0, sum); + + ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)ss, 0, sse); +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in + * variable.
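For reference, this is the scalar quantity the vec_sum4s/vec_msum pipeline above accumulates; a minimal C sketch, mirroring the C fallback in vpx_dsp/variance.c (variance_ref is a hypothetical name, not part of the patch):

#include <stdint.h>

static void variance_ref(const uint8_t *src, int src_stride,
                         const uint8_t *ref, int ref_stride, int w, int h,
                         uint32_t *sse, int *sum) {
  int i, j;
  *sum = 0;
  *sse = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;        /* what vec_sum4s folds, four lanes at a time */
      *sse += diff * diff; /* what vec_msum folds, four lanes at a time */
    }
    src += src_stride;
    ref += ref_stride;
  }
}

The trailing vec_sums/vec_splat/vec_ste sequence in the VSX version is just the horizontal reduction of the four partial lanes into *sum and *sse.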
+ */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H))); \ + } + +#define VARIANCES(W, H) VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) diff --git a/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c index 5c3ba4576f..2dc66055cc 100644 --- a/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -9,13 +9,16 @@ */ #include #include + #include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_filter.h" +#include "vpx/vpx_integer.h" #include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/vpx_filter.h" // TODO(lu_zero): unroll -static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -25,8 +28,9 @@ static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -37,8 +41,9 @@ static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -86,8 +91,9 @@ void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -98,8 +104,9 @@ static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -112,8 +119,9 @@ static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for 
(i = h; i--;) { @@ -155,8 +163,8 @@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve_line(uint8_t *dst, const int16x8_t s, - const int16x8_t f) { +static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); const int32x4_t bias = vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); @@ -166,8 +174,9 @@ static inline void convolve_line(uint8_t *dst, const int16x8_t s, vec_ste(v, 0, dst); } -static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, - const int16_t *const x_filter) { +static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst, + const uint8_t *const src_x, + const int16_t *const x_filter) { const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); const int16x8_t f = vec_vsx_ld(0, x_filter); @@ -175,10 +184,12 @@ static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, } // TODO(lu_zero): Implement 8x8 and bigger block special cases -static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; @@ -194,10 +205,10 @@ static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_avg_horiz( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; @@ -230,9 +241,10 @@ static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, return (uint8x16_t)vec_mergeh(abcd, efgh); } -static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, - ptrdiff_t src_stride, - const int16_t *const y_filter) { +static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst, + const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); @@ -250,10 +262,12 @@ static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, convolve_line(dst, unpack_to_s16_h(s), f); } -static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, + int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -270,10 +284,10 @@ static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_avg_vert( + 
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -291,11 +305,11 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { +static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const filter, + int x0_q4, int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. diff --git a/libs/libvpx/vpx_dsp/prob.h b/libs/libvpx/vpx_dsp/prob.h index f1cc0eaa10..7a71c0041f 100644 --- a/libs/libvpx/vpx_dsp/prob.h +++ b/libs/libvpx/vpx_dsp/prob.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PROB_H_ -#define VPX_DSP_PROB_H_ +#ifndef VPX_VPX_DSP_PROB_H_ +#define VPX_VPX_DSP_PROB_H_ #include @@ -32,7 +32,7 @@ typedef int8_t vpx_tree_index; #define TREE_SIZE(leaf_count) (2 * (leaf_count)-2) -#define vpx_complement(x) (255 - x) +#define vpx_complement(x) (255 - (x)) #define MODE_MV_COUNT_SAT 20 @@ -103,4 +103,4 @@ DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); } // extern "C" #endif -#endif // VPX_DSP_PROB_H_ +#endif // VPX_VPX_DSP_PROB_H_ diff --git a/libs/libvpx/vpx_dsp/psnr.c b/libs/libvpx/vpx_dsp/psnr.c index 47afd4388a..48bac04508 100644 --- a/libs/libvpx/vpx_dsp/psnr.c +++ b/libs/libvpx/vpx_dsp/psnr.c @@ -1,12 +1,12 @@ /* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. -* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ #include #include @@ -24,8 +24,8 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { } /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() -* and highbd_8_variance(). It should not. -*/ + * and highbd_8_variance(). It should not. + */ static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) { diff --git a/libs/libvpx/vpx_dsp/psnr.h b/libs/libvpx/vpx_dsp/psnr.h index f321131d0b..a5563557e9 100644 --- a/libs/libvpx/vpx_dsp/psnr.h +++ b/libs/libvpx/vpx_dsp/psnr.h @@ -1,15 +1,15 @@ /* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. -* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. 
An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ -#ifndef VPX_DSP_PSNR_H_ -#define VPX_DSP_PSNR_H_ +#ifndef VPX_VPX_DSP_PSNR_H_ +#define VPX_VPX_DSP_PSNR_H_ #include "vpx_scale/yv12config.h" @@ -28,13 +28,13 @@ typedef struct { // TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t /*!\brief Converts SSE to PSNR -* -* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). -* -* \param[in] samples Number of samples -* \param[in] peak Max sample value -* \param[in] sse Sum of squared errors -*/ + * + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ double vpx_sse_to_psnr(double samples, double peak, double sse); int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH @@ -54,4 +54,4 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_PSNR_H_ +#endif // VPX_VPX_DSP_PSNR_H_ diff --git a/libs/libvpx/vpx_dsp/psnrhvs.c b/libs/libvpx/vpx_dsp/psnrhvs.c index b3910152c4..d7ec1a429a 100644 --- a/libs/libvpx/vpx_dsp/psnrhvs.c +++ b/libs/libvpx/vpx_dsp/psnrhvs.c @@ -126,8 +126,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, const uint8_t *_dst8 = dst; const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); - int16_t dct_s[8 * 8], dct_d[8 * 8]; - tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); double mask[8][8]; int pixels; int x; @@ -142,7 +144,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, been normalized and then squared." Their CSF matrix (from PSNR-HVS) was also constructed from the JPEG matrices. I can not find any obvious scheme of normalizing to produce their table, but if I multiply their - CSF by 0.38857 and square the result I get their masking table. + CSF by 0.3885746225901003 and square the result I get their masking table. I have no idea where this constant comes from, but deviating from it too greatly hurts MOS agreement. @@ -150,11 +152,15 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking of DCT basis functions", CD-ROM Proceedings of the Third International Workshop on Video Processing and Quality Metrics for Consumer - Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue #2363: + 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) + of the old JPEG based matrix from the paper.
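The relationship the new comment describes can be checked directly: 0.3885746225901003 * 2.573509 is 1 to about seven digits (2.573509 is itself rounded), so squaring csf * k and squaring csf / max_coef yield the same mask up to rounding. A standalone check, not part of the patch:

#include <assert.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  const double max_coef = 2.573509;    /* largest coefficient of the CSF table */
  const double k = 0.3885746225901003; /* constant used by the old code */
  const double csf = 1.5;              /* any sample coefficient */
  const double old_mask = (csf * k) * (csf * k);
  const double new_mask = (csf / max_coef) * (csf / max_coef);
  /* k is approximately 1/max_coef; allow a small tolerance because the
   * published maximum coefficient is rounded to six decimals. */
  assert(fabs(k * max_coef - 1.0) < 1e-6);
  assert(fabs(old_mask - new_mask) / new_mask < 1e-5);
  printf("old=%.12f new=%.12f\n", old_mask, new_mask);
  return 0;
}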
Since you are not using that, + divide by actual maximum coefficient. */ for (x = 0; x < 8; x++) for (y = 0; y < 8; y++) - mask[x][y] = - (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); for (y = 0; y < _h - 7; y += _step) { for (x = 0; x < _w - 7; x += _step) { int i; diff --git a/libs/libvpx/vpx_dsp/quantize.c b/libs/libvpx/vpx_dsp/quantize.c index e37ca92ad4..0e6a0b83fa 100644 --- a/libs/libvpx/vpx_dsp/quantize.c +++ b/libs/libvpx/vpx_dsp/quantize.c @@ -12,12 +12,13 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" +#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int rc = 0; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); @@ -31,7 +32,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; if (tmp) eob = 0; } *eob_ptr = eob + 1; @@ -41,7 +42,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { int eob = -1; @@ -55,7 +56,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int64_t tmp = abs_coeff + round_ptr[0]; const int abs_qcoeff = (int)((tmp * quant) >> 16); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -65,7 +66,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -81,7 +82,7 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; if (tmp) eob = 0; } *eob_ptr = eob + 1; @@ -92,8 +93,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; int eob = -1; @@ -107,7 +107,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); const int abs_qcoeff = (int)((tmp * quant) >> 15); qcoeff_ptr[0] = 
(tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -260,7 +260,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; +#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH + // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than + // truncating with a cast, saturate the value. This is easier to implement + // on x86 and preserves the sign of the value. + dqcoeff_ptr[rc] = + clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX); +#else dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; +#endif // ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH if (tmp) eob = idx_arr[i]; } diff --git a/libs/libvpx/vpx_dsp/quantize.h b/libs/libvpx/vpx_dsp/quantize.h index e132845463..7cac140e9d 100644 --- a/libs/libvpx/vpx_dsp/quantize.h +++ b/libs/libvpx/vpx_dsp/quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_QUANTIZE_H_ -#define VPX_DSP_QUANTIZE_H_ +#ifndef VPX_VPX_DSP_QUANTIZE_H_ +#define VPX_VPX_DSP_QUANTIZE_H_ #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -19,30 +19,29 @@ extern "C" { #endif void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, - const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #endif #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_QUANTIZE_H_ +#endif // VPX_VPX_DSP_QUANTIZE_H_ diff --git a/libs/libvpx/vpx_dsp/sad.c b/libs/libvpx/vpx_dsp/sad.c index 18b6dc6e09..873ddca093 100644 --- a/libs/libvpx/vpx_dsp/sad.c +++ b/libs/libvpx/vpx_dsp/sad.c @@ -17,54 +17,55 @@ #include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. 
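The clamp introduced above matters because tran_low_t is only 16 bits wide in non-high-bitdepth builds, so qcoeff * dequant / 2 can outrange it for 32x32 blocks; saturating preserves the sign where a plain cast would wrap. A standalone sketch of the difference (values are illustrative; clamp mirrors the helper in vpx_dsp/vpx_dsp_common.h):

#include <stdint.h>
#include <stdio.h>

static int clamp(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

int main(void) {
  const int16_t qcoeff = 32000; /* large quantized coefficient */
  const int16_t dequant = 4;    /* illustrative dequantizer step */
  const int product = qcoeff * dequant / 2; /* 64000: outside int16_t range */
  /* A plain cast typically wraps on two's-complement targets... */
  const int16_t truncated = (int16_t)product;
  /* ...while saturation pins the value at INT16_MAX with the right sign. */
  const int16_t saturated = (int16_t)clamp(product, INT16_MIN, INT16_MAX);
  printf("truncated=%d saturated=%d\n", truncated, saturated);
  return 0;
}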
*/ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { +static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return sad; } -#define sadMxN(m, n) \ - unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ - vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } // depending on call sites, pass **ref_array to avoid & in subsequent call and // de-dup with 4D below. -#define sadMxNxK(m, n, k) \ - void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ +#define sadMxNxK(m, n, k) \ + void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \ } // This appears to be equivalent to the above when k == 4 and refs is const -#define sadMxNx4D(m, n) \ - void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ } /* clang-format off */ @@ -133,59 +134,61 @@ sadMxNx4D(4, 4) #if CONFIG_VP9_HIGHBITDEPTH static INLINE - unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { + unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int width, + int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const 
uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } -static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, - const uint16_t *b, int b_stride, +static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, int width, int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } #define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, \ - int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + unsigned int vpx_highbd_sad##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ } \ unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ - vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ + n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ + return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/libs/libvpx/vpx_dsp/skin_detection.h b/libs/libvpx/vpx_dsp/skin_detection.h index a2e99baf7e..91640c33d5 100644 --- a/libs/libvpx/vpx_dsp/skin_detection.h +++ b/libs/libvpx/vpx_dsp/skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
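As a reminder of the contract the high-bitdepth SAD wrappers above implement, here is a minimal scalar sketch (sad_ref is a hypothetical name; the real entry points are generated by the highbd_sadMxN macros):

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a width x height block of 16-bit
 * samples; the _avg variants first average ref with second_pred via
 * vpx_highbd_comp_avg_pred_c and then run the same loop. */
static unsigned int sad_ref(const uint16_t *src, int src_stride,
                            const uint16_t *ref, int ref_stride,
                            int width, int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) sad += abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}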
*/ -#ifndef VPX_DSP_SKIN_DETECTION_H_ -#define VPX_DSP_SKIN_DETECTION_H_ +#ifndef VPX_VPX_DSP_SKIN_DETECTION_H_ +#define VPX_VPX_DSP_SKIN_DETECTION_H_ #ifdef __cplusplus extern "C" { @@ -21,4 +21,4 @@ int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); } // extern "C" #endif -#endif // VPX_DSP_SKIN_DETECTION_H_ +#endif // VPX_VPX_DSP_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vpx_dsp/ssim.c b/libs/libvpx/vpx_dsp/ssim.c index 7a29bd29f9..7c3c31bad8 100644 --- a/libs/libvpx/vpx_dsp/ssim.c +++ b/libs/libvpx/vpx_dsp/ssim.c @@ -73,7 +73,7 @@ static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { - int64_t ssim_n, ssim_d; + double ssim_n, ssim_d; int64_t c1, c2; if (bd == 8) { // scale the constants by number of pixels @@ -90,14 +90,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); - return ssim_n * 1.0 / ssim_d; + return ssim_n / ssim_d; } static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { @@ -284,7 +284,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0 }; + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; double ssim; double ssim2; double dssim; diff --git a/libs/libvpx/vpx_dsp/ssim.h b/libs/libvpx/vpx_dsp/ssim.h index 4f2bb1d556..c382237fc6 100644 --- a/libs/libvpx/vpx_dsp/ssim.h +++ b/libs/libvpx/vpx_dsp/ssim.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
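The int64_t-to-double change in similarity() above is an overflow fix: with 12-bit input, an 8x8 window's sums reach 64 * 4095, and the denominator multiplies two terms of order sum squared, which overruns int64_t. A back-of-the-envelope bound check (standalone, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* 8x8 window, 12-bit samples: per-window sums can reach 64 * 4095. */
  const double max_sum = 64.0 * 4095.0;               /* ~2.6e5 */
  const double max_sq_term = 2.0 * max_sum * max_sum; /* sum_s^2 + sum_r^2 */
  /* ssim_d multiplies two terms of this magnitude together... */
  const double worst_d = max_sq_term * max_sq_term;   /* ~1.9e22 */
  /* ...which exceeds INT64_MAX (~9.2e18), hence the move to double. */
  printf("worst-case denominator ~%.3g, INT64_MAX ~%.3g\n", worst_d,
         (double)INT64_MAX);
  return 0;
}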
*/ -#ifndef VPX_DSP_SSIM_H_ -#define VPX_DSP_SSIM_H_ +#ifndef VPX_VPX_DSP_SSIM_H_ +#define VPX_VPX_DSP_SSIM_H_ #define MAX_SSIM_DB 100.0; @@ -84,4 +84,4 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, } // extern "C" #endif -#endif // VPX_DSP_SSIM_H_ +#endif // VPX_VPX_DSP_SSIM_H_ diff --git a/libs/libvpx/vpx_dsp/subtract.c b/libs/libvpx/vpx_dsp/subtract.c index 95e7071b27..45c819e67a 100644 --- a/libs/libvpx/vpx_dsp/subtract.c +++ b/libs/libvpx/vpx_dsp/subtract.c @@ -16,37 +16,37 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src, - ptrdiff_t src_stride, const uint8_t *pred, +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - diff += diff_stride; - pred += pred_stride; - src += src_stride; + diff_ptr += diff_stride; + pred_ptr += pred_stride; + src_ptr += src_stride; } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src8, - ptrdiff_t src_stride, const uint8_t *pred8, +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src8_ptr, + ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd) { int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); (void)bd; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; + diff_ptr[c] = src[c] - pred[c]; } - diff += diff_stride; + diff_ptr += diff_stride; pred += pred_stride; src += src_stride; } diff --git a/libs/libvpx/vpx_dsp/sum_squares.c b/libs/libvpx/vpx_dsp/sum_squares.c index 7c535ac2db..b80cd588e4 100644 --- a/libs/libvpx/vpx_dsp/sum_squares.c +++ b/libs/libvpx/vpx_dsp/sum_squares.c @@ -10,8 +10,7 @@ #include "./vpx_dsp_rtcd.h" -uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) { int r, c; uint64_t ss = 0; @@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, const int16_t v = src[c]; ss += v * v; } - src += src_stride; + src += stride; } return ss; diff --git a/libs/libvpx/vpx_dsp/txfm_common.h b/libs/libvpx/vpx_dsp/txfm_common.h index d01d7085a2..25f4fdb327 100644 --- a/libs/libvpx/vpx_dsp/txfm_common.h +++ b/libs/libvpx/vpx_dsp/txfm_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
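For context, the renamed vpx_subtract_block_c above computes the prediction residual, diff = src - pred, element by element; a minimal usage sketch (the 4x4 wrapper is hypothetical):

#include <stddef.h>
#include <stdint.h>

void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                          ptrdiff_t pred_stride);

static void residual_4x4_example(const uint8_t src[16], const uint8_t pred[16],
                                 int16_t diff[16]) {
  /* A 4x4 block stored contiguously, so every stride is 4. */
  vpx_subtract_block_c(4, 4, diff, 4, src, 4, pred, 4);
}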
*/ -#ifndef VPX_DSP_TXFM_COMMON_H_ -#define VPX_DSP_TXFM_COMMON_H_ +#ifndef VPX_VPX_DSP_TXFM_COMMON_H_ +#define VPX_VPX_DSP_TXFM_COMMON_H_ #include "vpx_dsp/vpx_dsp_common.h" @@ -63,4 +63,4 @@ static const tran_coef_t sinpi_2_9 = 9929; static const tran_coef_t sinpi_3_9 = 13377; static const tran_coef_t sinpi_4_9 = 15212; -#endif // VPX_DSP_TXFM_COMMON_H_ +#endif // VPX_VPX_DSP_TXFM_COMMON_H_ diff --git a/libs/libvpx/vpx_dsp/variance.c b/libs/libvpx/vpx_dsp/variance.c index 93bd8f30de..30b55dcb40 100644 --- a/libs/libvpx/vpx_dsp/variance.c +++ b/libs/libvpx/vpx_dsp/variance.c @@ -21,36 +21,37 @@ static const uint8_t bilinear_filters[8][2] = { { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; -uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { +uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int distortion = 0; int r, c; for (r = 0; r < 4; ++r) { for (c = 0; c < 4; ++c) { - int diff = a[c] - b[c]; + int diff = src_ptr[c] - ref_ptr[c]; distortion += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return distortion; } -uint32_t vpx_get_mb_ss_c(const int16_t *a) { +uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; for (i = 0; i < 256; ++i) { - sum += a[i] * a[i]; + sum += src_ptr[i] * src_ptr[i]; } return sum; } -static void variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, int *sum) { +static void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { int i, j; *sum = 0; @@ -58,13 +59,13 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } @@ -76,24 +77,23 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -106,91 +106,90 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). 
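Concretely, each pass described above blends two neighbouring samples with a pair of taps that sum to FILTER_WEIGHT (128) and rounds by FILTER_BITS; a scalar sketch of one output sample (bilinear_tap is a hypothetical helper; ROUND_POWER_OF_TWO mirrors the libvpx macro):

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* One bilinear tap pair: pixel_step is 1 for the horizontal pass and the
 * row stride for the vertical pass. */
static uint16_t bilinear_tap(const uint8_t *a, int pixel_step,
                             const uint8_t *filter) {
  return ROUND_POWER_OF_TWO(
      (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
}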
It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -#define VAR(W, H) \ - uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse); \ } -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int 
y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse); \ } /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ - void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ -#define MSE(W, H) \ - uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ } /* All three forms of the variance are available in the same sizes.
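To make the token pasting above concrete, VAR(16, 16) expands, modulo whitespace, to roughly the following; variance() here is the file-scope helper defined earlier in variance.c:

#include <stdint.h>

static void variance(const uint8_t *src_ptr, int src_stride,
                     const uint8_t *ref_ptr, int ref_stride, int w, int h,
                     uint32_t *sse, int *sum);

uint32_t vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             uint32_t *sse) {
  int sum;
  variance(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, &sum);
  /* var = SSE - (sum of differences)^2 / pixel count */
  return *sse - (uint32_t)(((int64_t)sum * sum) / (16 * 16));
}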
*/ @@ -237,128 +236,140 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, int64_t *sum) { +static void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint64_t *sse, int64_t *sum) { int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); *sum = 0; *sse = 0; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)sse_long; *sum = (int)sum_long; } -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } -static void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ } -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t 
vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ } static void highbd_var_filter_block2d_bil_first_pass( @@ -403,111 +414,111 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return 
vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return 
vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -538,12 +549,10 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, +void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, int ref_stride) { int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; diff --git a/libs/libvpx/vpx_dsp/variance.h b/libs/libvpx/vpx_dsp/variance.h index 100573299b..6d0e1b8a6b 100644 --- a/libs/libvpx/vpx_dsp/variance.h +++ b/libs/libvpx/vpx_dsp/variance.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_VARIANCE_H_ -#define VPX_DSP_VARIANCE_H_ +#ifndef VPX_VPX_DSP_VARIANCE_H_ +#define VPX_VPX_DSP_VARIANCE_H_ #include "./vpx_config.h" @@ -22,37 +22,38 @@ extern "C" { #define FILTER_BITS 7 #define FILTER_WEIGHT 128 -typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b_ptr, int b_stride); +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); -typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, - int b_stride, int n); +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride, + uint8_t *ref_ptr, int ref_stride, int n); -typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, +typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride, const uint8_t *const b_array[], - int b_stride, unsigned int *sad_array); + int ref_stride, unsigned int *sad_array); -typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, unsigned int *sse); -typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_subpixvariance_fn_t)( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); typedef unsigned int (*vpx_subp_avg_variance_fn_t)( - const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset, - const uint8_t *b_ptr, int b_stride, unsigned int *sse, + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + #if CONFIG_VP8 typedef struct variance_vtable { vpx_sad_fn_t sdf; @@ -82,4 +83,4 @@ typedef struct vp9_variance_vtable { } // extern "C" #endif -#endif // VPX_DSP_VARIANCE_H_ +#endif // VPX_VPX_DSP_VARIANCE_H_ diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.h b/libs/libvpx/vpx_dsp/vpx_convolve.h index 7979268a95..d5793e17ad 100644 --- a/libs/libvpx/vpx_dsp/vpx_convolve.h +++ b/libs/libvpx/vpx_dsp/vpx_convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_VPX_CONVOLVE_H_ -#define VPX_DSP_VPX_CONVOLVE_H_ +#ifndef VPX_VPX_DSP_VPX_CONVOLVE_H_ +#define VPX_VPX_DSP_VPX_CONVOLVE_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // VPX_DSP_VPX_CONVOLVE_H_ +#endif // VPX_VPX_DSP_VPX_CONVOLVE_H_ diff --git a/libs/libvpx/vpx_dsp/vpx_dsp.mk b/libs/libvpx/vpx_dsp/vpx_dsp.mk index 3b1a873cd2..f013fa5922 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libs/libvpx/vpx_dsp/vpx_dsp.mk @@ -47,13 +47,11 @@ endif # intra predictions DSP_SRCS-yes += intrapred.c -DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c @@ -69,6 +67,8 @@ DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c +DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) @@ -81,16 +81,19 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c +DSP_SRCS-yes += vpx_filter.h +ifeq ($(CONFIG_VP9),yes) # interpolation filters DSP_SRCS-yes += vpx_convolve.c DSP_SRCS-yes += vpx_convolve.h -DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c + +DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_4t_intrin_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm @@ -111,9 +114,17 @@ DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) -DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM) -DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM) DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h DSP_SRCS-yes += arm/vpx_convolve_neon.c else ifeq ($(HAVE_NEON),yes) @@ -134,6 +145,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c 
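The vpx_highbd_comp_avg_pred hunk above changes the helper to take uint16_t pointers directly, so the CONVERT_TO_BYTEPTR/CONVERT_TO_SHORTPTR casts move out of the helper and into callers such as the HIGHBD_SUBPIX_AVG_VAR macro. For orientation, a minimal sketch of what the helper computes, assuming the usual libvpx rounded-average semantics (ROUND_POWER_OF_TWO(pred + ref, 1)) and the packed W-wide layout the macros use for their scratch blocks; the name below is illustrative, not part of the patch:

#include <stdint.h>

/* Sketch: rounded average of two high-bitdepth predictors. comp_pred and
 * pred are packed width-wide blocks; only the reference has a real stride. */
static void highbd_comp_avg_pred_sketch(uint16_t *comp_pred,
                                        const uint16_t *pred, int width,
                                        int height, const uint16_t *ref,
                                        int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = (uint16_t)((tmp + 1) >> 1); /* round-to-nearest average */
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

Pushing the pointer conversion to the call sites lets SIMD versions of the averaging helper operate on plain uint16_t buffers without re-deriving them from CONVERT_TO_BYTEPTR aliases.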
DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h +DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c # common (dspr2) DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h @@ -153,8 +165,8 @@ DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c # loop filters DSP_SRCS-yes += loopfilter.c -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) @@ -180,6 +192,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_VP9 DSP_SRCS-yes += txfm_common.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h @@ -204,7 +217,12 @@ DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c endif # CONFIG_VP9_ENCODER # inverse transform @@ -242,6 +260,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c @@ -279,11 +298,13 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h -DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c +DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif @@ -310,6 +331,7 @@ ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c @@ -330,13 +352,12 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c -DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm -DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c +DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm @@ -358,7 +379,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c -DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c @@ -368,7 +388,6 @@ ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm 
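The HIGHBD_SUBPIX_VAR macros earlier in this patch (and the subpel_variance_sse2.asm entries below) all implement the same two-pass bilinear scheme: a horizontal pass over H + 1 rows into a 16-bit scratch block, a vertical pass over that block, then a plain variance against the reference. A low-bitdepth sketch of the idea, using the FILTER_BITS 7 / FILTER_WEIGHT 128 convention from variance.h; function names are illustrative:

#include <stdint.h>

#define BIL_ROUND(x) (((x) + (1 << 6)) >> 7) /* ROUND_POWER_OF_TWO(x, FILTER_BITS) */

/* Pass 1: horizontal bilinear filter. Produces h + 1 rows so the vertical
 * pass below has one row of lookahead. filter[0] + filter[1] == 128. */
static void bil_first_pass(const uint8_t *src, uint16_t *out, int src_stride,
                           int w, int h, const uint8_t filter[2]) {
  int i, j;
  for (i = 0; i < h + 1; ++i) {
    for (j = 0; j < w; ++j)
      out[j] = (uint16_t)BIL_ROUND(src[j] * filter[0] + src[j + 1] * filter[1]);
    src += src_stride;
    out += w;
  }
}

/* Pass 2: vertical bilinear filter over the packed intermediate block. */
static void bil_second_pass(const uint16_t *in, uint16_t *out, int w, int h,
                            const uint8_t filter[2]) {
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j)
      out[j] = (uint16_t)BIL_ROUND(in[j] * filter[0] + in[j + w] * filter[1]);
    in += w;
    out += w;
  }
}

The x_offset/y_offset arguments index a table of eight such filter pairs, one per eighth-pel phase, which is why the macros pass bilinear_filters[x_offset] and bilinear_filters[y_offset].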
endif # ARCH_X86_64 -DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -386,6 +405,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h # PPC VSX utilities DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/txfm_common_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_common.h b/libs/libvpx/vpx_dsp/vpx_dsp_common.h index c8c852374f..2de4495465 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/libs/libvpx/vpx_dsp/vpx_dsp_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_VPX_DSP_COMMON_H_ -#define VPX_DSP_VPX_DSP_COMMON_H_ +#ifndef VPX_VPX_DSP_VPX_DSP_COMMON_H_ +#define VPX_VPX_DSP_VPX_DSP_COMMON_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,8 +25,8 @@ extern "C" { #define VPX_SWAP(type, a, b) \ do { \ type c = (b); \ - b = a; \ - a = c; \ + (b) = a; \ + (a) = c; \ } while (0) #if CONFIG_VP9_HIGHBITDEPTH @@ -57,6 +57,10 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } +static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -70,4 +74,4 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { } // extern "C" #endif -#endif // VPX_DSP_VPX_DSP_COMMON_H_ +#endif // VPX_VPX_DSP_VPX_DSP_COMMON_H_ diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1a743d910e..797ef7fe0d 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -37,325 +37,333 @@ if ($opts{arch} eq "x86_64") { # Intra prediction # -add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_4x4 sse2/; -add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; -add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_4x4 ssse3/; -add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/; +add_proto qw/void 
vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/; -add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; -add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_4x4 ssse3/; -add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; -add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. 
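The add_proto/specialize pairs in this file feed rtcd.pl, which generates the runtime-dispatch header: add_proto fixes the C signature, and specialize lists the SIMD variants the dispatcher may bind when the CPU supports them. Dropping vsx from a list, as the TODO(crbug.com/webm/1522) lines do, simply removes that candidate, so POWER builds fall back to the C implementation. Roughly, the generated code behaves like the sketch below (simplified; the real header collapses to a #define when no runtime choice is needed, and the stub bodies here are placeholders, not libvpx code):

#include <stddef.h>
#include <stdint.h>

typedef void (*tm_predictor_fn)(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left);

/* Placeholders standing in for the real _c and _sse2 symbols. */
static void tm_predictor_4x4_c_stub(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)dst; (void)stride; (void)above; (void)left;
}
static void tm_predictor_4x4_sse2_stub(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  (void)dst; (void)stride; (void)above; (void)left;
}

/* add_proto: one dispatch pointer per prototype, defaulting to C. */
static tm_predictor_fn vpx_tm_predictor_4x4_sketch = tm_predictor_4x4_c_stub;

/* specialize: each listed ISA becomes a conditional rebind at init time. */
static void setup_rtcd_sketch(int has_sse2) {
  if (has_sse2) vpx_tm_predictor_4x4_sketch = tm_predictor_4x4_sse2_stub;
}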
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/; -add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/; -add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_8x8 ssse3/; -add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/; +add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_d45_predictor_8x8 neon sse2/; -add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/; +add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_d63_predictor_8x8 ssse3/; -add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. 
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; -add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; -add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_8x8 ssse3/; -add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/; -add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/; +add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. 
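For reference, the DC predictor these prototypes describe fills the block with a single value, the rounded mean of the reconstructed pixels above and to the left. A sketch for the 8x8 case (standard VP9 behavior; the helper name is illustrative):

#include <stdint.h>
#include <stddef.h>

/* DC_PRED for an 8x8 block: average of 8 above + 8 left pixels. */
static void dc_predictor_8x8_sketch(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  int i, r, c, sum = 0;
  for (i = 0; i < 8; ++i) sum += above[i] + left[i];
  {
    const uint8_t dc = (uint8_t)((sum + 8) >> 4); /* 16 samples, round half up */
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) dst[c] = dc;
      dst += stride;
    }
  }
}

The _top/_left/_128 variants in this list handle frame edges where only one border (or neither) is available.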
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/; -add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_16x16 ssse3/; -add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; -add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; -add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; -add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_16x16 ssse3/; -add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; 
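TM_PRED (TrueMotion), by contrast, extrapolates a gradient: every output pixel is left[r] + above[c] minus the top-left corner pixel, clamped to the pixel range. A sketch, passing the corner explicitly for clarity (the libvpx C version reads it from above[-1]):

#include <stdint.h>
#include <stddef.h>

static uint8_t clip_pixel_sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* TM_PRED for a bs x bs block. */
static void tm_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int bs,
                                const uint8_t *above, const uint8_t *left,
                                uint8_t top_left) {
  int r, c;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c)
      dst[c] = clip_pixel_sketch(left[r] + above[c] - top_left);
    dst += stride;
  }
}

The highbd variants later in this hunk are the same computation on uint16_t with the clamp widened to the bd-bit range, which is why every highbd prototype carries the extra int bd argument.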
specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_32x32 ssse3/; -add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; -add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; -add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; -add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; -add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_32x32 ssse3/; -add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, 
const uint8_t *left"; specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/; -add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/; -add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/; -add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/; -add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void 
vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; - add_proto 
qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; - add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_16x16/, 
"uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; - add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/; - add_proto 
qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; - add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + 
add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/; } # CONFIG_VP9_HIGHBITDEPTH +if (vpx_config("CONFIG_VP9") eq "yes") { # # Sub Pixel Filters # @@ -363,25 +371,25 @@ add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/; +specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg 
sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; @@ -395,36 +403,38 @@ add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +} #CONFIG_VP9 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # - add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t 
*dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH +if (vpx_config("CONFIG_VP9") eq "yes") { # # Loopfilter # @@ -463,6 +473,7 @@ specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +} #CONFIG_VP9 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; @@ -583,7 +594,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa/; + specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct32x32_1 sse2 neon msa/; @@ -626,6 +637,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { 
specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; + specialize qw/vpx_iwht4x4_16_add sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. @@ -646,7 +658,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; specialize qw/vpx_idct32x32_34_add dspr2 msa/; specialize qw/vpx_idct32x32_1_add dspr2 msa/; - specialize qw/vpx_iwht4x4_16_add msa sse2/; + specialize qw/vpx_iwht4x4_16_add msa/; specialize qw/vpx_iwht4x4_1_add msa/; } # !CONFIG_VP9_HIGHBITDEPTH } # !CONFIG_EMULATE_HARDWARE @@ -654,7 +666,6 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - specialize qw/vpx_iwht4x4_16_add sse2/; add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; @@ -699,10 +710,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; @@ -718,7 +729,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2/; +specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/; # # Single block SAD @@ -748,13 +759,13 @@ add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x16 neon msa sse2 mmi/; +specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; add_proto 
qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2 mmi/; +specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x4 neon msa sse2 mmi/; +specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x8 neon msa sse2 mmi/; @@ -782,8 +793,23 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; + add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_hadamard_32x32 sse2 avx2/; + + add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_highbd_hadamard_8x8 avx2/; + + add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_highbd_hadamard_16x16 avx2/; + + add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_highbd_hadamard_32x32 avx2/; + add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; + + add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length"; + specialize qw/vpx_highbd_satd avx2/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; @@ -791,6 +817,9 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; + add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/vpx_hadamard_32x32 sse2 avx2/; + add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; } @@ -882,47 +911,47 @@ specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize 
qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto 
qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; -specialize qw/vpx_sum_squares_2d_i16 sse2 msa/; +specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; # # Structured Similarity (SSIM) @@ -939,7 +968,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Block subtraction # - add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; # # Single block SAD @@ -984,9 +1013,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Avg # - add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p"; - add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p"; - add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p"; + specialize qw/vpx_highbd_avg_8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p"; + specialize qw/vpx_highbd_avg_4x4 sse2/; + + add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2/; @@ -1028,43 +1061,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad64x64x4d sse2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad64x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad32x64x4d sse2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad32x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int 
ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad32x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad16x32x4d sse2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad16x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad16x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad8x16x4d sse2/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad8x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad8x4x4d sse2/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad4x8x4d sse2/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad4x4x4d sse2/; # @@ -1081,70 +1114,70 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq " # # Variance # -add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa mmi/; +add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/; # # Specialty Variance # -add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa/; +add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/; -add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get8x8var sse2 neon msa/; +add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get8x8var sse2 neon msa vsx/; -add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/; +add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 msa mmi/; +add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; -add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa mmi/; +add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_mse8x16 sse2 msa mmi vsx/; -add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa mmi/; +add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_mse8x8 sse2 msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; -add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; +add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; @@ -1153,440 +1186,449 @@ add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, # # Subpixel Variance # -add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t 
*src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize 
qw/vpx_highbd_12_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize 
qw/vpx_highbd_10_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int 
vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_8_get16x16var sse2/; - add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_8_get8x8var sse2/; - add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_10_get16x16var sse2/; - add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + 
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_10_get8x8var sse2/; + + add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_12_get16x16var sse2/; + + add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_12_get8x8var sse2/; + + add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_mse8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_mse8x8 sse2/; - add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; # # Subpixel Variance # - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; 
specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int 
x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t 
vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, 
"const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } # CONFIG_VP9_HIGHBITDEPTH @@ -1598,13 +1640,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") specialize qw/vpx_plane_add_noise sse2 msa/; add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_down sse2 neon msa/; + specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/; - add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/; + add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit"; + specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/; add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; - specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/; + specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/; } diff --git a/libs/libvpx/vpx_dsp/vpx_filter.h b/libs/libvpx/vpx_dsp/vpx_filter.h index 6cea251bcc..54357ee6ca 100644 --- a/libs/libvpx/vpx_dsp/vpx_filter.h +++ b/libs/libvpx/vpx_dsp/vpx_filter.h @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
 */

-#ifndef VPX_DSP_VPX_FILTER_H_
-#define VPX_DSP_VPX_FILTER_H_
+#ifndef VPX_VPX_DSP_VPX_FILTER_H_
+#define VPX_VPX_DSP_VPX_FILTER_H_

+#include <assert.h>
 #include "vpx/vpx_integer.h"

 #ifdef __cplusplus
@@ -26,8 +27,16 @@ extern "C" {

 typedef int16_t InterpKernel[SUBPEL_TAPS];

+static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
+  assert(filter[3] != 128);
+  if (!filter[0] && !filter[1] && !filter[2])
+    return 2;
+  else
+    return 8;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

-#endif  // VPX_DSP_VPX_FILTER_H_
+#endif  // VPX_VPX_DSP_VPX_FILTER_H_
diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
index ff19ea6470..3f4f577a21 100644
--- a/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -15,6 +15,209 @@
 #include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
 #include "vpx_ports/mem.h"

+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi32(a0, a1);
+  __m256i b1 = _mm256_sub_epi32(a0, a1);
+  __m256i b2 = _mm256_add_epi32(a2, a3);
+  __m256i b3 = _mm256_sub_epi32(a2, a3);
+  __m256i b4 = _mm256_add_epi32(a4, a5);
+  __m256i b5 = _mm256_sub_epi32(a4, a5);
+  __m256i b6 = _mm256_add_epi32(a6, a7);
+  __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+  a0 = _mm256_add_epi32(b0, b2);
+  a1 = _mm256_add_epi32(b1, b3);
+  a2 = _mm256_sub_epi32(b0, b2);
+  a3 = _mm256_sub_epi32(b1, b3);
+  a4 = _mm256_add_epi32(b4, b6);
+  a5 = _mm256_add_epi32(b5, b7);
+  a6 = _mm256_sub_epi32(b4, b6);
+  a7 = _mm256_sub_epi32(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi32(a0, a4);
+    b7 = _mm256_add_epi32(a1, a5);
+    b3 = _mm256_add_epi32(a2, a6);
+    b4 = _mm256_add_epi32(a3, a7);
+    b2 = _mm256_sub_epi32(a0, a4);
+    b6 = _mm256_sub_epi32(a1, a5);
+    b1 = _mm256_sub_epi32(a2, a6);
+    b5 = _mm256_sub_epi32(a3, a7);
+
+    a0 = _mm256_unpacklo_epi32(b0, b1);
+    a1 = _mm256_unpacklo_epi32(b2, b3);
+    a2 = _mm256_unpackhi_epi32(b0, b1);
+    a3 = _mm256_unpackhi_epi32(b2, b3);
+    a4 = _mm256_unpacklo_epi32(b4, b5);
+    a5 = _mm256_unpacklo_epi32(b6, b7);
+    a6 = _mm256_unpackhi_epi32(b4, b5);
+    a7 = _mm256_unpackhi_epi32(b6, b7);
+
+    b0 = _mm256_unpacklo_epi64(a0, a1);
+    b1 = _mm256_unpacklo_epi64(a4, a5);
+    b2 = _mm256_unpackhi_epi64(a0, a1);
+    b3 = _mm256_unpackhi_epi64(a4, a5);
+    b4 = _mm256_unpacklo_epi64(a2, a3);
+    b5 = _mm256_unpacklo_epi64(a6, a7);
+    b6 = _mm256_unpackhi_epi64(a2, a3);
+    b7 = _mm256_unpackhi_epi64(a6, a7);
+
+    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+  } else {
+    in[0] = _mm256_add_epi32(a0, a4);
+    in[7] = _mm256_add_epi32(a1, a5);
+    in[3] = _mm256_add_epi32(a2, a6);
+    in[4] = _mm256_add_epi32(a3, a7);
+    in[2] = _mm256_sub_epi32(a0, a4);
+    in[6] = _mm256_sub_epi32(a1, a5);
+    in[1] = _mm256_sub_epi32(a2, a6);
+    in[5] = _mm256_sub_epi32(a3, a7);
+  }
+}
+
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  __m128i src16[8];
+  __m256i src32[8];
+
+  src16[0] = _mm_loadu_si128((const
__m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = 
_mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -91,7 +294,7 @@ static void hadamard_col8x2_avx2(__m256i *in, int iter) { } } -static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { __m256i src[8]; src[0] = _mm256_loadu_si256((const __m256i *)src_diff); @@ -131,18 +334,19 @@ static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, _mm256_permute2x128_si256(src[6], src[7], 0x31)); } -void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, - tran_low_t *coeff) { - int idx; +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); int16_t *t_coeff = temp_coeff; #else int16_t *t_coeff = coeff; #endif - + int16_t *coeff16 = (int16_t *)coeff; + int idx; for (idx = 0; idx < 2; ++idx) { - int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); } @@ -161,11 +365,69 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, b1 = _mm256_srai_epi16(b1, 1); b2 = _mm256_srai_epi16(b2, 1); b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
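The round trip that comment refers to: with CONFIG_VP9_HIGHBITDEPTH, tran_low_t is int32_t, so every 16-bit intermediate result would have to be widened on store and packed again on load. A minimal sketch of that cost, with hypothetical helper names (the library's store_tran_low/load_tran_low in the bitdepth_conversion headers do the equivalent work):

    #include <immintrin.h>
    #include <stdint.h>

    /* Widen 16 x int16 into two 8 x int32 stores: the "unpack/store" half. */
    static void widen_store(__m256i v, int32_t *out) {
      _mm256_storeu_si256((__m256i *)out,
                          _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v)));
      _mm256_storeu_si256((__m256i *)(out + 8),
                          _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v, 1)));
    }

    /* Reload and narrow back to 16 x int16: the "load/pack" half. */
    static __m256i pack_load(const int32_t *in) {
      const __m256i lo = _mm256_loadu_si256((const __m256i *)in);
      const __m256i hi = _mm256_loadu_si256((const __m256i *)(in + 8));
      /* packs operates per 128-bit lane; the permute restores element order. */
      return _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0xd8);
    }

Keeping intermediate sums in the int16_t scratch buffer below sidesteps both halves until the final store.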
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 2); + b1 = _mm256_srai_epi16(b1, 2); + b2 = _mm256_srai_epi16(b2, 2); + b3 = _mm256_srai_epi16(b3, 2); store_tran_low(_mm256_add_epi16(b0, b2), coeff); - store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); - store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); - store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); coeff += 16; t_coeff += 16; @@ -195,3 +457,26 @@ int vpx_satd_avx2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum_128); } } + +#if CONFIG_VP9_HIGHBITDEPTH +int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index a235ba41df..5aba903a2d 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -138,6 +138,56 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { return (avg + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const __m128i zero = _mm_setzero_si128(); + s0 = _mm_loadu_si128((const __m128i *)(s)); + s1 = _mm_loadu_si128((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p)); + s0 
= _mm_adds_epu16(s0, s1); + s1 = _mm_unpackhi_epi16(s0, zero); + s0 = _mm_unpacklo_epi16(s0, zero); + s0 = _mm_add_epi32(s0, s1); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); + avg = _mm_cvtsi128_si32(s0); + + return (avg + 32) >> 6; +} + +unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + s0 = _mm_loadl_epi64((const __m128i *)(s)); + s1 = _mm_loadl_epi64((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2)); + avg = _mm_extract_epi16(s0, 0); + + return (avg + 8) >> 4; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; @@ -214,8 +264,9 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, - tran_low_t *coeff) { +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); @@ -229,37 +280,74 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); - store_tran_low(src[0], coeff); - coeff += 8; - store_tran_low(src[1], coeff); - coeff += 8; - store_tran_low(src[2], coeff); - coeff += 8; - store_tran_low(src[3], coeff); - coeff += 8; - store_tran_low(src[4], coeff); - coeff += 8; - store_tran_low(src[5], coeff); - coeff += 8; - store_tran_low(src[6], coeff); - coeff += 8; - store_tran_low(src[7], coeff); + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, - tran_low_t *coeff) { +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. 
Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int16_t *coeff16 = (int16_t *)coeff; int idx; for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = + const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; - vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); } for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = load_tran_low(coeff); - __m128i coeff1 = load_tran_low(coeff + 64); - __m128i coeff2 = load_tran_low(coeff + 128); - __m128i coeff3 = load_tran_low(coeff + 192); + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); @@ -271,17 +359,82 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, b2 = _mm_srai_epi16(b2, 1); b3 = _mm_srai_epi16(b3, 1); + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
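Per output element, these combine loops reduce to a small scalar recurrence: co-located coefficients from the four sub-blocks are butterflied, then scaled by a shift of 1 when folding 8x8 results into 16x16 and by 2 when folding 16x16 into 32x32, which keeps the running sums inside int16_t range. A scalar sketch (hypothetical helper, not library code):

    #include <stdint.h>

    /* c0..c3: co-located coefficients from the four sub-blocks.
       out: the four combined coefficients, one per quadrant. */
    static void combine_quadrants(int c0, int c1, int c2, int c3, int shift,
                                  int16_t out[4]) {
      const int b0 = (c0 + c1) >> shift;  /* matches _mm_add/_mm_srai pairs */
      const int b1 = (c0 - c1) >> shift;
      const int b2 = (c2 + c3) >> shift;
      const int b3 = (c2 - c3) >> shift;
      out[0] = (int16_t)(b0 + b2);
      out[1] = (int16_t)(b1 + b3);
      out[2] = (int16_t)(b0 - b2);
      out[3] = (int16_t)(b1 - b3);
    }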
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); store_tran_low(coeff0, coeff); - store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff1, coeff + 256); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - store_tran_low(coeff2, coeff + 128); - store_tran_low(coeff3, coeff + 192); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); coeff += 8; + t_coeff += 8; } } @@ -311,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, +void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); @@ -360,7 +513,7 @@ void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, _mm_storeu_si128((__m128i *)hbuf, s1); } -int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) { __m128i zero = _mm_setzero_si128(); __m128i src_line = _mm_load_si128((const __m128i *)ref); __m128i s0 = _mm_sad_epu8(src_line, zero); @@ -380,7 +533,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { return _mm_extract_epi16(s0, 0); } -int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) { +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) { int idx; int width = 4 << bwl; int16_t mean; diff --git a/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c index f83b26490e..e4e1e0e7a2 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -13,11 +13,12 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" -void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { - /* comp and pred must be 16 byte aligned. */ - assert(((intptr_t)comp & 0xf) == 0); + /* comp_pred and pred must be 16 byte aligned. 
*/ + assert(((intptr_t)comp_pred & 0xf) == 0); assert(((intptr_t)pred & 0xf) == 0); if (width > 8) { int x, y; @@ -26,17 +27,17 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); const __m128i avg = _mm_avg_epu8(p, r); - _mm_store_si128((__m128i *)(comp + x), avg); + _mm_store_si128((__m128i *)(comp_pred + x), avg); } - comp += width; + comp_pred += width; pred += width; ref += ref_stride; } } else { // width must be 4 or 8. int i; - // Process 16 elements at a time. comp and pred have width == stride and - // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all - // divisible by 16 so just ref needs to be massaged when loading. + // Process 16 elements at a time. comp_pred and pred have width == stride + // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are + // all divisible by 16 so just ref needs to be massaged when loading. for (i = 0; i < width * height; i += 16) { const __m128i p = _mm_load_si128((const __m128i *)pred); __m128i r; @@ -45,10 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, r = _mm_loadu_si128((const __m128i *)ref); ref += 16; } else if (width == 4) { - r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride), - *(const uint32_t *)(ref + 2 * ref_stride), - *(const uint32_t *)(ref + ref_stride), - *(const uint32_t *)(ref)); + r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride), + loadu_uint32(ref + 2 * ref_stride), + loadu_uint32(ref + ref_stride), loadu_uint32(ref)); ref += 4 * ref_stride; } else { @@ -60,10 +60,10 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, ref += 2 * ref_stride; } avg = _mm_avg_epu8(p, r); - _mm_store_si128((__m128i *)comp, avg); + _mm_store_si128((__m128i *)comp_pred, avg); pred += 16; - comp += 16; + comp_pred += 16; } } } diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h index 3552c07cd3..c02b47a3eb 100644 --- a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ -#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ #include <immintrin.h> @@ -41,4 +41,4 @@ static INLINE void store_tran_low(__m256i a, tran_low_t *b) { _mm256_storeu_si256((__m256i *)b, a); #endif } -#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h index 5d1d779572..74dde656b1 100644 --- a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ -#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ #include <emmintrin.h> @@ -53,4 +53,4 @@ static INLINE void store_zero_tran_low(tran_low_t *a) { _mm_store_si128((__m128i *)(a), zero); #endif } -#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve.h b/libs/libvpx/vpx_dsp/x86/convolve.h index 68d7589d45..b75d4d7216 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve.h +++ b/libs/libvpx/vpx_dsp/x86/convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_CONVOLVE_H_ -#define VPX_DSP_X86_CONVOLVE_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_H_ #include <assert.h> @@ -16,56 +16,83 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" +// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty +// hacky and awful to read. Note that there is a filter_x[3] == 128 check in +// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function +// assumes the filter is always 8 tap. typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ +// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we +// have 4-tap vert avg filter. +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \ void vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ - const int16_t *filter = filter_kernel[offset]; \ + const int16_t *filter_row = filter[offset]; \ (void)x0_q4; \ (void)x_step_q4; \ (void)y0_q4; \ (void)y_step_q4; \ - assert(filter[3] != 128); \ + assert(filter_row[3] != 128); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ while (w >= 16) { \ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } \ - } else { \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ?
8 : 4; \ while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ } \ + (void)num_taps; \ + } else { \ + const int num_taps = 2; \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } \ + (void)num_taps; \ } \ } -#define FUN_CONV_2D(avg, opt) \ +#define FUN_CONV_2D(avg, opt, is_avg) \ void vpx_convolve8_##avg##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ @@ -79,7 +106,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, assert(h <= 64); \ assert(x_step_q4 == 16); \ assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ @@ -87,6 +114,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ filter, x0_q4, x_step_q4, y0_q4, \ y_step_q4, w, h); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 
8 : 4; \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ + dst, dst_stride, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h); \ } else { \ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ @@ -106,57 +142,86 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \ + is_avg) \ void vpx_highbd_convolve8_##name##_##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ - const int16_t *filter = filter_kernel[offset]; \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) { \ + const int16_t *filter_row = filter_kernel[offset]; \ + if (step_q4 == 16 && filter_row[3] != 128) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ? 
8 : 4; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ } else { \ + const int num_taps = 2; \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ + (void)num_taps; \ } \ } \ if (w) { \ @@ -166,7 +231,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ +#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \ void vpx_highbd_convolve8_##avg##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ @@ -175,7 +240,8 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, assert(w <= 64); \ assert(h <= 64); \ if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ + filter_x[3] == 128) { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ fdata2, 64, filter, x0_q4, x_step_q4, \ @@ -183,6 +249,16 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, vpx_highbd_convolve8_##avg##vert_##opt( \ fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ y0_q4, y_step_q4, w, h, bd); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ + bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } else { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ @@ -198,6 +274,6 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, bd); \ } \ } -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // VPX_DSP_X86_CONVOLVE_H_ +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_X86_CONVOLVE_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve_avx2.h b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h index bc96b738f4..99bc9637fc 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve_avx2.h +++ b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ -#define VPX_DSP_X86_CONVOLVE_AVX2_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ #include <immintrin.h> // AVX2 @@ -100,6 +100,63 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s, return sum1; } +static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1); +} + +static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1); +} + +static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((uint32_t *)(dst_ptr_2)) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); +} + +static INLINE __m256i mm256_round_epi32(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth); + return _mm256_srai_epi32(nearest_src, depth); +} + +static INLINE __m256i mm256_round_epi16(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); + return _mm256_srai_epi16(nearest_src, depth); +} + +static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0, + const __m256i *const src_1, + const __m256i *const ker_0, + const __m256i *const ker_1) { + const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0); + const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1); + return _mm256_add_epi32(tmp_0, tmp_1); +} + #undef MM256_BROADCASTSI128_SI256 -#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_ +#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve_sse2.h b/libs/libvpx/vpx_dsp/x86/convolve_sse2.h new file mode 100644 index 0000000000..8443546394 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/convolve_sse2.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// values at index 2 and 3 to return 3 2 3 2 3 2 3 2 as 16-bit words. +static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpacklo_epi32(*reg, *reg); + return _mm_unpackhi_epi64(tmp, tmp); +} + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// values at index 4 and 5 to return 5 4 5 4 5 4 5 4 as 16-bit words. +static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpackhi_epi32(*reg, *reg); + return _mm_unpacklo_epi64(tmp, tmp); +} + +// Interprets src as 8-bit words, zero extends to form 16-bit words, then +// multiplies with ker and adds the adjacent results to form 32-bit words. +// Finally adds the results from 1 and 2 together. +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); + const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); + const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +// Interprets src as 16-bit words, then multiplies with ker and adds the +// adjacent results to form 32-bit words. Finally adds the results from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { + const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); + const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); + return _mm_packs_epi32(madd_1, madd_2); +} + +// Interleaves src_1 and src_2 +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { + const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); + const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); + return _mm_packs_epi32(tmp_1, tmp_2); +} + +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); + return _mm_srai_epi16(nearest_src, depth); +} + +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h index e5d452f99e..8a4b165133 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h +++ b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_ -#define VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ #include <assert.h> #include <tmmintrin.h> // SSSE3 @@ -109,4 +109,4 @@ static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, return temp; } -#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm index 97cb43b671..9d8e5e3e09 100644 --- a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -232,237 +232,6 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_sse2(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vpx_rv) -global sym(vpx_mbpost_proc_down_sse2) PRIVATE -sym(vpx_mbpost_proc_down_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 128+16 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax - mov [rsp+128+8], eax - mov [rsp+128+12], eax -%define flimit4 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vpx_rv))] -%endif - - ;rows +=8; - add dword arg(2), 8 - - ;for(c=0; c<cols; c+=8) diff --git a/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c --- a/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -11,7 +11,7 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" // ----------------------------------------------------------------------------- // Copy and average @@ -20,7 +20,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int width, int h, int bd) { + int w, int h, int bd) { (void)filter; (void)x0_q4; (void)x_step_q4; @@ -28,8 +28,8 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, (void)y_step_q4; (void)bd; - assert(width % 4 == 0); - if (width > 32) { // width = 64 + assert(w % 4 == 0); + if (w > 32) { // w = 64 do { const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); @@ -43,7 +43,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 16) { // width = 32 + } else if (w > 16) { // w = 32 do { const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); @@ -53,7 +53,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 8) { // width = 16 + } else if (w > 8) { // w = 16 __m256i p0, p1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -67,7 +67,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h -= 2; } while (h > 0); - } else if (width > 4) { // width = 8 + } else if (w > 4) { // w = 8 __m128i p0, p1; do { p0 = _mm_loadu_si128((const __m128i *)src); @@ -81,7 +81,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h -= 2; } while (h > 0); - } else { // width = 4 + } else { // w = 4 __m128i p0, p1; do { p0 = _mm_loadl_epi64((const __m128i *)src); @@ -102,7 +102,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t
dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int width, int h, int bd) { + int w, int h, int bd) { (void)filter; (void)x0_q4; (void)x_step_q4; @@ -110,8 +110,8 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, (void)y_step_q4; (void)bd; - assert(width % 4 == 0); - if (width > 32) { // width = 64 + assert(w % 4 == 0); + if (w > 32) { // w = 64 __m256i p0, p1, p2, p3, u0, u1, u2, u3; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -130,7 +130,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 16) { // width = 32 + } else if (w > 16) { // w = 32 __m256i p0, p1, u0, u1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -143,7 +143,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 8) { // width = 16 + } else if (w > 8) { // w = 16 __m256i p0, p1, u0, u1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -158,7 +158,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride << 1; h -= 2; } while (h > 0); - } else if (width > 4) { // width = 8 + } else if (w > 4) { // w = 8 __m128i p0, p1, u0, u1; do { p0 = _mm_loadu_si128((const __m128i *)src); @@ -172,7 +172,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride << 1; h -= 2; } while (h > 0); - } else { // width = 4 + } else { // w = 4 __m128i p0, p1, u0, u1; do { p0 = _mm_loadl_epi64((const __m128i *)src); @@ -192,8 +192,6 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, // ----------------------------------------------------------------------------- // Horizontal and vertical filtering -#define CONV8_ROUNDING_BITS (7) - static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; @@ -210,6 +208,9 @@ static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + // ----------------------------------------------------------------------------- // Horizontal Filtering @@ -923,6 +924,196 @@ static void vpx_highbd_filter_block1d16_h8_avg_avx2( } while (height > 0); } +static void vpx_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We extract the middle four elements of the kernel into two registers in + // the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add on the two + // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we + // can do this two rows at a time. 
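+ // (Illustration of the arithmetic below: for one output pixel x, the two
+ // madd/add steps compute the scalar sum
+ //   sum = k[2] * s[x - 1] + k[3] * s[x] + k[4] * s[x + 1] + k[5] * s[x + 2]
+ // which is then rounded as (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS
+ // and clamped to the range [0, (1 << bd) - 1].)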
+ + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + } +} + +static void vpx_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will extract the middle four elements of the kernel into two registers + // in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum of the first half. + // Calling add gives us the first half of the output. Repeat again to get the + // whole output. Since avx2 allows us to use 256-bit buffer, we can do this + // two rows at a time.
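+ // (Worked example of the rounding used here: CONV8_ROUNDING_BITS is 7, so
+ // CONV8_ROUNDING_NUM is 1 << 6 = 64 and each 32-bit sum becomes
+ // (sum + 64) >> 7; _mm256_packus_epi32 then saturates the result to
+ // unsigned 16 bits and _mm256_min_epi16 applies the (1 << bd) - 1 ceiling.)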
+ + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg, res_first, res_last; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for first half + res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for second half + res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round each result + res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS); + res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_first, res_last); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg); + } +} + +static void vpx_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + static void vpx_highbd_filter_block1d8_v8_avg_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch,
uint32_t height, const int16_t *filter, int bd) { @@ -1058,39 +1249,235 @@ static void vpx_highbd_filter_block1d8_v2_avg_avx2( } while (height > 0); } -void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); +static void vpx_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get the partial + // output. Then we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + // Result after multiply and add + __m256i res_reg; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + + // Output + res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + +
src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get the partial + // output. Then we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel + + // Result after multiply and add + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23); + + // Output from first half + res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo, + &kernel_reg_23, &kernel_reg_45); + + // Output from second half + res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg_lo = + mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS); + res_reg_hi = + mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void
vpx_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; + #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); -HIGH_FUN_CONV_2D(, avx2); +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_avx2 \ + vpx_highbd_filter_block1d16_v8_avg_avx2 +#define vpx_highbd_filter_block1d16_h4_avg_avx2 \ + vpx_highbd_filter_block1d16_h8_avg_avx2 +#define vpx_highbd_filter_block1d8_v4_avg_avx2 \ + vpx_highbd_filter_block1d8_v8_avg_avx2 +#define vpx_highbd_filter_block1d8_h4_avg_avx2 \ + vpx_highbd_filter_block1d8_h8_avg_avx2 +#define vpx_highbd_filter_block1d4_v4_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_avx2 +#define vpx_highbd_filter_block1d4_h4_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_avx2 + +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , avx2, 0); +HIGH_FUN_CONV_2D(, avx2, 0); + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. 
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); #define vpx_highbd_filter_block1d4_h8_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_sse2 #define vpx_highbd_filter_block1d4_h2_avg_avx2 \ @@ -1100,9 +1487,9 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, - avx2); -HIGH_FUN_CONV_2D(avg_, avx2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); +HIGH_FUN_CONV_2D(avg_, avx2, 1); #undef HIGHBD_FUNC diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index de097c66a6..7898ee12c8 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[15] = in[15]; } -static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; // stage 2 @@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, in = all[i]; highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); - highbd_idct16_4col(in); + vpx_highbd_idct16_4col_sse4_1(in); input += 4 * 16; } @@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, transpose_32bit_4x4(all[1] + i, out + 4); transpose_32bit_4x4(all[2] + i, out + 8); transpose_32bit_4x4(all[3] + i, out + 12); - highbd_idct16_4col(out); + vpx_highbd_idct16_4col_sse4_1(out); for (j = 0; j < 16; ++j) { highbd_write_buffer_4(dest + j * stride, out[j], bd); diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c index 38e64f3bc9..fe74d272ad 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -16,28 +16,6 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -static INLINE void highbd_idct4(__m128i *const io) { - __m128i temp[2], step[4]; - - transpose_32bit_4x4(io, io); - - // stage 1 - temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] - extend_64bit(temp[0], temp); - step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); - temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] - extend_64bit(temp[0], temp); - step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); - highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], - &step[3]); - - // stage 2 - io[0] = 
_mm_add_epi32(step[0], step[3]); // step[0] + step[3] - io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] - io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] - io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] -} - void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, int stride, int bd) { __m128i io[4]; @@ -59,8 +37,8 @@ void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[0] = _mm_srai_epi16(io_short[0], 4); io[1] = _mm_srai_epi16(io_short[1], 4); } else { - highbd_idct4(io); - highbd_idct4(io); + highbd_idct4_sse4_1(io); + highbd_idct4_sse4_1(io); io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 909a6b7948..bb7a510e15 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index ae391b2c02..8b2e3d2415 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -17,7 +17,7 @@ #include "vpx_dsp/x86/inv_txfm_ssse3.h" #include "vpx_dsp/x86/transpose_sse2.h" -static void highbd_idct8x8_half1d(__m128i *const io) { +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); @@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); temp[0] = io[4]; temp[1] = io[5]; @@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[5] = io[9]; io[6] = io[10]; io[7] = io[11]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = temp[0]; io[9] = temp[1]; io[10] = temp[2]; io[11] = temp[3]; - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); highbd_idct8x8_final_round(io); } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c index 2051381aa8..43634aea3a 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c +++ 
b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -460,7 +460,8 @@ void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, const int J = left[1]; const int K = left[2]; const int L = left[3]; - const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i XXXXXABC = _mm_castps_si128( + _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1))); const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c index b9dcef205b..d673fac493 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -170,9 +170,9 @@ void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, } } -DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 -}; +DECLARE_ALIGNED(16, static const uint8_t, + rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1 }; static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { *a = _mm_shuffle_epi8(*a, *rotrw); diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm index c61b62104f..caf506ac07 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd movd m1, [aboveq-2] movq m0, [aboveq] pshuflw m1, m1, 0x0 @@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps movlhps m1, m1 ; tl tl tl tl tl tl tl tl ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl psllw m3, m4 pcmpeqw m2, m2 @@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps RET INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one movd m1, [aboveq-2] mova m0, [aboveq] pshuflw m1, m1, 0x0 @@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one pxor m3, m3 pxor m4, m4 pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 + pinsrw m4, bdd, 0 pshuflw m3, m3, 0x0 DEFINE_ARGS dst, stride, line, left punpcklqdq m3, m3 @@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd movd m2, [aboveq-2] mova m0, [aboveq] mova m1, [aboveq+16] pshuflw m2, m2, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd punpcklqdq m2, m2 psllw m3, m4 pcmpeqw m5, m5 @@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, 
bd movd m0, [aboveq-2] mova m1, [aboveq] mova m2, [aboveq+16] @@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps pshuflw m0, m0, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m5, m5 - movd m6, bpsd + movd m6, bdd psllw m5, m6 pcmpeqw m7, m7 pxor m6, m6 ; min possible value diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h index e0f7495521..78cf9111d9 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -19,6 +19,10 @@ #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +// Note: There is no 64-bit bit-level shifting SIMD instruction. All +// coefficients are left shifted by 2, so that dct_const_round_shift() can be +// done by right shifting 2 bytes. + static INLINE void extend_64bit(const __m128i in, __m128i *const out /*out[2]*/) { out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 @@ -397,4 +401,4 @@ static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, recon_and_store_4(out, dest, bd); } -#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 9c8eef40f7..f446bb13f3 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ -#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ #include <smmintrin.h> // SSE4.1 @@ -84,4 +84,29 @@ static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, *out1 = multiplication_round_shift_sse4_1(temp, c1); } -#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +static INLINE void highbd_idct4_sse4_1(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index ec22db9f4c..d265fc1a92 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -47,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file.
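+// (Note: the 8-bit blimit/limit/thresh values below are scaled to the
+// working bit depth by a left shift of (bd - 8) bits: no shift for bd == 8,
+// a shift of 2 for bd == 10, and a shift of 4 for bd == 12, so the
+// comparisons run at the same relative magnitude at every depth.)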
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; __m128i ps1, qs1, ps0, qs0; @@ -70,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, __m128i eight, four; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); } - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + q4 = _mm_load_si128((__m128i *)(s + 4 * pitch)); + p4 = _mm_load_si128((__m128i *)(s - 5 * pitch)); + q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); + p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); // highbd_filter_mask abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); @@ -111,14 +111,14 @@ void 
vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // highbd_hev_mask (in C code this is actually called from highbd_filter4) flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); @@ -132,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // return ~mask // lp filter @@ -207,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // (because, in both vars, each block of 16 either all 1s or all 0s) flat = _mm_and_si128(flat, mask); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + p5 = _mm_load_si128((__m128i *)(s - 6 * pitch)); + q5 = _mm_load_si128((__m128i *)(s + 5 * pitch)); + p6 = _mm_load_si128((__m128i *)(s - 7 * pitch)); + q6 = _mm_load_si128((__m128i *)(s + 6 * pitch)); + p7 = _mm_load_si128((__m128i *)(s - 8 * pitch)); + q7 = _mm_load_si128((__m128i *)(s + 7 * pitch)); // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 // but referred to as p0-p4 & q0-q4 in fn) @@ -389,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q6 = _mm_and_si128(flat2, flat2_q6); // get values for when (flat2 && flat && mask) q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); + _mm_store_si128((__m128i *)(s - 7 * pitch), p6); + _mm_store_si128((__m128i *)(s + 6 * pitch), q6); p5 = _mm_andnot_si128(flat2, p5); // p5 remains unchanged if !(flat2 && flat && mask) @@ -404,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // get values for when (flat2 && flat && mask) q5 = _mm_or_si128(q5, flat2_q5); // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); + _mm_store_si128((__m128i *)(s - 6 * pitch), p5); + _mm_store_si128((__m128i *)(s + 5 * pitch), q5); p4 = _mm_andnot_si128(flat2, p4); // p4 remains unchanged if !(flat2 && flat && mask) @@ -417,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q4 = _mm_and_si128(flat2, flat2_q4); // get values for when (flat2 && flat && mask) q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); + _mm_store_si128((__m128i *)(s - 5 * pitch), p4); + _mm_store_si128((__m128i *)(s + 4 * pitch), q4); p3 = _mm_andnot_si128(flat2, p3); // p3 takes value from 
highbd_filter8 if !(flat2 && flat && mask) @@ -430,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q3 = _mm_and_si128(flat2, flat2_q3); // get values for when (flat2 && flat && mask) q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); + _mm_store_si128((__m128i *)(s - 4 * pitch), p3); + _mm_store_si128((__m128i *)(s + 3 * pitch), q3); p2 = _mm_andnot_si128(flat2, p2); // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -444,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q2 = _mm_and_si128(flat2, flat2_q2); // get values for when (flat2 && flat && mask) q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); p1 = _mm_andnot_si128(flat2, p1); // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -457,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q1 = _mm_and_si128(flat2, flat2_q1); // get values for when (flat2 && flat && mask) q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); p0 = _mm_andnot_si128(flat2, p0); // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -470,22 +470,22 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q0 = _mm_and_si128(flat2, flat2_q0); // get values for when (flat2 && flat && mask) q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s - 0 * pitch), q0); } -void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd); - vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd); } -void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -493,16 +493,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - 
__m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -519,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_set1_epi16(0x200); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_set1_epi16(0x800); } @@ -553,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 
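[Reviewer sketch.] The renames in these hunks leave the mask math untouched, but the idioms are easy to miss when reading the intrinsics. A self-contained sketch of the branchless 16-bit building blocks the filter relies on (helper names are illustrative, not the library's), plus the bias trick behind the t80 setup above:

#include <emmintrin.h>
#include <stdint.h>

/* |a - b| per unsigned 16-bit lane: one of the two saturating
 * differences is zero, the other is the absolute difference. */
static __m128i abs_diff_u16(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
}

/* Branchless "v > limit" per lane: _mm_subs_epu16 saturates to zero
 * when v <= limit, so comparing with zero and inverting leaves
 * all-ones exactly in the lanes that exceed the limit. */
static __m128i gt_mask_u16(__m128i v, __m128i limit) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
  return _mm_xor_si128(_mm_cmpeq_epi16(_mm_subs_epu16(v, limit), zero), ffff);
}

/* filter4 works in a signed domain: pixels are biased down by
 * t80 = 0x80 << (bd - 8) (0x80, 0x200, 0x800 for bd = 8/10/12, as in
 * the setup above) so that saturating signed-16 arithmetic emulates
 * the reference clamp, then biased back up before storing. */
static int16_t to_signed_domain(uint16_t px, int bd) {
  return (int16_t)(px - (0x80 << (bd - 8)));
}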
// So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; mask = _mm_max_epi16(abs_q1q0, mask); @@ -576,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // flat_mask4 @@ -674,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -694,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s + 0 * pitch), q0); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); } void vpx_highbd_lpf_horizontal_8_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } -void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = 
_mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -760,57 +760,57 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); - tff80 = _mm_set1_epi16(0xff80); - tffe0 = _mm_set1_epi16(0xffe0); + tff80 = _mm_set1_epi16((int16_t)0xff80); + tffe0 = _mm_set1_epi16((int16_t)0xffe0); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); } - ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * 
p)), t80); - ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; @@ -822,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // filter4 @@ -872,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } void vpx_highbd_lpf_horizontal_4_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], @@ -998,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, highbd_transpose(src1, in_p, dest1, out_p, 1); } -void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); 
uint16_t *src[1]; uint16_t *dst[1]; @@ -1009,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1018,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_4_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1030,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1038,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; @@ -1055,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1064,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_8_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1076,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1085,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { 
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); @@ -1104,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - highbd_transpose(src, p, dst, 8, 2); + highbd_transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, @@ -1115,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = s; // Transpose back - highbd_transpose(src, 8, dst, p, 2); + highbd_transpose(src, 8, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[256]); // Transpose 16x16 - highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); // Transpose back - highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, + pitch); } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index cedf98aff4..7149e4fb74 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -11,6 +11,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" diff --git a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index d9a6932e0b..cefde0f57d 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -32,12 +32,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. 
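[Reviewer sketch.] All of the vertical loop-filter wrappers renamed above share one shape: transpose the edge columns into a small row-major scratch buffer, run the corresponding horizontal kernel on it, then transpose back. A scalar model of that round trip for a single 8x8 tile; the names and the filter_h callback are illustrative:

#include <stdint.h>

static void transpose8x8(const uint16_t *src, int src_pitch,
                         uint16_t *dst, int dst_pitch) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) dst[c * dst_pitch + r] = src[r * src_pitch + c];
}

/* s points at the vertical edge; filter_h stands in for e.g.
 * vpx_highbd_lpf_horizontal_8_sse2 with its threshold arguments bound. */
static void lpf_vertical_model(uint16_t *s, int pitch,
                               void (*filter_h)(uint16_t *, int)) {
  uint16_t t[8 * 8];
  transpose8x8(s - 4, pitch, t, 8); /* 4 pixels either side of the edge */
  filter_h(t + 4 * 8, 8);           /* edge is now horizontal, pitch 8 */
  transpose8x8(t, 8, s - 4, pitch); /* write filtered columns back */
}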
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 mova %4, %3 ; make copies to manipulate to calc sum @@ -91,81 +91,65 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + ref, ref_stride, height, sse %define block_height heightd - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - %endif + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + ref, ref_stride, height, sse %define block_height heightd %endif @@ -181,7 +165,7 @@ SECTION .text sar block_height, 1 %endif %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif ; FIXME(rbultje) replace by jumptable? 
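[Reviewer sketch.] For orientation in the asm that follows: SUM_SSE accumulates the sum of (src - ref) differences and the sum of their squares, the function returns SE and stores SSE, and the C wrappers further down combine them as variance = SSE - sum^2/N (the `>> (shift)` parenthesization fixed later in this patch). A scalar model of both steps, with illustrative names:

#include <stdint.h>

/* What SUM_SSE accumulates, lane math flattened to scalar. */
static void sum_sse_model(const uint16_t *src, const uint16_t *ref, int n,
                          int64_t *sum, uint64_t *sse) {
  int i;
  for (i = 0; i < n; ++i) {
    const int d = (int)src[i] - (int)ref[i];
    *sum += d;
    *sse += (uint64_t)((int64_t)d * d);
  }
}

/* How the C wrappers combine them: N = w * h is a power of two, so
 * sum*sum / N is a right shift by log2(w) + log2(h). */
static uint32_t variance_model(uint64_t sse, int64_t sum, int shift) {
  const int64_t var = (int64_t)sse - ((sum * sum) >> shift);
  return var >= 0 ? (uint32_t)var : 0;
}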
@@ -196,35 +180,35 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m2, [srcq + 16] - mova m1, [dstq] - mova m3, [dstq + 16] + mova m1, [refq] + mova m3, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m2, [secq+16] + pavgw m0, [second_predq] + pavgw m2, [second_predq+16] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq + src_strideq*2] - mova m1, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m1, [refq] + mova m3, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -242,40 +226,40 @@ SECTION .text movu m1, [srcq+16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pavgw m0, m1 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -284,14 +268,14 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -308,7 +292,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -318,8 +302,8 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). 
Total number of ; instructions is the same (5), but it is 1 mul instead of 2, so might be @@ -336,23 +320,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] mova m4, m1 - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pmullw m1, filter_y_a pmullw m5, filter_y_b paddw m1, filter_rnd @@ -364,16 +348,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -397,41 +381,41 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq + 2] movu m5, [srcq + 18] - mova m2, [dstq] - mova m3, [dstq + 16] + mova m2, [refq] + mova m3, [refq + 16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq + src_strideq*2] movu m4, [srcq + 2] movu m5, [srcq + src_strideq*2 + 2] - mova m2, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m2, [refq] + mova m3, [refq + ref_strideq*2] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -460,20 +444,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m1, m3 - mova m4, [dstq] - mova m5, [dstq + 16] + mova m4, [refq] + mova m5, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -489,20 +473,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m2, m3 - mova m4, [dstq] - mova m5, [dstq + dst_strideq*2] + mova m4, [refq] + mova m5, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -511,14 
+495,14 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -535,7 +519,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -565,21 +549,21 @@ SECTION .text paddw m0, filter_rnd psrlw m1, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -604,21 +588,21 @@ SECTION .text paddw m0, filter_rnd psrlw m4, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -633,14 +617,14 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -657,7 +641,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -667,8 +651,8 @@ SECTION .text movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -680,23 +664,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m2, [srcq+2] movu m3, [srcq+src_strideq*2+2] - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -708,16 +692,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + 
pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -732,14 +716,14 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -756,7 +740,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -789,24 +773,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m1, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -830,24 +814,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m2, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -859,8 +843,8 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift @@ -869,7 +853,7 @@ SECTION .text mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -897,7 +881,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter @@ -945,23 +929,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m1, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m1, m3 psrlw m0, 4 psrlw m1, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 2] + lea refq, [refq + ref_strideq * 2] %if %2 == 1 ; avg - add secq, sec_str + 
add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -999,23 +983,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m4, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m4, m3 psrlw m0, 4 psrlw m4, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 4] + lea refq, [refq + ref_strideq * 4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index e646767e19..a256a59ec0 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -16,9 +16,9 @@ SECTION .text ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) @@ -36,8 +36,8 @@ sym(vpx_highbd_calc16x16var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes @@ -169,9 +169,9 @@ sym(vpx_highbd_calc16x16var_sse2): ;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) @@ -189,8 +189,8 @@ sym(vpx_highbd_calc8x8var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index a6f7c3d25d..dd6cfbb2c4 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,8 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "./vpx_config.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, @@ -89,9 +90,9 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, } #define HIGH_GET_VAR(S) \ - void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ + void vpx_highbd_8_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ @@ -135,7 +136,7 @@ HIGH_GET_VAR(8); highbd_8_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift)); \ } \ \ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ @@ -148,7 +149,7 @@ HIGH_GET_VAR(8); highbd_10_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ \ @@ -162,7 +163,7 @@ HIGH_GET_VAR(8); highbd_12_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } @@ -251,7 +252,7 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, #define DECL(w, opt) \ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ DECL(8, opt); \ @@ -265,28 +266,28 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ unsigned int sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -298,29 +299,29 @@ DECLS(sse2); \ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -335,40 +336,40 @@ DECLS(sse2); \ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int 
src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int start_row; \ uint32_t sse; \ int se = 0; \ int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \ NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \ &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ @@ -404,8 +405,8 @@ FNS(sse2); #define DECL(w, opt) \ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \ + ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \ void *unused); #define DECLS(opt1) \ DECL(16, opt1) \ @@ -418,30 +419,30 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, 
x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -453,31 +454,31 @@ DECLS(sse2); \ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -492,7 +493,7 @@ DECLS(sse2); \ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ int64_t var; \ @@ -500,34 +501,34 @@ DECLS(sse2); int se = 0; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? 
h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \ w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index f6e56b6f9e..4b02da9666 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -100,49 +100,44 @@ void idct4_sse2(__m128i *const in) { } void iadst4_sse2(__m128i *const in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; + const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9); + const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9); + const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9); + const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_12_n3 = + pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9); + __m128i u[4], v[5]; - transpose_16bit_4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); + // 00 01 20 21 02 03 22 23 + // 10 11 30 31 12 13 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]); - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); + // 00 01 10 11 20 21 30 31 + // 02 03 12 13 22 23 32 33 + in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1); - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = 
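/*
 * Why the 12-bit wrappers above iterate in 16-row chunks: a 12-bit squared
 * difference can reach 4095^2 (~16.8M), so a whole 64x64 block would
 * overflow a 32-bit accumulator.  Each SIMD call therefore covers at most a
 * 16x16 tile (256 * 4095^2 < 2^32) and the wrapper folds the 32-bit partial
 * sums into a 64-bit total.  A minimal scalar model of that accumulation
 * pattern (helper name hypothetical):
 */
#include <stdint.h>

static uint64_t highbd_12_sse_model(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h) {
  uint64_t long_sse = 0;
  int row, col;
  for (row = 0; row < h; row += 16) {
    for (col = 0; col < w; col += 16) {
      const int height = h - row < 16 ? h - row : 16;
      const int width = w - col < 16 ? w - col : 16;
      uint32_t sse2 = 0; /* per-tile partial sum, safe in 32 bits */
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c) {
          const int d = src[(row + r) * src_stride + col + c] -
                        ref[(row + r) * ref_stride + col + c];
          sse2 += (uint32_t)(d * d);
        }
      }
      long_sse += sse2; /* widen to 64 bits, as long_sse does above */
    }
  }
  return long_sse;
}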
_mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3); // s_1 * x0 + s_3 * x1 + v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2); // s_4 * x2 + s_2 * x3 + v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3); // s_2 * x0 + s_3 * x1 + v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4); // s_1 * x2 + s_4 * x3 + v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3); // (s_1 + s_2) * x0 - s_3 * x1 + in[0] = _mm_sub_epi16(in[0], in[1]); // x0 - x2 + in[1] = _mm_srli_epi32(in[1], 16); + in[0] = _mm_add_epi16(in[0], in[1]); + in[0] = _mm_slli_epi32(in[0], 16); // x0 - x2 + x3 u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); + u[1] = _mm_sub_epi32(v[2], v[3]); + u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[3] = _mm_add_epi32(u[3], v[4]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); in[0] = _mm_packs_epi32(u[0], u[1]); in[1] = _mm_packs_epi32(u[2], u[3]); @@ -170,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { - idct8_sse2(in); + vpx_idct8_sse2(in); } write_buffer_8x8(in, dest, stride); @@ -226,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *const in) { +void vpx_idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() transpose_16bit_8x8(in, in); @@ -248,191 +243,149 @@ void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; + const __m128i kZero = _mm_set1_epi16(0); + __m128i s[8], u[16], v[8], w[16]; // transpose transpose_16bit_8x8(in, in); - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - // column transformation // stage 1 // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); + s[0] = _mm_unpacklo_epi16(in[7], in[0]); + 
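/*
 * The rewritten transforms all build on one idiom: pair_set_epi16(a, b)
 * replicates the 16-bit pair (a, b) across a register, the inputs are
 * interleaved with _mm_unpacklo/_mm_unpackhi_epi16, and _mm_madd_epi16 then
 * yields a*x[i] + b*y[i] in each 32-bit lane without intermediate overflow.
 * A minimal sketch of the idiom (helper name hypothetical; it models the
 * low four pairs only):
 */
#include <emmintrin.h>
#include <stdint.h>

static __m128i madd_pair_lo(__m128i x, __m128i y, int16_t a, int16_t b) {
  /* equivalent to pair_set_epi16(a, b): 16-bit lanes alternate a, b, ... */
  const __m128i k =
      _mm_set1_epi32((int)(((uint32_t)(uint16_t)b << 16) | (uint16_t)a));
  const __m128i xy = _mm_unpacklo_epi16(x, y); /* x0,y0,x1,y1,... */
  return _mm_madd_epi16(xy, k); /* 32-bit lanes: a*x[i] + b*y[i] */
}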
s[1] = _mm_unpackhi_epi16(in[7], in[0]); + s[2] = _mm_unpacklo_epi16(in[5], in[2]); + s[3] = _mm_unpackhi_epi16(in[5], in[2]); + s[4] = _mm_unpacklo_epi16(in[3], in[4]); + s[5] = _mm_unpackhi_epi16(in[3], in[4]); + s[6] = _mm_unpacklo_epi16(in[1], in[6]); + s[7] = _mm_unpackhi_epi16(in[1], in[6]); - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30); + u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30); + u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02); + u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02); + u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22); + u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22); + u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10); + u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10); + u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14); + u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14); + u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18); + u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18); + u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06); + u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06); + u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26); + u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26); // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); + w[0] = _mm_add_epi32(u[0], u[8]); + w[1] = _mm_add_epi32(u[1], u[9]); + w[2] = _mm_add_epi32(u[2], u[10]); + w[3] = _mm_add_epi32(u[3], u[11]); + w[4] = _mm_add_epi32(u[4], u[12]); + w[5] = _mm_add_epi32(u[5], u[13]); + w[6] = _mm_add_epi32(u[6], u[14]); + w[7] = _mm_add_epi32(u[7], u[15]); + w[8] = _mm_sub_epi32(u[0], u[8]); + w[9] = _mm_sub_epi32(u[1], u[9]); + w[10] = _mm_sub_epi32(u[2], u[10]); + w[11] = _mm_sub_epi32(u[3], u[11]); + w[12] = _mm_sub_epi32(u[4], u[12]); + w[13] = _mm_sub_epi32(u[5], u[13]); + w[14] = _mm_sub_epi32(u[6], u[14]); + w[15] = _mm_sub_epi32(u[7], u[15]); // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = 
_mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); + u[8] = dct_const_round_shift_sse2(w[8]); + u[9] = dct_const_round_shift_sse2(w[9]); + u[10] = dct_const_round_shift_sse2(w[10]); + u[11] = dct_const_round_shift_sse2(w[11]); + u[12] = dct_const_round_shift_sse2(w[12]); + u[13] = dct_const_round_shift_sse2(w[13]); + u[14] = dct_const_round_shift_sse2(w[14]); + u[15] = dct_const_round_shift_sse2(w[15]); // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + in[2] = _mm_packs_epi32(u[4], u[5]); + in[3] = _mm_packs_epi32(u[6], u[7]); + in[4] = _mm_packs_epi32(u[8], u[9]); + in[5] = _mm_packs_epi32(u[10], u[11]); + in[6] = _mm_packs_epi32(u[12], u[13]); + in[7] = _mm_packs_epi32(u[14], u[15]); // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); + s[0] = _mm_add_epi16(in[0], in[2]); + s[1] = _mm_add_epi16(in[1], in[3]); + s[2] = _mm_sub_epi16(in[0], in[2]); + s[3] = _mm_sub_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[4], in[5]); + u[1] = _mm_unpackhi_epi16(in[4], in[5]); + u[2] = _mm_unpacklo_epi16(in[6], in[7]); + u[3] = _mm_unpackhi_epi16(in[6], in[7]); - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + 
v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); + w[0] = _mm_add_epi32(v[0], v[4]); + w[1] = _mm_add_epi32(v[1], v[5]); + w[2] = _mm_add_epi32(v[2], v[6]); + w[3] = _mm_add_epi32(v[3], v[7]); + w[4] = _mm_sub_epi32(v[0], v[4]); + w[5] = _mm_sub_epi32(v[1], v[5]); + w[6] = _mm_sub_epi32(v[2], v[6]); + w[7] = _mm_sub_epi32(v[3], v[7]); - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); // back to 16-bit integers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); + s[4] = _mm_packs_epi32(u[0], u[1]); + s[5] = _mm_packs_epi32(u[2], u[3]); + s[6] = _mm_packs_epi32(u[4], u[5]); + s[7] = _mm_packs_epi32(u[6], u[7]); // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); - s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16); - s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16); - s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16); - s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16); + s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); + s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16); - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[4]); + in[2] = s[6]; + in[3] = _mm_sub_epi16(kZero, s[2]); + in[4] = s[3]; + in[5] = _mm_sub_epi16(kZero, s[7]); + in[6] = s[5]; + in[7] = _mm_sub_epi16(kZero, s[1]); } static INLINE void idct16_load8x8(const tran_low_t *const input, @@ -561,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, } } -static void iadst16_8col(__m128i *const in) { +void vpx_iadst16_8col_sse2(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i
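/*
 * dct_const_round_shift_sse2, used throughout the rewritten transforms,
 * factors out the add-rounding-constant / arithmetic-shift pair that the
 * deleted lines spelled out.  A sketch consistent with the removed code
 * (constants as in vpx_dsp/txfm_common.h):
 */
#include <emmintrin.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

static __m128i dct_const_round_shift_sse2_sketch(const __m128i in) {
  const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
  return _mm_srai_epi32(t, DCT_CONST_BITS); /* round, then shift right 14 */
}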
s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -593,7 +546,6 @@ static void iadst16_8col(__m128i *const in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kZero = _mm_set1_epi16(0); u[0] = _mm_unpacklo_epi16(in[15], in[0]); @@ -679,71 +631,38 @@ static void iadst16_8col(__m128i *const in) { u[30] = _mm_sub_epi32(v[14], v[30]); u[31] = _mm_sub_epi32(v[15], v[31]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = 
_mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); + u[16] = dct_const_round_shift_sse2(u[16]); + u[17] = dct_const_round_shift_sse2(u[17]); + u[18] = dct_const_round_shift_sse2(u[18]); + u[19] = dct_const_round_shift_sse2(u[19]); + u[20] = dct_const_round_shift_sse2(u[20]); + u[21] = dct_const_round_shift_sse2(u[21]); + u[22] = dct_const_round_shift_sse2(u[22]); + u[23] = dct_const_round_shift_sse2(u[23]); + u[24] = dct_const_round_shift_sse2(u[24]); + u[25] = dct_const_round_shift_sse2(u[25]); + u[26] = dct_const_round_shift_sse2(u[26]); + u[27] = dct_const_round_shift_sse2(u[27]); + u[28] = dct_const_round_shift_sse2(u[28]); + u[29] = dct_const_round_shift_sse2(u[29]); + u[30] = dct_const_round_shift_sse2(u[30]); + u[31] = dct_const_round_shift_sse2(u[31]); s[0] = _mm_packs_epi32(u[0], u[1]); s[1] = _mm_packs_epi32(u[2], u[3]); @@ -806,39 +725,22 @@ static void iadst16_8col(__m128i *const in) { u[14] = _mm_sub_epi32(v[6], v[14]); u[15] = _mm_sub_epi32(v[7], v[15]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = 
_mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); x[0] = _mm_add_epi16(s[0], s[4]); x[1] = _mm_add_epi16(s[1], s[5]); @@ -901,39 +803,22 @@ static void iadst16_8col(__m128i *const in) { u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + v[0] = dct_const_round_shift_sse2(u[0]); + v[1] = dct_const_round_shift_sse2(u[1]); + v[2] = dct_const_round_shift_sse2(u[2]); + v[3] = dct_const_round_shift_sse2(u[3]); + v[4] = dct_const_round_shift_sse2(u[4]); + v[5] = dct_const_round_shift_sse2(u[5]); + v[6] = dct_const_round_shift_sse2(u[6]); + v[7] = dct_const_round_shift_sse2(u[7]); + v[8] = dct_const_round_shift_sse2(u[8]); + v[9] = dct_const_round_shift_sse2(u[9]); + v[10] = dct_const_round_shift_sse2(u[10]); + v[11] = dct_const_round_shift_sse2(u[11]); + v[12] = dct_const_round_shift_sse2(u[12]); + v[13] = dct_const_round_shift_sse2(u[13]); + v[14] = dct_const_round_shift_sse2(u[14]); + v[15] = dct_const_round_shift_sse2(u[15]); s[0] = _mm_add_epi16(x[0], x[2]); s[1] = _mm_add_epi16(x[1], x[3]); @@ -989,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) { void iadst16_sse2(__m128i *const 
in0, __m128i *const in1) { transpose_16bit_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); } // Group the coefficient calculation into smaller functions to prevent stack diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index 5cd5098f14..b4bbd186d2 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_INV_TXFM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -697,13 +697,14 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( } void idct4_sse2(__m128i *const in); -void idct8_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); void idct16_sse2(__m128i *const in0, __m128i *const in1); void iadst4_sse2(__m128i *const in); void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); void iadst16_sse2(__m128i *const in0, __m128i *const in1); void idct32_1024_8x32(const __m128i *const in, __m128i *const out); void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); -#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ +#endif // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h index e785c8eda1..e9f0f69033 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_ -#define VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ #include <tmmintrin.h> @@ -107,4 +107,4 @@ static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); -#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c index 6652a62dcf..be391992af 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -13,38 +13,38 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
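/*
 * Beyond the cosmetic changes in this hunk (p -> pitch, the parameters lose
 * their leading underscores while the broadcast copies gain a _v suffix, and
 * (int8_t) casts make the narrowing of constants such as 0x80 to
 * _mm_set1_epi8's signed argument explicit), the loads just below pack two
 * rows per register: _mm_loadl_epi64 puts row pN in the low 8 bytes and
 * _mm_loadh_pi adds row qN in the high 8 bytes, so one filter pass handles
 * both sides of the edge.  A sketch of that idiom (helper name
 * hypothetical):
 */
#include <emmintrin.h>

static __m128i load_qNpN(const unsigned char *s, int pitch, int n) {
  /* row pN sits n + 1 rows above the edge, row qN sits n rows below it */
  __m128i v = _mm_loadl_epi64((const __m128i *)(s - (n + 1) * pitch));
  v = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(v), (const __m64 *)(s + n * pitch)));
  return v; /* low half = pN, high half = qN */
}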
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { @@ -52,19 +52,19 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((int8_t)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -84,7 +84,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); @@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 
7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), @@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -367,10 +367,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char 
*_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + p256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch))); + p256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch))); + p256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch))); + p256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch))); + p256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch))); + q256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch))); + q256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch))); + q256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch))); + q256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch))); + q256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch))); p4 = _mm256_castsi256_si128(p256_4); p3 = _mm256_castsi256_si128(p256_3); @@ -423,7 +423,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = 
_mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); @@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -458,8 +458,8 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); @@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat = _mm_and_si128(flat, mask); p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch))); q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch))); p5 = _mm256_castsi256_si128(p256_5); q5 = _mm256_castsi256_si128(q256_5); flat2 = _mm_max_epu8( @@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch))); q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch))); p6 = _mm256_castsi256_si128(p256_6); q6 = _mm256_castsi256_si128(q256_6); work = _mm_max_epu8( @@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch))); q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch))); p7 = _mm256_castsi256_si128(p256_7); q7 = _mm256_castsi256_si128(q256_7); work = _mm_max_epu8( @@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, p6 = _mm_andnot_si128(flat2, p6); flat2_p6 = _mm_and_si128(flat2, flat2_p6); p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); p5 = _mm_andnot_si128(flat2, p5); flat2_p5 = _mm_and_si128(flat2, flat2_p5); p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); p4 = _mm_andnot_si128(flat2, p4); flat2_p4 = _mm_and_si128(flat2, flat2_p4); p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); p3 = _mm_andnot_si128(flat2, p3); flat2_p3 = 
_mm_and_si128(flat2, flat2_p3); p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); p2 = _mm_andnot_si128(flat2, p2); flat2_p2 = _mm_and_si128(flat2, flat2_p2); p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); p1 = _mm_andnot_si128(flat2, p1); flat2_p1 = _mm_and_si128(flat2, flat2_p1); p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); p0 = _mm_andnot_si128(flat2, p0); flat2_p0 = _mm_and_si128(flat2, flat2_p0); p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); q0 = _mm_andnot_si128(flat2, q0); flat2_q0 = _mm_and_si128(flat2, flat2_q0); q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0); q1 = _mm_andnot_si128(flat2, q1); flat2_q1 = _mm_and_si128(flat2, flat2_q1); q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); q2 = _mm_andnot_si128(flat2, q2); flat2_q2 = _mm_and_si128(flat2, flat2_q2); q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); q3 = _mm_andnot_si128(flat2, q3); flat2_q3 = _mm_and_si128(flat2, flat2_q3); q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); q4 = _mm_andnot_si128(flat2, q4); flat2_q4 = _mm_and_si128(flat2, flat2_q4); q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); q5 = _mm_andnot_si128(flat2, q5); flat2_q5 = _mm_and_si128(flat2, flat2_q5); q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); q6 = _mm_andnot_si128(flat2, q6); flat2_q6 = _mm_and_si128(flat2, flat2_q6); q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } } diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c index 28e6fd65f9..f90522cd7d 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" +#include "vpx_dsp/x86/mem_sse2.h" static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); @@ -30,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ hev = \ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_cmpgt_epi16(hev, thresh_v); \ hev = _mm_packs_epi16(hev, hev); \ \ /* const int8_t mask = filter_mask(*limit, *blimit, */ \ @@ -51,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { flat = _mm_max_epu8(work, flat); \ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_subs_epu8(mask, limit_v); \ mask = _mm_cmpeq_epi8(mask, zero); \ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ } while (0) @@ 
-60,7 +61,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { do { \ const __m128i t3t4 = \ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \ __m128i filter, filter2filter1, work; \ \ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ @@ -103,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ } while (0) -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 4 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 0 * pitch))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); @@ -132,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, FILTER_HEV_MASK; FILTER4; - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 + _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1 } -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const 
__m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i x0, x1, x2, x3; __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4))); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4))); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4))); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4))); // Transpose 8x8 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 @@ -212,69 +211,69 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } -void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = 
_mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((int8_t)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -284,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -292,7 +291,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); @@ -342,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i 
*)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); @@ -520,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } 
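Three mechanical changes repeat throughout these loop-filter hunks: the underscore-prefixed parameters (p, _blimit, _limit, _thresh) become pitch, blimit, limit and thresh, with the vector locals renamed to blimit_v / limit_v / thresh_v so they no longer shadow the parameters; constants such as 0x80, 0xe0 and 0xfe gain an explicit (int8_t) cast, since those values do not fit in a signed char and the implicit narrowing in _mm_set1_epi8() draws implicit-conversion warnings on some compilers; and the type-punned *(int *) stores are replaced by storeu_uint32(), a memcpy-based helper added to mem_sse2.h later in this patch that avoids unaligned, strict-aliasing-violating writes while still compiling down to a single mov. For orientation, the mask these kernels build out of saturating subtracts corresponds to the scalar reference filter_mask() in vpx_dsp/loopfilter.c, which reads essentially as follows (paraphrased, comments added):

    static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                     uint8_t p2, uint8_t p1, uint8_t p0,
                                     uint8_t q0, uint8_t q1, uint8_t q2,
                                     uint8_t q3) {
      int8_t mask = 0;
      /* Any large step between neighbouring pixels disables filtering. */
      mask |= (abs(p3 - p2) > limit) * -1;
      mask |= (abs(p2 - p1) > limit) * -1;
      mask |= (abs(p1 - p0) > limit) * -1;
      mask |= (abs(q1 - q0) > limit) * -1;
      mask |= (abs(q2 - q1) > limit) * -1;
      mask |= (abs(q3 - q2) > limit) * -1;
      /* The step across the edge itself must stay below blimit. */
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      return ~mask; /* all-ones when the edge should be filtered */
    }

The SSE2 versions compute the same predicate for 8 or 16 pixels at once, using paired _mm_subs_epu8 for abs_diff() and _mm_cmpeq_epi8 against zero in place of the comparisons.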
@@ -591,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; @@ -609,27 +608,27 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, __m128i max_abs_p1p0q1q0; - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); @@ -638,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); @@ 
-648,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, mask = _mm_max_epu8(work, mask); work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -678,8 +677,8 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); @@ -694,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); @@ -851,82 +850,82 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - 
_mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ } } -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -934,28 +933,28 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 2 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + 
q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 0 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); p0q0 = _mm_shuffle_epi32(q0p0, 78); { // filter_mask and hev_mask const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); @@ -964,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -979,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); // flat_mask4 @@ -997,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, unsigned char *src = s; { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1047,16 +1054,16 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i ps1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - 
_mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1102,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_loadl_epi64((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1120,27 +1127,25 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_loadl_epi64((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0); + _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0); + _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1); + _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { +void vpx_lpf_horizontal_8_dual_sse2( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -1149,33 +1154,33 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = 
_mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -1227,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, do { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1279,20 +1292,20 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const 
__m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1344,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1362,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { +void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit0, + const unsigned char *limit0, + const unsigned char *thresh0, + const unsigned char *blimit1, + const unsigned char *limit1, + const unsigned char *thresh1) { const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); const __m128i zero = _mm_set1_epi16(0); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = 
_mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); // filter_mask and hev_mask { @@ -1412,7 +1425,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -1448,20 +1461,20 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1506,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } } @@ -1626,16 +1639,12 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0 * out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1 * out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 + mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2 * out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3 * out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 + mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 x4 = _mm_unpackhi_epi16(x0, x1); @@ -1643,21 +1652,17 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 24 34 44 54 64 
74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4 * out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5 * out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 + mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 6 * out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7 * out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 + mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1666,7 +1671,7 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, @@ -1674,13 +1679,13 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1692,7 +1697,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, src[0] = s - 4; dst[0] = t_dst; - transpose(src, p, dst, 8, 1); + transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); @@ -1701,10 +1706,10 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, dst[0] = s - 4; // Transpose back - transpose(src, 8, dst, p, 1); + transpose(src, 8, dst, pitch, 1); } -void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1713,7 +1718,7 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, @@ -1722,13 +1727,13 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1742,7 +1747,7 @@ void 
vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - transpose(src, p, dst, 8, 2); + transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); @@ -1753,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = s; // Transpose back - transpose(src, 8, dst, p, 2); + transpose(src, 8, dst, pitch, 2); } -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { DECLARE_ALIGNED(16, unsigned char, t_dst[256]); // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); } diff --git a/libs/libvpx/vpx_dsp/x86/mem_sse2.h b/libs/libvpx/vpx_dsp/x86/mem_sse2.h index 2ce738fb77..258ab38e60 100644 --- a/libs/libvpx/vpx_dsp/x86/mem_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/mem_sse2.h @@ -8,13 +8,43 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_MEM_SSE2_H_ -#define VPX_DSP_X86_MEM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_VPX_DSP_X86_MEM_SSE2_H_ #include <emmintrin.h> // SSE2 +#include <string.h> #include "./vpx_config.h" +static INLINE void storeu_uint32(void *dst, uint32_t v) { + memcpy(dst, &v, sizeof(v)); +} + +static INLINE uint32_t loadu_uint32(const void *src) { + uint32_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE __m128i load_unaligned_u32(const void *a) { + uint32_t val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static INLINE void store_unaligned_u32(void *const a, const __m128i v) { + const uint32_t val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); +} + +#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) +#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) + +static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); @@ -121,4 +151,4 @@ static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); } -#endif // VPX_DSP_X86_MEM_SSE2_H_ +#endif // VPX_VPX_DSP_X86_MEM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/post_proc_sse2.c b/libs/libvpx/vpx_dsp/x86/post_proc_sse2.c new file mode 100644 index 0000000000..d1029afc4f --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/post_proc_sse2.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include <emmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +extern const int16_t vpx_rv[]; + +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, + int cols, int flimit) { + int col; + const __m128i zero = _mm_setzero_si128(); + const __m128i f = _mm_set1_epi32(flimit); + DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + int row, i; + __m128i s = _mm_loadl_epi64((__m128i *)dst); + __m128i sum, sumsq_0, sumsq_1; + __m128i tmp_0, tmp_1; + __m128i below_context; + + s = _mm_unpacklo_epi8(s, zero); + + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)above_context + i, s); + } + + // sum *= 9 + sum = _mm_slli_epi16(s, 3); + sum = _mm_add_epi16(s, sum); + + // sum^2 * 9 == (sum * 9) * sum + tmp_0 = _mm_mullo_epi16(sum, s); + tmp_1 = _mm_mulhi_epi16(sum, s); + + sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1); + sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1); + + // Prime sum/sumsq + for (i = 1; i <= 6; ++i) { + __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch)); + a = _mm_unpacklo_epi8(a, zero); + sum = _mm_add_epi16(sum, a); + a = _mm_mullo_epi16(a, a); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero)); + } + + for (row = 0; row < rows + 8; row++) { + const __m128i above = + _mm_load_si128((__m128i *)above_context + (row & 7)); + __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch)); + __m128i above_sq, below_sq; + __m128i mask_0, mask_1; + __m128i multmp_0, multmp_1; + __m128i rv; + __m128i out; + + this_row = _mm_unpacklo_epi8(this_row, zero); + + if (row + 7 < rows) { + // Instead of copying the end context we just stop loading when we get + // to the last one. + below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch)); + below_context = _mm_unpacklo_epi8(below_context, zero); + } + + sum = _mm_sub_epi16(sum, above); + sum = _mm_add_epi16(sum, below_context); + + // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero + // extend. Unfortunately we can't do below_sq - above_sq in 16 bits + // because x86 does not have unpack with sign extension.
+ above_sq = _mm_mullo_epi16(above, above); + sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero)); + sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero)); + + below_sq = _mm_mullo_epi16(below_context, below_context); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero)); + + // sumsq * 16 - sumsq == sumsq * 15 + mask_0 = _mm_slli_epi32(sumsq_0, 4); + mask_0 = _mm_sub_epi32(mask_0, sumsq_0); + mask_1 = _mm_slli_epi32(sumsq_1, 4); + mask_1 = _mm_sub_epi32(mask_1, sumsq_1); + + multmp_0 = _mm_mullo_epi16(sum, sum); + multmp_1 = _mm_mulhi_epi16(sum, sum); + + mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1)); + mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1)); + + // mask - f gives a negative value when mask < f + mask_0 = _mm_sub_epi32(mask_0, f); + mask_1 = _mm_sub_epi32(mask_1, f); + + // Shift the sign bit down to create a mask + mask_0 = _mm_srai_epi32(mask_0, 31); + mask_1 = _mm_srai_epi32(mask_1, 31); + + mask_0 = _mm_packs_epi32(mask_0, mask_1); + + rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127))); + + mask_1 = _mm_add_epi16(rv, sum); + mask_1 = _mm_add_epi16(mask_1, this_row); + mask_1 = _mm_srai_epi16(mask_1, 4); + + mask_1 = _mm_and_si128(mask_0, mask_1); + mask_0 = _mm_andnot_si128(mask_0, this_row); + out = _mm_or_si128(mask_1, mask_0); + + _mm_storel_epi64((__m128i *)(dst + row * pitch), + _mm_packus_epi16(out, zero)); + + _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row); + } + + dst += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx.c b/libs/libvpx/vpx_dsp/x86/quantize_avx.c index 6f4489004d..0a91d36eaf 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_avx.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_avx.c @@ -17,15 +17,16 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/quantize_x86.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -37,7 +38,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -90,15 +91,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
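The arithmetic in the new vpx_mbpost_proc_down_sse2() above is easier to follow against a scalar model of a single column: a 15-row sliding window maintains a running sum and sum of squares, and a pixel is replaced by a dithered window mean only where the window variance stays below flimit. A rough sketch, with the caveats that the real code defers each write by eight rows (the above_context ring buffer) so later window reads still see the original pixels, extends the top and bottom borders by replication (which is where sum *= 9 comes from), and indexes the vpx_rv dither table slightly differently:

    #include <stdint.h>

    /* Illustrative scalar model of one column; not the libvpx C reference. */
    static void mbpost_down_column(unsigned char *s, int pitch, int rows,
                                   int flimit, const int16_t *rv) {
      int i, row, sum = 0, sumsq = 0;
      for (i = -8; i <= 6; ++i) { /* prime the 15-tap window */
        sum += s[i * pitch];
        sumsq += s[i * pitch] * s[i * pitch];
      }
      for (row = 0; row < rows; ++row) {
        /* Slide: the window now spans rows -7..+7 around the current pixel. */
        sum += s[7 * pitch] - s[-8 * pitch];
        sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
        /* 15 * variance == sumsq * 15 - sum * sum; the SSE2 code computes
         * sumsq * 16 - sumsq and then subtracts sum * sum. */
        if (sumsq * 15 - sum * sum < flimit) {
          /* 15-tap sum + the pixel itself + dither, divided by 16. */
          s[0] = (unsigned char)((rv[row & 127] + sum + s[0]) >> 4);
        }
        s += pitch;
      }
    }

This also motivates the two asserts in the intrinsic version: eight columns are processed per outer iteration and the priming loop reads rows 1..6 unconditionally, so cols must be a multiple of 8 and rows at least 8.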
@@ -135,26 +133,25 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); @@ -167,7 +164,7 @@ void vpx_quantize_b_32x32_avx( __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)n_coeffs; (void)skip_block; assert(!skip_block); @@ -233,28 +230,12 @@ void vpx_quantize_b_32x32_avx( store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
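In the 32x32 paths, the un-sign / multiply / halve / re-sign sequence deleted above is not gone; it has been folded into calculate_dqcoeff_and_store_32x32() from the new quantize_ssse3.h. The invariant it must preserve is the C dequant rule dqcoeff = qcoeff * dequant / 2, where C division truncates toward zero; shifting the signed product right by one would instead round toward minus infinity for negative coefficients, hence the detour through the magnitude. A scalar statement of the same computation (helper name illustrative, not from the patch):

    #include <stdint.h>
    #include <stdlib.h>

    static int16_t dqcoeff_32x32(int16_t qcoeff, int16_t dequant) {
      /* Halve the magnitude, not the signed product, so the result matches
       * C's truncating division when qcoeff is negative. */
      const int mag = (abs(qcoeff) * dequant) >> 1;
      return (int16_t)(qcoeff < 0 ? -mag : mag);
    }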
@@ -291,23 +272,13 @@ void vpx_quantize_b_32x32_avx( store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c index c020b398c3..e38a4059ab 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -15,15 +15,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/quantize_x86.h" +#include "vpx_dsp/x86/quantize_sse2.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index = 16; @@ -33,7 +33,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -74,15 +74,11 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { @@ -109,14 +105,11 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/libs/libvpx/vpx_dsp/x86/quantize_x86.h b/libs/libvpx/vpx_dsp/x86/quantize_sse2.h similarity index 70% rename from libs/libvpx/vpx_dsp/x86/quantize_x86.h rename to libs/libvpx/vpx_dsp/x86/quantize_sse2.h index 34928fbb56..afe2f924b3 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_x86.h +++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.h @@ -8,11 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ + #include <emmintrin.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -42,21 +44,35 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } -static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { - return _mm_mullo_epi16(qcoeff, dequant); +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + tran_low_t *dqcoeff) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i low = _mm_mullo_epi16(qcoeff, dequant); + const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); + + const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing -// to zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to +// zbin to add 1 to the index in 'scan'.
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, const __m128i zbin_mask0, const __m128i zbin_mask1, - const int16_t *scan_ptr, const int index, + const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); - __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); - __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; // Add one to convert from indices to counts scan0 = _mm_sub_epi16(scan0, zbin_mask0); @@ -76,3 +92,5 @@ static INLINE int16_t accumulate_eob(__m128i eob) { eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c index 3f528e1a97..fc1d91959f 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -14,7 +14,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/quantize_x86.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -22,7 +23,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index = 16; @@ -32,7 +33,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -67,15 +68,11 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. 
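/* Editorial note: the SSSE3 flavor exists mainly because pabsw/psignw
 * (_mm_abs_epi16 / _mm_sign_epi16) make the absolute-value and re-sign steps
 * single instructions, where plain SSE2 needs a compare/xor/subtract
 * sequence; the 16x16 kernel above is otherwise the same as the SSE2 one. */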
while (index < n_coeffs) { @@ -100,14 +97,11 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -116,12 +110,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); int index; @@ -133,7 +129,7 @@ void vpx_quantize_b_32x32_ssse3( __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)n_coeffs; (void)skip_block; assert(!skip_block); @@ -206,28 +202,12 @@ void vpx_quantize_b_32x32_ssse3( store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
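/* Editorial note: for 32x32 blocks the C reference keeps an extra bit of
 * quantizer precision (zbin and round are halved, the final shift is 15
 * rather than 16), so dequantization must halve the product:
 *   dqcoeff = sign(qcoeff) * ((abs(qcoeff) * dequant) >> 1);
 * The new calculate_dqcoeff_and_store_32x32() below forms the full 32-bit
 * product (mullo + mulhi) before shifting, instead of shifting a possibly
 * truncated 16-bit product as the removed code did. */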
@@ -268,23 +248,13 @@ void vpx_quantize_b_32x32_ssse3( store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.h b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.h new file mode 100644 index 0000000000..e8d2a05771 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 2.
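// (editorial) the logical >> 1 below acts on the 32-bit magnitudes; the sign
// is restored afterwards by _mm_sign_epi32, matching C's round-toward-zero
// division by 2.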
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + _mm_store_si128((__m128i *)(dqcoeff), + _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1)); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c index 962b8fb11a..b18fecf709 100644 --- a/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -11,154 +11,120 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; +static INLINE void calc_final(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { + int i; + const uint8_t *refs[4]; + __m256i sums[4]; + + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 32; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + __m256i r[4]; - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // sum of the absolute differences between every ref[] to src + r[0] = 
_mm256_sad_epu8(r[0], s); + r[1] = _mm256_sad_epu8(r[1], s); + r[2] = _mm256_sad_epu8(r[2], s); + r[3] = _mm256_sad_epu8(r[3], s); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r[0]); + sums[1] = _mm256_add_epi32(sums[1], r[1]); + sums[2] = _mm256_add_epi32(sums[2], r[2]); + sums[3] = _mm256_add_epi32(sums[3], r[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. - // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } + calc_final(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { + __m256i sums[4]; int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; + const uint8_t *refs[4]; + + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - 
ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + __m256i r_lo[4], r_hi[4]; + // load 64 bytes from src and all ref[] + const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); + const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32)); + r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32)); + r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32)); + r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32)); + r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32)); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + // sum of the absolute differences between every ref[] to src + r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo); + r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo); + r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo); + r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo); + r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi); + r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi); + r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi); + r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r_lo[0]); + sums[1] = _mm256_add_epi32(sums[1], r_lo[1]); + sums[2] = _mm256_add_epi32(sums[2], r_lo[2]); + sums[3] = _mm256_add_epi32(sums[3], r_lo[3]); + sums[0] = _mm256_add_epi32(sums[0], r_hi[0]); + sums[1] = _mm256_add_epi32(sums[1], r_hi[1]); + sums[2] = _mm256_add_epi32(sums[2], r_hi[2]); + sums[3] = _mm256_add_epi32(sums[3], r_hi[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } + calc_final(sums, sad_array); } diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c index 5f2ab6ea71..4c5d70464d 100644 --- a/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c +++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -11,8 +11,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t res[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; @@ -20,33 +20,33 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, int i; const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + ref0 = ref_array[0]; + ref1 = ref_array[1]; + ref2 = ref_array[2]; + ref3 = ref_array[3]; sum_ref0 = _mm512_set1_epi16(0); sum_ref1 = _mm512_set1_epi16(0); sum_ref2 = _mm512_set1_epi16(0); sum_ref3 = _mm512_set1_epi16(0); for (i = 0; i < 64; i++) { - // load src and all refs - src_reg = _mm512_loadu_si512((const __m512i *)src); + // load src and all ref[] + src_reg = _mm512_loadu_si512((const __m512i *)src_ptr); ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); - // sum of the absolute differences between every ref-i to src + // sum of the absolute differences between every ref[] to src ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); - // sum every ref-i + // sum every ref[] sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); - src += src_stride; + src_ptr += src_stride; ref0 += ref_stride; ref1 += ref_stride; ref2 += ref_stride; @@ -55,7 +55,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, { __m256i sum256; __m128i sum128; - // in sum_ref-i the result is saved in the first 4 bytes + // in sum_ref[] the result is saved in the first 4 bytes // the other 4 bytes are zeroed. 
// sum_ref1 and sum_ref3 are shifted left by 4 bytes sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); @@ -65,7 +65,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); - // merge every 64 bit from each sum_ref-i + // merge every 64 bit from each sum_ref[] sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); diff --git a/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm index cee4468c1f..5adb9b8c3d 100644 --- a/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm @@ -41,12 +41,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 paddw %5, %3 @@ -114,84 +114,65 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back + %define second_str second_stridemp %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %define block_height heightd - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, 
[GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, second_pred, second_stride, \ + height, sse %define block_height dword heightm - %define sec_str sec_stridemp - %endif + %define second_str second_stridemp %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -211,7 +192,7 @@ SECTION .text %if %1 < 16 sar block_height, 1 %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif %endif @@ -226,9 +207,9 @@ SECTION .text .x_zero_y_zero_loop: %if %1 == 16 movu m0, [srcq] - mova m1, [dstq] + mova m1, [refq] %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m3, m1, m5 punpcklbw m1, m5 %endif @@ -242,7 +223,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] %if %2 == 1 ; avg @@ -256,14 +237,14 @@ SECTION .text movx m2, [srcq+src_strideq] %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] %if %2 == 1 ; avg %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -284,10 +265,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_zero_loop @@ -302,11 +283,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -314,7 +295,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] @@ -325,22 +306,22 @@ SECTION .text movx m1, [srcq+src_strideq*2] punpckldq m2, m1 %endif - movx m1, [dstq] + movx m1, [refq] %if %1 > 4 movlhps m0, m2 %else ; 4xh punpckldq m0, m2 %endif - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m0, m2 punpcklbw m1, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m3, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m4, [secq] + movh m4, [second_predq] pavgb m0, m4 punpcklbw m3, m5 punpcklbw m0, m5 @@ -348,9 +329,9 @@ SECTION .text %endif %else ; !avg movx m4, [srcq+src_strideq*2] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -360,10 +341,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, 
[refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_half_loop @@ -371,8 +352,8 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -380,7 +361,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -397,7 +378,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -405,7 +386,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -437,7 +418,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -446,14 +427,14 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -473,7 +454,7 @@ SECTION .text pmullw m4, filter_y_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -485,11 +466,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -499,10 +480,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_other_loop @@ -523,11 +504,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -535,7 +516,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m4, [srcq+1] @@ -549,17 +530,17 @@ SECTION .text movx m2, [srcq+src_strideq+1] punpckldq m4, m2 %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] pavgb m0, m4 punpcklbw m3, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m1, m5 punpcklbw m0, m5 @@ -567,10 +548,10 @@ SECTION .text %endif %else ; !avg movx m2, [srcq+src_strideq] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m4 movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -580,10 +561,10 @@ SECTION .text SUM_SSE 
m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_zero_loop @@ -602,13 +583,13 @@ SECTION .text .x_half_y_half_loop: movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m3 punpckhbw m3, m1, m5 pavgb m0, m4 %if %2 == 1 ; avg punpcklbw m1, m5 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else @@ -620,7 +601,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -647,13 +628,13 @@ SECTION .text punpckldq m0, m2 pshuflw m4, m2, 0xe %endif - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -672,8 +653,8 @@ SECTION .text pavgb m4, m1 pavgb m0, m2 pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 @@ -683,10 +664,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_half_loop @@ -694,8 +675,8 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -703,7 +684,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -720,7 +701,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -732,7 +713,7 @@ SECTION .text .x_half_y_other_loop: movu m4, [srcq] movu m2, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m2 %if cpuflag(ssse3) punpckhbw m2, m0, m4 @@ -762,7 +743,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -771,7 +752,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -787,9 +768,9 @@ SECTION .text movx m3, [srcq+src_strideq+1] pavgb m2, m1 pavgb m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -809,7 +790,7 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] %endif psraw m0, 4 psraw m2, 4 @@ -820,11 +801,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -835,10 +816,10 @@ SECTION .text mova m0, m4 lea srcq, 
[srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_other_loop @@ -852,8 +833,8 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -861,7 +842,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -878,7 +859,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -886,7 +867,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -913,7 +894,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -922,16 +903,16 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) punpcklbw m0, m1 - movx m1, [dstq] + movx m1, [refq] punpcklbw m2, m4 pmaddubsw m0, filter_x_a pmaddubsw m2, filter_x_a @@ -951,7 +932,7 @@ SECTION .text pmullw m4, filter_x_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -963,11 +944,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -977,10 +958,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_zero_loop @@ -994,8 +975,8 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -1003,7 +984,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -1020,7 +1001,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1056,7 +1037,7 @@ SECTION .text movu m4, [srcq] movu m3, [srcq+1] %if cpuflag(ssse3) - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1082,7 +1063,7 @@ SECTION .text paddw m2, filter_rnd paddw m4, m3 paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] psraw m4, 4 psraw m2, 4 
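; editorial note: filter_rnd is pw_8 and the bilinear taps sum to 16, so a
; filtered pixel is (a * filter_a + b * filter_b + 8) >> 4; the psraw-by-4
; pairs here strip that fixed-point scaling before the rows are repacked
; (packuswb) and fed to SUM_SSE.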
punpckhbw m3, m1, m5 @@ -1096,7 +1077,7 @@ SECTION .text %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 @@ -1104,7 +1085,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1132,8 +1113,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] paddw m2, filter_rnd paddw m4, filter_rnd %else @@ -1148,9 +1129,9 @@ SECTION .text pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] paddw m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %endif psraw m2, 4 psraw m4, 4 @@ -1163,11 +1144,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1179,10 +1160,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_half_loop @@ -1192,8 +1173,8 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift @@ -1206,7 +1187,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -1234,7 +1215,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1273,7 +1254,7 @@ SECTION .text %if cpuflag(ssse3) movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1319,7 +1300,7 @@ SECTION .text pmullw m0, filter_y_a pmullw m3, filter_y_b paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] paddw m0, filter_rnd psraw m2, 4 paddw m0, m3 @@ -1330,7 +1311,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -1338,7 +1319,7 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1374,8 +1355,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] + movx m3, [refq+ref_strideq] + movx m1, [refq] paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 @@ -1414,9 +1395,9 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m0, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] psraw m0, 4 psraw m2, 4 punpcklbw m3, m5 @@ -1429,11 +1410,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, 
m5 movhlps m2, m0 @@ -1443,10 +1424,10 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_other_loop diff --git a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c index 026d0ca2f2..9eaf6ee1b8 100644 --- a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -10,120 +10,96 @@ #include <assert.h> #include <emmintrin.h> -#include <stdio.h> #include "./vpx_dsp_rtcd.h" - -static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = - _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - - return (uint64_t)_mm_cvtsi128_si32(v_sum_d); -} - -// TODO(jingning): Evaluate the performance impact here. -#ifdef __GNUC__ -// This prevents GCC/Clang from inlining this function into -// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack -// maintenance instructions in the common case of 4x4.
-__attribute__((noinline)) -#endif -static uint64_t -vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { - int r, c; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); - __m128i v_acc_q = _mm_setzero_si128(); - - for (r = 0; r < size; r += 8) { - __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < size; c += 8) { - const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); - - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - - src += 8 * stride; - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); - -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } -#endif -} +#include "vpx_dsp/x86/mem_sse2.h" uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { - // 4 elements per row only requires half an XMM register, so this - // must be a special case, but also note that over 75% of all calls - // are with size == 4, so it is also the common case. + // Over 75% of all calls are with size == 4. 
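/* Editorial sketch (scalar form, not part of the patch) of both branches
 * below:
 *   uint64_t ss = 0;
 *   for (r = 0; r < size; r++)
 *     for (c = 0; c < size; c++)
 *       ss += (int32_t)src[r * stride + c] * src[r * stride + c];
 * _mm_madd_epi16 squares eight int16 values and sums them pairwise into
 * 32-bit lanes; the generic path folds those into the 64-bit accumulator
 * v_acc_q after every 8 rows so the 32-bit lanes cannot overflow. */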
if (size == 4) { - return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); + __m128i s[2], sq[2], ss; + + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + s[0] = loadh_epi64(s[0], src + 1 * stride); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + s[1] = loadh_epi64(s[1], src + 3 * stride); + sq[0] = _mm_madd_epi16(s[0], s[0]); + sq[1] = _mm_madd_epi16(s[1], s[1]); + sq[0] = _mm_add_epi32(sq[0], sq[1]); + ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8)); + ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32)); + + return (uint64_t)_mm_cvtsi128_si32(ss); } else { // Generic case + int r = size; + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + assert(size % 8 == 0); - return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); + + do { + int c = 0; + __m128i v_acc_d = _mm_setzero_si128(); + + do { + const int16_t *const b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + c += 8; + } while (c < size); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + r -= 8; + } while (r); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } +#endif } } diff --git a/libs/libvpx/vpx_dsp/x86/transpose_sse2.h b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h index 8a0119ca7e..6e07871b18 100644 --- a/libs/libvpx/vpx_dsp/x86/transpose_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ -#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -364,4 +364,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in, out[7] = _mm_unpackhi_epi64(a6, a7); } -#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h index 0a9542c85b..de5ce43b00 100644 --- a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ -#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ #include <emmintrin.h> #include "vpx/vpx_integer.h" @@ -29,4 +29,4 @@ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) -#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#endif // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/variance_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_avx2.c index d15a89c746..9232acbfbb 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_avx2.c @@ -38,130 +38,140 @@ DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { }; /* clang-format on */ -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *sse, int *sum) { - unsigned int i, src_2strides, ref_2strides; - __m256i sum_reg = _mm256_setzero_si256(); - __m256i sse_reg = _mm256_setzero_si256(); - // process two 16 byte locations in a 256 bit register - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; ++i) { - // convert up values in 128 bit registers across lanes - const __m256i src0 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr))); - const __m256i src1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i const *)(src_ptr + source_stride))); - const __m256i ref0 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr))); - const __m256i ref1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride))); - const __m256i diff0 = _mm256_sub_epi16(src0, ref0); - const __m256i diff1 = _mm256_sub_epi16(src1, ref1); - const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); - const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); - // add to the running totals - sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); - sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - { - // extract the low lane and add it to the high lane - const __m128i sum_reg_128 = _mm_add_epi16( - _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); - const __m128i sse_reg_128 = _mm_add_epi32( - _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + // subtract adjacent
elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); - // sum upper and lower 64 bits together and convert up to 32 bit values - const __m128i sum_reg_64 = - _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); - const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} - // unpack sse and sum registers and add - const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); +static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse, + __m128i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse), + _mm256_extractf128_si256(vsse, 1)); - // perform the final summation and extract the results - const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); - *((int *)sse) = _mm_cvtsi128_si32(res); - *((int *)sum) = _mm_extract_epi32(res, 1); + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); +} + +static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse, + __m256i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const 
__m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; } } -static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *sse, int *sum) { - unsigned int i, src_2strides, ref_2strides; - const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); - __m256i sum_reg = _mm256_setzero_si256(); - __m256i sse_reg = _mm256_setzero_si256(); +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); - // process 64 elements in an iteration - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr)); - const __m256i src1 = - _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride)); - const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - const __m256i ref1 = - _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride)); - - // unpack into pairs of source and reference values - const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0); - const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0); - const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1); - const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1); - - // subtract adjacent elements using src*1 + ref*-1 - const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); - const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); - const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub); - const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub); - const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); - const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); - const __m256i madd2 = _mm256_madd_epi16(diff2, diff2); - const __m256i madd3 = _mm256_madd_epi16(diff3, diff3); - - // add to the running totals - sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); - sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3)); - sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); - sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; } +} - { - // extract the low lane and add it to the high lane - const __m128i sum_reg_128 = _mm_add_epi16( - _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); - const __m128i sse_reg_128 = _mm_add_epi32( - _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); - // sum 
upper and lower 64 bits together and convert up to 32 bit values - const __m128i sum_reg_64 = - _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); - const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); - - // unpack sse and sum registers and add - const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); - - // perform the final summation and extract the results - const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); - *((int *)sse) = _mm_cvtsi128_si32(res); - *((int *)sum) = _mm_extract_epi32(res, 1); + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; } } +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum); +} + #define FILTER_SRC(filter) \ /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ @@ -214,8 +224,9 @@ static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; int i; @@ -223,11 +234,11 @@ static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); @@ -241,9 +252,10 @@ static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, // (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. 
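An aside on the offset == 4 cases (illustrative only, not part of the patch): 4 is the exact half-pel position on the 8-step subpel grid, so the bilinear filter degenerates to a rounding average of two neighbouring pixels, which is why the helper below needs only _mm256_avg_epu8; sstep selects the neighbour, 1 for a horizontal half-pel and src_stride for a vertical one. A scalar sketch of that fast path, assuming 8-bit pixels:

#include <stdint.h>

/* Scalar model of the offset-4 fast path: _mm256_avg_epu8 computes the
 * rounding average (a + b + 1) >> 1 on 32 byte pairs at once. */
static uint8_t half_pel_avg_c(const uint8_t *src, int sstep) {
  return (uint8_t)((src[0] + src[sstep] + 1) >> 1);
}
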
static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, - int do_sec, int height, __m256i *sum_reg, - __m256i *sse_reg, int sstep) { + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int sstep) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; int i; @@ -253,11 +265,11 @@ static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); @@ -270,24 +282,27 @@ static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { - spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, src_stride); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, src_stride); } static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { - spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, 1); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, 1); } static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); @@ -304,11 +319,11 @@ static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, prev_src_avg = src_avg; if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); @@ -323,9 +338,10 @@ static INLINE void 
spv32_x4_y4(const uint8_t *src, int src_stride, // (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, - int do_sec, int height, __m256i *sum_reg, - __m256i *sse_reg, int offset, int sstep) { + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int offset, int sstep) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -341,10 +357,10 @@ static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, FILTER_SRC(filter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); - sec += sec_stride; + second_pred += second_stride; exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); } @@ -356,27 +372,27 @@ static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int y_offset) { - spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, y_offset, src_stride); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, y_offset, src_stride); } static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset) { - spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, x_offset, 1); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, x_offset, 1); } static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int y_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -398,12 +414,12 @@ static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, FILTER_SRC(filter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += 
second_stride; } CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; @@ -413,9 +429,9 @@ static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -446,11 +462,11 @@ static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, src_pack = _mm256_avg_epu8(src_pack, src_reg); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); @@ -464,9 +480,9 @@ static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset, int y_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset, int y_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i xfilter = _mm256_load_si256( @@ -501,12 +517,12 @@ static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, FILTER_SRC(yfilter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } prev_src_pack = src_pack; @@ -520,7 +536,7 @@ static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, + const uint8_t *second_pred, int second_stride, int do_sec, int height, unsigned int *sse) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i sum_reg = _mm256_setzero_si256(); @@ -530,44 +546,44 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 0 and y_offset = 4 } else if (y_offset == 4) { - spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, 
&sse_reg); // x_offset = 0 and y_offset = bilin interpolation } else { - spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, y_offset); + spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); } // x_offset = 4 and y_offset = 0 } else if (x_offset == 4) { if (y_offset == 0) { - spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 4 and y_offset = 4 } else if (y_offset == 4) { - spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 4 and y_offset = bilin interpolation } else { - spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, y_offset); + spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); } // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { - spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset); + spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); // x_offset = bilin interpolation and y_offset = 4 } else if (y_offset == 4) { - spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset); + spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); // x_offset = bilin interpolation and y_offset = bilin interpolation } else { - spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset, y_offset); + spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset); } } CALC_SUM_AND_SSE @@ -583,127 +599,177 @@ static unsigned int sub_pixel_variance32xh_avx2( static unsigned int sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { + const uint8_t *dst, int dst_stride, const uint8_t *second_pred, + int second_stride, int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, - sec, sec_stride, 1, height, sse); + second_pred, second_stride, 1, height, sse); } -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int w, int h, - unsigned int *sse, int *sum, get_var_avx2 var_fn, - int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int 
src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 7); } -unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - vpx_get16x16var_avx2, 16); + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; -} - -unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - get32x16var_avx2, 32); + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - get32x16var_avx2, 32); + __m256i vsse, vsum; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - get32x16var_avx2, 32); - return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); -} - -unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - get32x16var_avx2, 32); + __m256i vsse, vsum; + __m128i vsum_128; + 
variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + int i = 0; + + for (i = 0; i < 2; i++) { + __m256i vsum16; + variance64_avx2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); + } + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_sub_pixel_variance64x64_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { unsigned int sse1; const int se1 = sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1); unsigned int sse2; const int se2 = - sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset, + ref_ptr + 32, ref_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); } -unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { +unsigned int vpx_sub_pixel_variance32x32_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, 
int ref_stride, unsigned int *sse) { const int se = sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } unsigned int vpx_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { unsigned int sse1; - const int se1 = sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); + const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 64, 64, &sse1); unsigned int sse2; const int se2 = sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride, + second_pred + 32, 64, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; @@ -712,10 +778,12 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( } unsigned int vpx_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { // Process 32 elements in parallel. - const int se = sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); + const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/libs/libvpx/vpx_dsp/x86/variance_sse2.c b/libs/libvpx/vpx_dsp/x86/variance_sse2.c index 8d8bf183b2..37ef64ecaa 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_sse2.c @@ -8,312 +8,426 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <emmintrin.h> // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" - #include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" -typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return _mm_cvtsi128_si32(val); +} -unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { +unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); + const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; + src_ptr += 8; } - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); + return add32x4_sse2(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8( \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride)); + const __m128i p01 = _mm_unpacklo_epi32(p0, p1); + return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); +} -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); +static INLINE void variance_kernel_sse2(const __m128i src_ptr, + const __m128i ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsum = - _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); } -void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void
variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } - - // sum vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); } -void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = add32x4_sse2(vsum); +} + +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE int sum_final_sse2(const __m128i sum) { + const __m128i t = sum_to_32bit_sse2(sum); + return add32x4_sse2(t); +} + +static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { int i; - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); + assert(h <= 256); // May overflow for larger height. 
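The height limits asserted in these helpers all come from one budget; a sketch of the arithmetic, assuming the lane accounting implied by the kernels (not spelled out upstream): each 16-bit lane of the running sum accumulates at most 128 pixel diffs, each in [-255, 255].

#include <assert.h>
#include <limits.h>

/* 128 diffs of magnitude <= 255 cannot overflow a signed 16-bit lane:
 * 128 * 255 = 32640 <= SHRT_MAX (32767). variance4 feeds each lane one
 * diff per two-row iteration, so h <= 256 yields exactly 128 diffs; the
 * wider kernels add more diffs per row per lane, hence the smaller
 * limits (h <= 128 for variance8, h <= 64 for variance16, and so on). */
static void check_sum_lane_budget(void) { assert(128 * 255 <= SHRT_MAX); }
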
+ *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); + for (i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src_ptr, src_stride); + const __m128i r = load4x2_sse2(ref_ptr, ref_stride); - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - - src += src_stride; - ref += ref_stride; - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = - (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, int w, - int h, unsigned int *sse, int *sum, - getNxMvar_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } + variance_kernel_sse2(s, r, sse, sum); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } } -unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + int i; + + assert(h <= 128); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; i++) { + const __m128i s = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero); + const __m128i r = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero); + + variance_kernel_sse2(s, r, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr, + const uint8_t *const ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 64); // May overflow for larger height. 
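Every vpx_variance* wrapper that follows finishes with the same identity, variance = SSE - sum^2 / (w * h), where the division becomes a right shift because w * h is always a power of two here. A plain-C reference model useful for cross-checking the SIMD paths (a sketch only; variance_ref_c is an invented name, not upstream API):

#include <stdint.h>

static unsigned int variance_ref_c(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;    /* signed sum of pixel differences */
  uint64_t sse64 = 0; /* sum of squared differences */
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = src[j] - ref[j];
      sum += d;
      sse64 += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sse64;
  /* variance = SSE - sum^2 / N, with N = w * h a power of two. */
  return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}
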
+ *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 16); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum); + variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, sum); +} + +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 4); } -unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, - get4x4var_sse2, 4); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, - get4x4var_sse2, 4); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance8x8_sse2(const 
unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 6); } -unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, - vpx_get8x8var_sse2, 8); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, - vpx_get8x8var_sse2, 8); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get16x16var_sse2, 16); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; + int sum; + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; + int sum; + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + *sse = add32x4_sse2(vsse); + sum = sum_final_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); } -unsigned int 
vpx_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance32_sse2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get16x16var_sse2, 16); + int i = 0; + + for (i = 0; i < 4; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } -unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} - -unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} - -unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse8x16_sse2(const 
uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int vpx_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -324,36 +438,37 @@ DECLS(ssse3, ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \ + &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr 
+ 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #define FNS(opt1, opt2) \ @@ -378,12 +493,12 @@ FNS(ssse3, ssse3); #undef FN // The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int vpx_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \ + const uint8_t *second_pred, ptrdiff_t second_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -394,37 +509,38 @@ DECLS(ssse3, ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \ + second_pred, w, h, &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + 
src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #define FNS(opt1, opt2) \ diff --git a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 4f164afeb4..0000000000 --- a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); 
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, 
avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index d83507dc99..c57149657a 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -45,7 +45,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -121,7 +121,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -199,7 +199,7 @@ SECTION .text -;void vpx_filter_block1d4_v8_sse2 +;void vpx_highbd_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -269,7 +269,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void vpx_filter_block1d8_v8_sse2 +;void vpx_highbd_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -328,7 +328,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void vpx_filter_block1d16_v8_sse2 +;void vpx_highbd_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -554,7 +554,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vpx_filter_block1d4_h8_sse2 +;void vpx_highbd_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -629,7 +629,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void vpx_filter_block1d8_h8_sse2 +;void vpx_highbd_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -695,7 +695,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void vpx_filter_block1d16_h8_sse2 +;void vpx_highbd_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 9bffe504b1..87bf75ebb8 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -26,7 +26,7 @@ pshufd xmm3, xmm3, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm5, rdx movq xmm2, rcx pshufd xmm5, xmm5, 0b @@ -82,7 +82,7 @@ pshufd xmm4, xmm4, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm8, rdx movq xmm5, rcx pshufd xmm8, xmm8, 0b diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c new file mode 100644 index 0000000000..e0e8b8f901 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -0,0 +1,1161 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. 
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_ports/mem.h"
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
+static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i dst_first, dst_second;
+  __m128i even, odd;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will load multiple shifted versions of the row and shuffle them into
+    // 16-bit words of the form
+    // ... s[2] s[1] s[0] s[-1]
+    // ... s[4] s[3] s[2] s[1]
+    // Then we call multiply and add to get partial results
+    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+    // The two results are then added together for the first half of even
+    // output. Repeat multiple times to get the whole output.
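+    // (Editorial sketch, not part of the upstream change: per output pixel
+    // the even/odd trick below is an ordinary 4-tap FIR. Assuming the
+    // vpx_dsp helpers clip_pixel() and ROUND_POWER_OF_TWO(), and recalling
+    // that the taps were pre-shifted right by one bit above, one sample is
+    //   sum = src[x - 1] * (kernel[2] >> 1) + src[x] * (kernel[3] >> 1) +
+    //         src[x + 1] * (kernel[4] >> 1) + src[x + 2] * (kernel[5] >> 1);
+    //   dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS - 1));
+    // which is why the rounding below adds 32 and shifts by 6 rather than
+    // the usual 64 and 7.)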
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Output 6 4 2 0
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
+
+    // Output 7 5 3 1
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    dst_first = mm_zip_epi32_sse2(&even, &odd);
+
+    // Do again to get the second half of dst
+    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Output 14 12 10 8
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
+
+    // Output 15 13 11 9
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the second half of the dst
+    dst_second = mm_zip_epi32_sse2(&even, &odd);
+
+    // Round each result
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+    // Finally combine to get the final dst
+    dst_first = _mm_packus_epi16(dst_first, dst_second);
+    _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* The macro used to generate functions shifts the src_ptr up by 3 rows already
+ * */
+
+static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+  // Half of half of the interleaved rows
+  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
+      src_reg_m10_hi_2;
+  __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
+  __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
+  __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+  // words,
+  // shuffle the data into the form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ...
s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128()); + src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128()); + src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Now repeat everything again for the second half + // Partial output for second half + res_reg_m10_hi = mm_madd_packs_epi16_sse2( + &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23); + + res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2, + &kernel_reg_23); + + src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128()); + src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128()); + res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2, + &kernel_reg_45); + + src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128()); + src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128()); + res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, 
&src_reg_23_hi_2, + &kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_m10_hi_1 = src_reg_12_hi_1; + src_reg_m10_hi_2 = src_reg_12_hi_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_01_hi_1 = src_reg_23_hi_1; + src_reg_01_hi_2 = src_reg_23_hi_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... 
s[4] s[3] s[2] s[1]
+    // Then we call multiply and add to get partial results
+    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+    // The two results are then added together to get the even output
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Output 6 4 2 0
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
+
+    // Output 7 5 3 1
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    dst_first = mm_zip_epi32_sse2(&even, &odd);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Saturate and convert to 8-bit words
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo;
+  __m128i src_reg_12_lo, src_reg_23_lo;
+  // Half of half of the interleaved rows
+  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
+  __m128i src_reg_01_lo_1, src_reg_01_lo_2;
+  __m128i src_reg_12_lo_1, src_reg_12_lo_2;
+  __m128i src_reg_23_lo_1, src_reg_23_lo_2;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+  // words,
+  // shuffle the data into the form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
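+  // (Editorial sketch, not part of the upstream change: all of the
+  // interleaving below is only a data layout for _mm_madd_epi16. Per column
+  // x, the value written to output row y is the plain 4-tap vertical FIR
+  //   sum = s[y-1][x] * k2 + s[y][x] * k3 + s[y+1][x] * k4 + s[y+2][x] * k5;
+  //   dst[y][x] = clip_pixel(ROUND_POWER_OF_TWO(sum, 6));
+  // where k2..k5 are the pre-halved taps kernel[2..5] >> 1 loaded above, and
+  // clip_pixel()/ROUND_POWER_OF_TWO() are the usual vpx_dsp helpers.)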
+ + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128()); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i tmp_0, tmp_1; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple 
shifted versions of the row and shuffle them into
+    // 16-bit words of the form
+    // ... s[1] s[0] s[0] s[-1]
+    // ... s[3] s[2] s[2] s[1]
+    // Then we call multiply and add to get partial results
+    // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2]
+    // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4]
+    // The two results are then added together to get the output
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Convert to 16-bit words
+    src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
+    src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
+    src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
+    src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
+
+    // Shuffle into the right format
+    tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
+    tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);
+
+    // Partial output
+    tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
+    tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);
+
+    // Output
+    dst_first = _mm_add_epi32(tmp_0, tmp_1);
+    dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());
+
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Saturate and convert to 8-bit words
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo;
+  __m128i src_reg_12_lo, src_reg_23_lo;
+  // Half of half of the interleaved rows
+  __m128i src_reg_m10_lo_1;
+  __m128i src_reg_01_lo_1;
+  __m128i src_reg_12_lo_1;
+  __m128i src_reg_23_lo_1;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+  // words,
+  // shuffle the data into the form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
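+  // (Editorial note: the loop below is the usual two-rows-per-iteration
+  // vertical-filter pipeline; its structure, with the SIMD details stripped
+  // away, is roughly
+  //   load rows -1, 0 and 1;                   // prologue, done just below
+  //   for (y = 0; y < height; y += 2) {
+  //     load rows y + 2 and y + 3;
+  //     out[y]     = filter(rows y - 1 .. y + 2);
+  //     out[y + 1] = filter(rows y     .. y + 3);
+  //     rotate the row registers down by two;  // reuse without reloading
+  //   }
+  // so only the two newest rows are fetched per iteration.)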
+ + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = + mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, &kernel_reg_23); + + res_reg_01_lo = + mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = + mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = + mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); + + // Save only half of the register (8 words) + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_1 = src_reg_3; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +static void vpx_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... 
s[4] s[3] s[2] s[1]
+  // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+  // The two results are then added together to get the even output
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);
+
+    // Output 2 0
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 3 1
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    res_reg = _mm_unpacklo_epi32(even, odd);
+    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = _mm_packs_epi32(res_reg, reg_zero);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_highbd_filter_block1d4_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+  __m128i res_reg_m1012, res_reg_0123;
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
+    res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
+    res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
+    res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
+    res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);
+
+    // Round the words
+    res_reg_m1012 =
+        mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123 =
+        mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);
+
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);
+
+    // Saturate according to bit depth
+    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save only half of the register (4 words)
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10 = src_reg_12;
+    src_reg_01 = src_reg_23;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_highbd_filter_block1d8_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load multiple shifted versions of the row and shuffle them into
+  // 16-bit words of the form
+  // ... s[2] s[1] s[0] s[-1]
+  // ... s[4] s[3] s[2] s[1]
+  // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+  // The two results are then added together for the first half of even
+  // output. Repeat multiple times to get the whole output.
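+  // (Editorial sketch, assuming uint16_t pixels and a bit depth bd of 8, 10
+  // or 12: unlike the 8-bit paths above, the taps here are not pre-halved,
+  // products are accumulated in 32 bits, and each sample is rounded and then
+  // clamped to the bit depth:
+  //   int32_t sum = src[x - 1] * kernel[2] + src[x] * kernel[3] +
+  //                 src[x + 1] * kernel[4] + src[x + 2] * kernel[5];
+  //   sum = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;  // +64 >> 7
+  //   dst[x] = (uint16_t)VPXMIN(VPXMAX(sum, 0), (1 << bd) - 1);
+  // which is what reg_round, reg_max and reg_zero implement below.)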
+
+  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
+      src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+  __m128i tmp_0, tmp_1;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will put first half in the first half of the reg, and second half in
+    // second half
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // Output 6 4 2 0
+    tmp_0 = _mm_srli_si128(src_reg, 4);
+    tmp_1 = _mm_srli_si128(src_reg_next, 2);
+    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 7 5 3 1
+    tmp_0 = _mm_srli_si128(src_reg, 2);
+    tmp_1 = src_reg_next;
+    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    tmp_0 = _mm_srli_si128(src_reg, 6);
+    tmp_1 = _mm_srli_si128(src_reg_next, 4);
+    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
+    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm_zip_epi32_sse2(&even, &odd);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_highbd_filter_block1d8_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+  __m128i res_reg_m1012_hi, res_reg_0123_hi;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);
+
+    // Partial output for first half
+    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
+    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
+    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
+    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);
+
+    // Round the words
+    res_reg_m1012_lo =
+        mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_lo =
+        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Partial output for second half
+    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
+    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
+    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
+    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_hi =
+        mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_hi =
+        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine the two halves
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 =
_mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +static void vpx_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 + +// From vpx_subpixel_8t_sse2.asm. +filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 +#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 +#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 +#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 +#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 +#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm. 
+filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , + sse2, 0); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1); + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2, 0); +FUN_CONV_2D(avg_, sse2, 1); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. 
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , sse2, 0); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1); + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2, 0); +HIGH_FUN_CONV_2D(avg_, sse2, 1); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index d0919695ce..55919f9a0c 100644 --- 
a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -9,22 +9,24 @@
  */
 
 #include <immintrin.h>
+#include <stdio.h>
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/convolve.h"
 #include "vpx_dsp/x86/convolve_avx2.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
 #include "vpx_ports/mem.h"
 
 // filters for 16_h8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+                                           5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
 
 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
@@ -326,6 +328,570 @@ static void vpx_filter_block1d16_v8_avg_avx2(
                                              height, filter, 1);
 }
 
+static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // the first half of the output. Repeat again to get the second half of the
+  // output. Finally, we shuffle again to combine the two outputs.
+  // Since avx2 allows us to use a 256-bit buffer, we can do this two rows at
+  // a time.
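+  // (Editorial note: mm256_loadu2_si128() used below is the convolve_avx2.h
+  // helper that packs two unaligned 128-bit rows into one 256-bit register,
+  // lane 0 from the first pointer and lane 1 from the second; roughly
+  //   __m256i v = _mm256_castsi128_si256(_mm_loadu_si128(lo));
+  //   v = _mm256_inserti128_si256(v, _mm_loadu_si128(hi), 1);
+  // which is what lets every shuffle and madd below filter two rows at once.)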
+ + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_first, dst_second; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for second half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm256_round_epi16(&dst_first, &reg_32, 6); + dst_second = mm256_round_epi16(&dst_second, &reg_32, 6); + + // Finally combine to get the final dst + dst_first = _mm256_packus_epi16(dst_first, dst_second); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_first); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); + // Reorder into 2 1 1 2 + src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); + + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + dst_first = mm256_round_epi16(&dst_first, &reg_32, 6); + + dst_first = _mm256_packus_epi16(dst_first, dst_first); + dst_first = _mm256_permute4x64_epi64(dst_first, 0x8); + + _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first)); + } +} + +static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them
into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi; + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23); + + // Output from first half + res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23); + res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45); + res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo); + + // Output from second half + res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23); + res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45); + res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi); + + // Round the words + res_reg_lo = mm256_round_epi16(&res_reg_lo, &reg_32, 6); + res_reg_hi = mm256_round_epi16(&res_reg_hi, &reg_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void
vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_reg; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_reg = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round the result + dst_reg = mm256_round_epi16(&dst_reg, &reg_32, 6); + + // Finally combine to get the final dst + dst_reg = _mm256_packus_epi16(dst_reg, dst_reg); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + __m128i dst_reg; + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + __m128i tmp_0, tmp_1; + + __m128i src_reg_shift_0 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0)); + __m128i src_reg_shift_2 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2)); + + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, + _mm256_castsi256_si128(kernel_reg_23)); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, + _mm256_castsi256_si128(kernel_reg_45)); + dst_reg = _mm_adds_epi16(tmp_0, tmp_1); + + dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6); + + dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i
*)dst_ptr, dst_reg); + } +} + +static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001, res_reg_1223; + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Output + res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23); + res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45); + res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, &reg_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + //
Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + // Since avx2 has 256-bit registers, we can do 2 rows at a time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + int h; + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + + __m256i src_reg, src_reg_shuf; + __m256i dst; + __m256i shuf_idx = + _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, + 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + for (h = height; h > 1; h -= 2) { + // Load the source + src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); + + // Round result + dst = mm256_round_epi16(&dst, &reg_32, 6); + + // Pack to 8-bits + dst = _mm256_packus_epi16(dst, _mm256_setzero_si256()); + + // Save + mm256_storeu2_epi32((__m128i *const)dst_ptr, + (__m128i *const)(dst_ptr + dst_stride), &dst); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + if (h > 0) { + // Load the source + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); + __m128i src_reg_shuf = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); + + // Get the result + __m128i dst = + _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg)); + dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); + + // Round result + dst = mm_round_epi16_sse2(&dst, &reg_32, 6); + + // Pack to 8-bits + dst = _mm_packus_epi16(dst, _mm_setzero_si128()); + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + } +} + +static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. + // Calling horizontal add then gives us the complete output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source.
lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + + // Result after multiply and add + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Combine all the rows + src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223); + + // Output + res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg); + res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256()); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, &reg_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -376,6 +942,13 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; #define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 #define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 #define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 + +#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2 +#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2 +#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2 +#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2 +#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2 +#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const InterpKernel *filter, int x0_q4, @@ -396,10 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const
InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + avx2, 0); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -411,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, avx2); -FUN_CONV_2D(avg_, avx2); +FUN_CONV_2D(, avx2, 0); +FUN_CONV_2D(avg_, avx2, 1); #endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index e4f992780f..63049c9342 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -12,20 +12,17 @@ #include <tmmintrin.h> +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_sse2.h" #include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_dsp/x86/mem_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -// These are reused by the avx2 intrinsics. -// vpx_filter_block1d8_v8_intrin_ssse3() -// vpx_filter_block1d8_h8_intrin_ssse3() -// vpx_filter_block1d4_h8_intrin_ssse3() - static INLINE __m128i shuffle_filter_convolve8_8_ssse3( const __m128i *const s, const int16_t *const filter) { __m128i f[4]; @@ -33,6 +30,23 @@ static INLINE __m128i shuffle_filter_convolve8_8_ssse3( return convolve8_8_ssse3(s, f); } +// Used by the avx2 implementation. +#if ARCH_X86_64 +// Use the intrinsics below +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 +#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 +#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 +#else // ARCH_X86 +// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
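// The declarations below bind the plain vpx_filter_block1d*_ssse3 names
// directly to those assembly symbols.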
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +#endif + +#if ARCH_X86_64 void vpx_filter_block1d4_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -184,13 +198,490 @@ void vpx_filter_block1d8_v8_intrin_ssse3( output_ptr += out_pitch; } } +#endif // ARCH_X86_64 +static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first, dst_second; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for second half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into
the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23); + + res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Partial output for second half + res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23); + + res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = 
mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Get the result + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Round the result + dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ...
s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23); + + res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45); + + // Add to get entire output + res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, &reg_32, 6); + res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, &reg_32, 6); + + // Pack from 16-bit to 8-bit + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output.
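// In scalar terms (an editorial sketch, not upstream text): maddubs
// multiplies eight u8*s8 pairs and sums adjacent products, leaving
// s[-1]*k[2] + s[0]*k[3] beside s[1]*k[4] + s[2]*k[5] for each output
// position; hadds then folds each such pair into one of the four sums.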
+ + __m128i kernel_reg; // Kernel + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shuf; + __m128i dst_first; + __m128i shuf_idx = + _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg); + dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128()); + + // Round result + dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. Then + // we can call horizontal add to get the output. + // Finally, we can add multiple rows together to get the desired output. + // This is done two rows at a time. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source.
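// (Rows are only 4 bytes wide here, so pairs of rows are first packed with
// _mm_unpacklo_epi32 before the byte-level interleave below.)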
+ __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + __m128i src_reg_m1001, src_reg_1223; + __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi; + + __m128i kernel_reg; // Kernel + + // Result after multiply and add + __m128i reg_0, reg_1; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1); + + // Put three rows next to each other + src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3); + + // Put three rows next to each other + src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23); + + // Put all four rows next to each other + src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223); + src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223); + + // Get the results + reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg); + reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg); + reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128()); + reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128()); + + // Round the words + reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6); + reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6); + + // Pack from 16-bit to 8-bit and put them in the right order + reg_0 = _mm_packus_epi16(reg_0, reg_0); + reg_1 = _mm_packus_epi16(reg_1, reg_1); + + // Save the result + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm filter8_1dfunction vpx_filter_block1d16_v8_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; @@ -198,6 +689,15 @@ filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +// Use the [vh]8 version because there is no [vh]4 implementation.
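// This is safe because 4-tap filters are stored as 8-tap kernels whose outer
// taps are zero, so the 8-tap code simply multiplies by zeros.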
+#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3 +#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3 +#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -231,10 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + ssse3, 0); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1); static void filter_horiz_w8_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, @@ -571,7 +1073,7 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } } -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, @@ -581,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_, ssse3); +FUN_CONV_2D(, ssse3, 0); +FUN_CONV_2D(avg_, ssse3, 1); diff --git a/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h index 2c259d322e..5631130243 100644 --- a/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h +++ b/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ -#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#ifndef VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#define VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ #include "./vpx_config.h" #define ADDRESS_STORAGE_SIZE sizeof(size_t) @@ -28,4 +28,4 @@ #define align_addr(addr, align) \ (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1)) -#endif // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#endif // VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ diff --git a/libs/libvpx/vpx_mem/vpx_mem.c b/libs/libvpx/vpx_mem/vpx_mem.c index eeba34c373..18abf1158b 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.c +++ b/libs/libvpx/vpx_mem/vpx_mem.c @@ -16,12 +16,14 @@ #include "include/vpx_mem_intrnl.h" #include "vpx/vpx_integer.h" +#if !defined(VPX_MAX_ALLOCABLE_MEMORY) #if SIZE_MAX > (1ULL << 40) #define VPX_MAX_ALLOCABLE_MEMORY (1ULL << 40) #else // For 32-bit targets keep this below INT_MAX to avoid valgrind warnings. 
#define VPX_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16)) #endif +#endif // Returns 0 in case of overflow of nmemb * size. static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) { diff --git a/libs/libvpx/vpx_mem/vpx_mem.h b/libs/libvpx/vpx_mem/vpx_mem.h index a4274b8856..7689a05e6e 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.h +++ b/libs/libvpx/vpx_mem/vpx_mem.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_MEM_VPX_MEM_H_ -#define VPX_MEM_VPX_MEM_H_ +#ifndef VPX_VPX_MEM_VPX_MEM_H_ +#define VPX_VPX_MEM_VPX_MEM_H_ #include "vpx_config.h" #if defined(__uClinux__) @@ -49,4 +49,4 @@ static INLINE void *vpx_memset16(void *dest, int val, size_t length) { } #endif -#endif // VPX_MEM_VPX_MEM_H_ +#endif // VPX_VPX_MEM_VPX_MEM_H_ diff --git a/libs/libvpx/vpx_ports/arm.h b/libs/libvpx/vpx_ports/arm.h index 7be6104a4f..6458a2c5b0 100644 --- a/libs/libvpx/vpx_ports/arm.h +++ b/libs/libvpx/vpx_ports/arm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_ARM_H_ -#define VPX_PORTS_ARM_H_ +#ifndef VPX_VPX_PORTS_ARM_H_ +#define VPX_VPX_PORTS_ARM_H_ #include #include "vpx_config.h" @@ -36,4 +36,4 @@ int arm_cpu_caps(void); } // extern "C" #endif -#endif // VPX_PORTS_ARM_H_ +#endif // VPX_VPX_PORTS_ARM_H_ diff --git a/libs/libvpx/vpx_ports/asmdefs_mmi.h b/libs/libvpx/vpx_ports/asmdefs_mmi.h index a9a49745af..28355bf9fb 100644 --- a/libs/libvpx/vpx_ports/asmdefs_mmi.h +++ b/libs/libvpx/vpx_ports/asmdefs_mmi.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_ASMDEFS_MMI_H_ -#define VPX_PORTS_ASMDEFS_MMI_H_ +#ifndef VPX_VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_VPX_PORTS_ASMDEFS_MMI_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -78,4 +78,4 @@ #endif /* HAVE_MMI */ -#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */ +#endif // VPX_VPX_PORTS_ASMDEFS_MMI_H_ diff --git a/libs/libvpx/vpx_ports/bitops.h b/libs/libvpx/vpx_ports/bitops.h index 0ed7189ff6..5b2f31cd11 100644 --- a/libs/libvpx/vpx_ports/bitops.h +++ b/libs/libvpx/vpx_ports/bitops.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_BITOPS_H_ -#define VPX_PORTS_BITOPS_H_ +#ifndef VPX_VPX_PORTS_BITOPS_H_ +#define VPX_VPX_PORTS_BITOPS_H_ #include @@ -72,4 +72,4 @@ static INLINE int get_msb(unsigned int n) { } // extern "C" #endif -#endif // VPX_PORTS_BITOPS_H_ +#endif // VPX_VPX_PORTS_BITOPS_H_ diff --git a/libs/libvpx/vpx_ports/emmintrin_compat.h b/libs/libvpx/vpx_ports/emmintrin_compat.h index 903534e0c0..d6cc68ee4d 100644 --- a/libs/libvpx/vpx_ports/emmintrin_compat.h +++ b/libs/libvpx/vpx_ports/emmintrin_compat.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_ -#define VPX_PORTS_EMMINTRIN_COMPAT_H_ +#ifndef VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ +#define VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ #if defined(__GNUC__) && __GNUC__ < 4 /* From emmintrin.h (gcc 4.5.3) */ @@ -52,4 +52,4 @@ extern __inline __m128d } #endif -#endif // VPX_PORTS_EMMINTRIN_COMPAT_H_ +#endif // VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/libs/libvpx/vpx_ports/emms_mmx.asm b/libs/libvpx/vpx_ports/emms_mmx.asm new file mode 100644 index 0000000000..9f33590a28 --- /dev/null +++ b/libs/libvpx/vpx_ports/emms_mmx.asm @@ -0,0 +1,18 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text +global sym(vpx_clear_system_state) PRIVATE +sym(vpx_clear_system_state): + emms + ret diff --git a/libs/libvpx/vpx_ports/config.h b/libs/libvpx/vpx_ports/emms_mmx.c similarity index 66% rename from libs/libvpx/vpx_ports/config.h rename to libs/libvpx/vpx_ports/emms_mmx.c index 3c1ab99f4a..f1036b98ed 100644 --- a/libs/libvpx/vpx_ports/config.h +++ b/libs/libvpx/vpx_ports/emms_mmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,9 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_CONFIG_H_ -#define VPX_PORTS_CONFIG_H_ +#include <mmintrin.h> -#include "vpx_config.h" +#include "vpx_ports/system_state.h" -#endif // VPX_PORTS_CONFIG_H_ +void vpx_clear_system_state() { _mm_empty(); } diff --git a/libs/libvpx/vpx_ports/emms.asm b/libs/libvpx/vpx_ports/float_control_word.asm similarity index 90% rename from libs/libvpx/vpx_ports/emms.asm rename to libs/libvpx/vpx_ports/float_control_word.asm index db8da28737..256dae0844 100644 --- a/libs/libvpx/vpx_ports/emms.asm +++ b/libs/libvpx/vpx_ports/float_control_word.asm @@ -12,11 +12,6 @@ %include "vpx_ports/x86_abi_support.asm" section .text -global sym(vpx_reset_mmx_state) PRIVATE -sym(vpx_reset_mmx_state): - emms - ret - %if LIBVPX_YASM_WIN64 global sym(vpx_winx64_fldcw) PRIVATE diff --git a/libs/libvpx/vpx_ports/mem.h b/libs/libvpx/vpx_ports/mem.h index bfef783b13..317c6dc061 100644 --- a/libs/libvpx/vpx_ports/mem.h +++ b/libs/libvpx/vpx_ports/mem.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_H_ -#define VPX_PORTS_MEM_H_ +#ifndef VPX_VPX_PORTS_MEM_H_ +#define VPX_VPX_PORTS_MEM_H_ #include "vpx_config.h" #include "vpx/vpx_integer.h" @@ -51,4 +51,4 @@ #define VPX_WITH_ASAN 0 #endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) -#endif // VPX_PORTS_MEM_H_ +#endif // VPX_VPX_PORTS_MEM_H_ diff --git a/libs/libvpx/vpx_ports/mem_ops.h b/libs/libvpx/vpx_ports/mem_ops.h index 343f27577c..b17015e7ec 100644 --- a/libs/libvpx/vpx_ports/mem_ops.h +++ b/libs/libvpx/vpx_ports/mem_ops.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_OPS_H_ -#define VPX_PORTS_MEM_OPS_H_ +#ifndef VPX_VPX_PORTS_MEM_OPS_H_ +#define VPX_VPX_PORTS_MEM_OPS_H_ /* \file * \brief Provides portable memory access primitives @@ -224,5 +224,4 @@ static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { mem[3] = (MAU_T)((val >> 24) & 0xff); } /* clang-format on */ - -#endif // VPX_PORTS_MEM_OPS_H_ +#endif // VPX_VPX_PORTS_MEM_OPS_H_ diff --git a/libs/libvpx/vpx_ports/mem_ops_aligned.h b/libs/libvpx/vpx_ports/mem_ops_aligned.h index ccac391ba0..8649b87623 100644 --- a/libs/libvpx/vpx_ports/mem_ops_aligned.h +++ b/libs/libvpx/vpx_ports/mem_ops_aligned.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_ -#define VPX_PORTS_MEM_OPS_ALIGNED_H_ +#ifndef VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ +#define VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ #include "vpx/vpx_integer.h" @@ -168,4 +168,4 @@ mem_put_le_aligned_generic(32) #undef swap_endian_32_se /* clang-format on */ -#endif // VPX_PORTS_MEM_OPS_ALIGNED_H_ +#endif // VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/libs/libvpx/vpx_ports/msvc.h b/libs/libvpx/vpx_ports/msvc.h index 3ff71474b3..d58de3535a 100644 --- a/libs/libvpx/vpx_ports/msvc.h +++ b/libs/libvpx/vpx_ports/msvc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MSVC_H_ -#define VPX_PORTS_MSVC_H_ +#ifndef VPX_VPX_PORTS_MSVC_H_ +#define VPX_VPX_PORTS_MSVC_H_ #ifdef _MSC_VER #include "./vpx_config.h" @@ -29,4 +29,4 @@ static INLINE double round(double x) { #endif // _MSC_VER < 1800 #endif // _MSC_VER -#endif // VPX_PORTS_MSVC_H_ +#endif // VPX_VPX_PORTS_MSVC_H_ diff --git a/libs/libvpx/vpx_ports/ppc.h b/libs/libvpx/vpx_ports/ppc.h index ed29ef25b4..a11f4e8732 100644 --- a/libs/libvpx/vpx_ports/ppc.h +++ b/libs/libvpx/vpx_ports/ppc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_PPC_H_ -#define VPX_PORTS_PPC_H_ +#ifndef VPX_VPX_PORTS_PPC_H_ +#define VPX_VPX_PORTS_PPC_H_ #include #include "./vpx_config.h" @@ -26,4 +26,4 @@ int ppc_simd_caps(void); } // extern "C" #endif -#endif // VPX_PORTS_PPC_H_ +#endif // VPX_VPX_PORTS_PPC_H_ diff --git a/libs/libvpx/vpx_ports/system_state.h b/libs/libvpx/vpx_ports/system_state.h index 086c64681f..452cb5739b 100644 --- a/libs/libvpx/vpx_ports/system_state.h +++ b/libs/libvpx/vpx_ports/system_state.h @@ -8,15 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_SYSTEM_STATE_H_ -#define VPX_PORTS_SYSTEM_STATE_H_ +#ifndef VPX_VPX_PORTS_SYSTEM_STATE_H_ +#define VPX_VPX_PORTS_SYSTEM_STATE_H_ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -void vpx_reset_mmx_state(void); -#define vpx_clear_system_state() vpx_reset_mmx_state() +#ifdef __cplusplus +extern "C" { +#endif + +#if (ARCH_X86 || ARCH_X86_64) && HAVE_MMX +extern void vpx_clear_system_state(void); #else #define vpx_clear_system_state() -#endif // ARCH_X86 || ARCH_X86_64 -#endif // VPX_PORTS_SYSTEM_STATE_H_ +#endif // (ARCH_X86 || ARCH_X86_64) && HAVE_MMX + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_SYSTEM_STATE_H_ diff --git a/libs/libvpx/vpx_ports/vpx_once.h b/libs/libvpx/vpx_ports/vpx_once.h index 7d9fc3b406..4eb592b87e 100644 --- a/libs/libvpx/vpx_ports/vpx_once.h +++ b/libs/libvpx/vpx_ports/vpx_once.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_VPX_ONCE_H_ -#define VPX_PORTS_VPX_ONCE_H_ +#ifndef VPX_VPX_PORTS_VPX_ONCE_H_ +#define VPX_VPX_PORTS_VPX_ONCE_H_ #include "vpx_config.h" @@ -137,4 +137,4 @@ static void once(void (*func)(void)) { } #endif -#endif // VPX_PORTS_VPX_ONCE_H_ +#endif // VPX_VPX_PORTS_VPX_ONCE_H_ diff --git a/libs/libvpx/vpx_ports/vpx_ports.mk b/libs/libvpx/vpx_ports/vpx_ports.mk index e17145e6cb..aa9faf15ec 100644 --- a/libs/libvpx/vpx_ports/vpx_ports.mk +++ b/libs/libvpx/vpx_ports/vpx_ports.mk @@ -17,8 +17,19 @@ PORTS_SRCS-yes += msvc.h PORTS_SRCS-yes += system_state.h PORTS_SRCS-yes += vpx_timer.h +ifeq ($(ARCH_X86),yes) +PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c +endif +ifeq ($(ARCH_X86_64),yes) +# Visual Studio x64 does not support the _mm_empty() intrinsic. 
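# Use the assembly version there instead; 32-bit x86 builds use the
# _mm_empty() intrinsic from emms_mmx.c above.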
+PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm +endif + +ifeq ($(ARCH_X86_64),yes) +PORTS_SRCS-$(CONFIG_MSVS) += float_control_word.asm +endif + ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) -PORTS_SRCS-yes += emms.asm PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif diff --git a/libs/libvpx/vpx_ports/vpx_timer.h b/libs/libvpx/vpx_ports/vpx_timer.h index 2083b4ece4..4934d5296a 100644 --- a/libs/libvpx/vpx_ports/vpx_timer.h +++ b/libs/libvpx/vpx_ports/vpx_timer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_VPX_TIMER_H_ -#define VPX_PORTS_VPX_TIMER_H_ +#ifndef VPX_VPX_PORTS_VPX_TIMER_H_ +#define VPX_VPX_PORTS_VPX_TIMER_H_ #include "./vpx_config.h" @@ -106,4 +106,4 @@ static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } #endif /* CONFIG_OS_SUPPORT */ -#endif // VPX_PORTS_VPX_TIMER_H_ +#endif // VPX_VPX_PORTS_VPX_TIMER_H_ diff --git a/libs/libvpx/vpx_ports/x86.h b/libs/libvpx/vpx_ports/x86.h index ced65ac058..9b48a1f4c3 100644 --- a/libs/libvpx/vpx_ports/x86.h +++ b/libs/libvpx/vpx_ports/x86.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_X86_H_ -#define VPX_PORTS_X86_H_ +#ifndef VPX_VPX_PORTS_X86_H_ +#define VPX_VPX_PORTS_X86_H_ #include #if defined(_MSC_VER) @@ -161,7 +161,7 @@ static INLINE uint64_t xgetbv(void) { #define HAS_AVX2 0x080 #define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1u << n) +#define BIT(n) (1u << (n)) #endif static INLINE int x86_simd_caps(void) { @@ -223,11 +223,26 @@ static INLINE int x86_simd_caps(void) { return flags & mask; } -// Note: -// 32-bit CPU cycle counter is light-weighted for most function performance -// measurement. For large function (CPU time > a couple of seconds), 64-bit -// counter should be used. -// 32-bit CPU cycle counter +// Fine-Grain Measurement Functions +// +// If you are timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. static INLINE unsigned int x86_readtsc(void) { #if defined(__GNUC__) && __GNUC__ unsigned int tsc; @@ -264,6 +279,41 @@ static INLINE uint64_t x86_readtsc64(void) { #endif } +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. 
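// (rdtscp waits for all earlier instructions to retire before it reads the
// counter, but later instructions may begin executing before the read
// completes, hence "partial".)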
+static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return v; +} + #if defined(__GNUC__) && __GNUC__ #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) @@ -313,14 +363,23 @@ static unsigned short x87_get_control_word(void) { static INLINE unsigned int x87_set_double_precision(void) { unsigned int mode = x87_get_control_word(); + // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 + // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf + // 8.1.5.2 Precision Control Field + // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control") + // determine the number of bits used in floating point calculations. To match + // later SSE instructions restrict x87 operations to Double Precision (0x200). + // Precision PC Field + // Single Precision (24-Bits) 00B + // Reserved 01B + // Double Precision (53-Bits) 10B + // Extended Precision (64-Bits) 11B x87_set_control_word((mode & ~0x300) | 0x200); return mode; } -extern void vpx_reset_mmx_state(void); - #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_PORTS_X86_H_ +#endif // VPX_VPX_PORTS_X86_H_ diff --git a/libs/libvpx/vpx_scale/generic/gen_scalers.c b/libs/libvpx/vpx_scale/generic/gen_scalers.c index b554a56e83..d8db4b3547 100644 --- a/libs/libvpx/vpx_scale/generic/gen_scalers.c +++ b/libs/libvpx/vpx_scale/generic/gen_scalers.c @@ -12,8 +12,8 @@ #include "vpx_scale/vpx_scale.h" #include "vpx_mem/vpx_mem.h" /**************************************************************************** -* Imports -****************************************************************************/ + * Imports + ****************************************************************************/ /**************************************************************************** * diff --git a/libs/libvpx/vpx_scale/generic/vpx_scale.c b/libs/libvpx/vpx_scale/generic/vpx_scale.c index 20e1ff90fd..958bb320fc 100644 --- a/libs/libvpx/vpx_scale/generic/vpx_scale.c +++ b/libs/libvpx/vpx_scale/generic/vpx_scale.c @@ -17,8 +17,8 @@ ***************************************************************************/ /**************************************************************************** -* Header Files -****************************************************************************/ + * Header Files + ****************************************************************************/ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" diff --git a/libs/libvpx/vpx_scale/generic/yv12config.c b/libs/libvpx/vpx_scale/generic/yv12config.c index 9c7ca42c78..eee291c30d 100644 --- a/libs/libvpx/vpx_scale/generic/yv12config.c 
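The precision-control notes added to vpx_ports/x86.h above describe a save/restore idiom around x87-sensitive floating point; a sketch of that pattern using the header's own helpers:

    #include "vpx_ports/x86.h"

    void do_fp_work_in_double_precision(void) {
      /* Force 53-bit (double) precision so x87 results match SSE. */
      const unsigned int saved_cw = x87_set_double_precision();
      /* ... floating point work that must agree with SSE rounding ... */
      x87_set_control_word((unsigned short)saved_cw); /* restore caller's mode */
    }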
+++ b/libs/libvpx/vpx_scale/generic/yv12config.c @@ -15,9 +15,12 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#if defined(VPX_MAX_ALLOCABLE_MEMORY) +#include "vp9/common/vp9_onyxc_int.h" +#endif // VPX_MAX_ALLOCABLE_MEMORY /**************************************************************************** -* Exports -****************************************************************************/ + * Exports + ****************************************************************************/ /**************************************************************************** * @@ -54,13 +57,21 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int uv_width = aligned_width >> 1; int uv_height = aligned_height >> 1; /** There is currently a bunch of code which assumes - * uv_stride == y_stride/2, so enforce this here. */ + * uv_stride == y_stride/2, so enforce this here. */ int uv_stride = y_stride >> 1; int uvplane_size = (uv_height + border) * uv_stride; - const int frame_size = yplane_size + 2 * uvplane_size; + const size_t frame_size = yplane_size + 2 * uvplane_size; if (!ybf->buffer_alloc) { ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size); +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + // This memset is needed for fixing the issue of using uninitialized + // value in msan test. It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, frame_size); +#endif +#endif ybf->buffer_alloc_sz = frame_size; } @@ -142,6 +153,17 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border, int byte_alignment, vpx_codec_frame_buffer_t *fb, vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; +#endif + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) return -3; + if (ybf) { const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const int aligned_width = (width + 7) & ~7; @@ -166,9 +188,16 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; - // frame_size is stored in buffer_alloc_sz, which is an int. If it won't +#if defined(VPX_MAX_ALLOCABLE_MEMORY) + // The decoder may allocate REF_FRAMES frame buffers in the frame buffer + // pool. Bound the total amount of allocated memory as if these REF_FRAMES + // frame buffers were allocated in a single allocation. + if (frame_size > VPX_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1; +#endif // VPX_MAX_ALLOCABLE_MEMORY + + // frame_size is stored in buffer_alloc_sz, which is a size_t. If it won't // fit, fail early. - if (frame_size > INT_MAX) { + if (frame_size > SIZE_MAX) { return -1; } @@ -192,18 +221,19 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, // This memset is needed for fixing the issue of using uninitialized // value in msan test. It will cause a perf loss, so only do this for // msan test. - memset(ybf->buffer_alloc, 0, (int)frame_size); + memset(ybf->buffer_alloc, 0, (size_t)frame_size); #endif #endif - } else if (frame_size > (size_t)ybf->buffer_alloc_sz) { + } else if (frame_size > ybf->buffer_alloc_sz) { // Allocation to hold larger frame, or first allocation. 
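The SIZE_MAX comparison above only has teeth because frame_size is computed in a type wider than size_t; sketched under that assumption (upstream the function accumulates frame_size as uint64_t):

    #include <stdint.h>

    /* On 32-bit targets a 64-bit byte count can exceed what size_t (and
     * therefore vpx_memalign) can represent; fail early instead of
     * silently truncating in the (size_t) cast. */
    static int frame_size_fits_size_t(uint64_t frame_size) {
      return frame_size <= SIZE_MAX;
    }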
vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; - ybf->buffer_alloc_sz = (int)frame_size; + ybf->buffer_alloc_sz = (size_t)frame_size; // This memset is needed for fixing valgrind error from C loop filter // due to access uninitialized memory in frame border. It could be @@ -211,13 +241,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); } - /* Only support allocating buffers that have a border that's a multiple - * of 32. The border restriction is required to get 16-byte alignment of - * the start of the chroma rows without introducing an arbitrary gap - * between planes, which would break the semantics of things like - * vpx_img_set_rect(). */ - if (border & 0x1f) return -3; - ybf->y_crop_width = width; ybf->y_crop_height = height; ybf->y_width = aligned_width; @@ -231,7 +254,7 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = (int)frame_size; + ybf->frame_size = (size_t)frame_size; ybf->subsampling_x = ss_x; ybf->subsampling_y = ss_y; diff --git a/libs/libvpx/vpx_scale/vpx_scale.h b/libs/libvpx/vpx_scale/vpx_scale.h index 478a483461..fd5ba7ccdc 100644 --- a/libs/libvpx/vpx_scale/vpx_scale.h +++ b/libs/libvpx/vpx_scale/vpx_scale.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_SCALE_VPX_SCALE_H_ -#define VPX_SCALE_VPX_SCALE_H_ +#ifndef VPX_VPX_SCALE_VPX_SCALE_H_ +#define VPX_VPX_SCALE_VPX_SCALE_H_ #include "vpx_scale/yv12config.h" @@ -19,4 +19,4 @@ extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, unsigned int vscale, unsigned int vratio, unsigned int interlaced); -#endif // VPX_SCALE_VPX_SCALE_H_ +#endif // VPX_VPX_SCALE_VPX_SCALE_H_ diff --git a/libs/libvpx/vpx_scale/yv12config.h b/libs/libvpx/vpx_scale/yv12config.h index b9b3362144..2cf18217f6 100644 --- a/libs/libvpx/vpx_scale/yv12config.h +++ b/libs/libvpx/vpx_scale/yv12config.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_SCALE_YV12CONFIG_H_ -#define VPX_SCALE_YV12CONFIG_H_ +#ifndef VPX_VPX_SCALE_YV12CONFIG_H_ +#define VPX_VPX_SCALE_YV12CONFIG_H_ #ifdef __cplusplus extern "C" { @@ -49,9 +49,9 @@ typedef struct yv12_buffer_config { uint8_t *alpha_buffer; uint8_t *buffer_alloc; - int buffer_alloc_sz; + size_t buffer_alloc_sz; int border; - int frame_size; + size_t frame_size; int subsampling_x; int subsampling_y; unsigned int bit_depth; @@ -100,4 +100,4 @@ int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); } #endif -#endif // VPX_SCALE_YV12CONFIG_H_ +#endif // VPX_VPX_SCALE_YV12CONFIG_H_ diff --git a/libs/libvpx/vpx_util/endian_inl.h b/libs/libvpx/vpx_util/endian_inl.h index dc38774095..1b6ef56c69 100644 --- a/libs/libvpx/vpx_util/endian_inl.h +++ b/libs/libvpx/vpx_util/endian_inl.h @@ -9,8 +9,8 @@ // // Endian related functions. 
-#ifndef VPX_UTIL_ENDIAN_INL_H_ -#define VPX_UTIL_ENDIAN_INL_H_ +#ifndef VPX_VPX_UTIL_ENDIAN_INL_H_ +#define VPX_VPX_UTIL_ENDIAN_INL_H_ #include #include "./vpx_config.h" @@ -115,4 +115,4 @@ static INLINE uint64_t BSwap64(uint64_t x) { #endif // HAVE_BUILTIN_BSWAP64 } -#endif // VPX_UTIL_ENDIAN_INL_H_ +#endif // VPX_VPX_UTIL_ENDIAN_INL_H_ diff --git a/libs/libvpx/vpx_util/vpx_atomics.h b/libs/libvpx/vpx_util/vpx_atomics.h index b8cf80daeb..b06a8dce34 100644 --- a/libs/libvpx/vpx_util/vpx_atomics.h +++ b/libs/libvpx/vpx_util/vpx_atomics.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_UTIL_VPX_ATOMICS_H_ -#define VPX_UTIL_VPX_ATOMICS_H_ +#ifndef VPX_VPX_UTIL_VPX_ATOMICS_H_ +#define VPX_VPX_UTIL_VPX_ATOMICS_H_ #include "./vpx_config.h" @@ -68,7 +68,9 @@ extern "C" { // on any platform (to discourage programmer errors by setting values directly). // This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT // (NOT memset) and accessed through vpx_atomic_ functions. -typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; +typedef struct vpx_atomic_int { + volatile int value; +} vpx_atomic_int; #define VPX_ATOMIC_INIT(num) \ { num } @@ -106,4 +108,4 @@ static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { } // extern "C" #endif // __cplusplus -#endif // VPX_UTIL_VPX_ATOMICS_H_ +#endif // VPX_VPX_UTIL_VPX_ATOMICS_H_ diff --git a/libs/libvpx/vpx_util/vpx_debug_util.c b/libs/libvpx/vpx_util/vpx_debug_util.c new file mode 100644 index 0000000000..3ce4065ba5 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_debug_util.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
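Stepping back to the vpx_atomics.h hunk above, a usage sketch for the vpx_atomic_int primitive; vpx_atomic_store_release is the store-side counterpart from the same header, assumed here to mirror the load side shown in the hunk:

    #include "vpx_util/vpx_atomics.h"

    static vpx_atomic_int stop_flag = VPX_ATOMIC_INIT(0);

    void worker_signal_stop(void) {            /* producer thread */
      vpx_atomic_store_release(&stop_flag, 1);
    }

    int worker_should_stop(void) {             /* consumer thread */
      return vpx_atomic_load_acquire(&stop_flag);
    }

The acquire/release pairing is exactly why the header warns against setting the value directly or initializing it with memset.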
+ */ + +#include +#include +#include +#include "vpx_util/vpx_debug_util.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +static int frame_idx_w = 0; +static int frame_idx_r = 0; + +void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; } + +int bitstream_queue_get_frame_write(void) { return frame_idx_w; } + +void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; } + +int bitstream_queue_get_frame_read(void) { return frame_idx_r; } +#endif + +#if CONFIG_BITSTREAM_DEBUG +#define QUEUE_MAX_SIZE 2000000 +static int result_queue[QUEUE_MAX_SIZE]; +static int prob_queue[QUEUE_MAX_SIZE]; + +static int queue_r = 0; +static int queue_w = 0; +static int queue_prev_w = -1; +static int skip_r = 0; +static int skip_w = 0; +void bitstream_queue_set_skip_write(int skip) { skip_w = skip; } + +void bitstream_queue_set_skip_read(int skip) { skip_r = skip; } + +void bitstream_queue_record_write(void) { queue_prev_w = queue_w; } + +void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; } + +int bitstream_queue_get_write(void) { return queue_w; } + +int bitstream_queue_get_read(void) { return queue_r; } + +void bitstream_queue_pop(int *result, int *prob) { + if (!skip_r) { + if (queue_w == queue_r) { + printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + *result = result_queue[queue_r]; + *prob = prob_queue[queue_r]; + queue_r = (queue_r + 1) % QUEUE_MAX_SIZE; + } +} + +void bitstream_queue_push(int result, const int prob) { + if (!skip_w) { + result_queue[queue_w] = result; + prob_queue[queue_w] = prob; + queue_w = (queue_w + 1) % QUEUE_MAX_SIZE; + if (queue_w == queue_r) { + printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + } +} +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +static int frame_buf_idx_r = 0; +static int frame_buf_idx_w = 0; +#define MAX_FRAME_BUF_NUM 20 +#define MAX_FRAME_STRIDE 1920 +#define MAX_FRAME_HEIGHT 1080 +static uint16_t + frame_pre[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w(void) { + frame_buf_idx_w = (frame_buf_idx_w + 1) % MAX_FRAME_BUF_NUM; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + int plane; + for (plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r(void) { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % MAX_FRAME_BUF_NUM; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? 
CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d plane %d pixel_c %d pixel_r %d blk_w" + " %d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_pre failed frame_idx %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? 
CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_tx failed frame_idx %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +#endif // CONFIG_MISMATCH_DEBUG diff --git a/libs/libvpx/vpx_util/vpx_debug_util.h b/libs/libvpx/vpx_util/vpx_debug_util.h new file mode 100644 index 0000000000..df1a1aab2c --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_debug_util.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ +#define VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +void bitstream_queue_set_frame_write(int frame_idx); +int bitstream_queue_get_frame_write(void); +void bitstream_queue_set_frame_read(int frame_idx); +int bitstream_queue_get_frame_read(void); +#endif + +#if CONFIG_BITSTREAM_DEBUG +/* This is a debug tool used to detect bitstream error. On encoder side, it + * pushes each bit and probability into a queue before the bit is written into + * the Arithmetic coder. On decoder side, whenever a bit is read out from the + * Arithmetic coder, it pops out the reference bit and probability from the + * queue as well. If the two results do not match, this debug tool will report + * an error. This tool can be used to pin down the bitstream error precisely. + * By combining gdb's backtrace method, we can detect which module causes the + * bitstream error. 
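The record/check entry points implemented above pair up across the codec: the encoder records each predicted (and transformed) block, and the decoder checks its reconstruction against the recording. An illustrative fragment, assuming CONFIG_MISMATCH_DEBUG is enabled; the wrapper names are hypothetical:

    #include <stdint.h>

    #include "vpx_util/vpx_debug_util.h"

    /* Encoder side, right after producing a predicted block: */
    void on_enc_prediction(const uint8_t *pred, int stride, int plane,
                           int px_c, int px_r, int w, int h, int highbd) {
      mismatch_record_block_pre(pred, stride, plane, px_c, px_r, w, h, highbd);
    }

    /* Decoder side, after reconstructing the same block: */
    void on_dec_prediction(const uint8_t *pred, int stride, int plane,
                           int px_c, int px_r, int w, int h, int highbd) {
      mismatch_check_block_pre(pred, stride, plane, px_c, px_r, w, h, highbd);
    }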
*/ +int bitstream_queue_get_write(void); +int bitstream_queue_get_read(void); +void bitstream_queue_record_write(void); +void bitstream_queue_reset_write(void); +void bitstream_queue_pop(int *result, int *prob); +void bitstream_queue_push(int result, const int prob); +void bitstream_queue_set_skip_write(int skip); +void bitstream_queue_set_skip_read(int skip); +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +void mismatch_move_frame_idx_w(void); +void mismatch_move_frame_idx_r(void); +void mismatch_reset_frame(int num_planes); +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +#endif // CONFIG_MISMATCH_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ diff --git a/libs/libvpx/vpx_util/vpx_thread.h b/libs/libvpx/vpx_util/vpx_thread.h index 53a5f4966a..6d308e949b 100644 --- a/libs/libvpx/vpx_util/vpx_thread.h +++ b/libs/libvpx/vpx_util/vpx_thread.h @@ -12,8 +12,8 @@ // Original source: // https://chromium.googlesource.com/webm/libwebp -#ifndef VPX_THREAD_H_ -#define VPX_THREAD_H_ +#ifndef VPX_VPX_UTIL_VPX_THREAD_H_ +#define VPX_VPX_UTIL_VPX_THREAD_H_ #include "./vpx_config.h" @@ -159,6 +159,23 @@ static INLINE int pthread_cond_init(pthread_cond_t *const condition, return 0; } +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + int ok = 1; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + WakeAllConditionVariable(condition); +#else + while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok &= SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. + ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } +#endif + return !ok; +} + static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { int ok = 1; #ifdef USE_WINDOWS_CONDITION_VARIABLE @@ -194,6 +211,7 @@ static INLINE int pthread_cond_wait(pthread_cond_t *const condition, #endif return !ok; } + #elif defined(__OS2__) #define INCL_DOS #include // NOLINT @@ -202,6 +220,11 @@ static INLINE int pthread_cond_wait(pthread_cond_t *const condition, #include // NOLINT #include // NOLINT +#if defined(__STRICT_ANSI__) +// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here. +int _beginthread(void (*)(void *), void *, unsigned, void *); +#endif + #define pthread_t TID #define pthread_mutex_t HMTX @@ -412,4 +435,4 @@ const VPxWorkerInterface *vpx_get_worker_interface(void); } // extern "C" #endif -#endif // VPX_THREAD_H_ +#endif // VPX_VPX_UTIL_VPX_THREAD_H_ diff --git a/libs/libvpx/vpx_util/vpx_timestamp.h b/libs/libvpx/vpx_util/vpx_timestamp.h new file mode 100644 index 0000000000..c210de5e53 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_timestamp.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. 
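Concretely, the bitstream side of the tool described in the comment above pairs a push on the encoder with a pop on the decoder (a sketch; requires CONFIG_BITSTREAM_DEBUG at configure time):

    #include <assert.h>

    #include "vpx_util/vpx_debug_util.h"

    /* Encoder: record each (bit, prob) pair before arithmetic coding. */
    void debug_record_bit(int bit, int prob) {
      bitstream_queue_push(bit, prob);
    }

    /* Decoder: replay the queue and compare with what was actually read. */
    void debug_check_bit(int decoded_bit, int used_prob) {
      int ref_bit, ref_prob;
      bitstream_queue_pop(&ref_bit, &ref_prob);
      assert(ref_bit == decoded_bit && ref_prob == used_prob);
    }

A failing assert here, combined with a gdb backtrace, pins down the module that desynchronized the bitstream.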
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_TIMESTAMP_H_ +#define VPX_VPX_UTIL_VPX_TIMESTAMP_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Rational Number with an int64 numerator +typedef struct vpx_rational64 { + int64_t num; // fraction numerator + int den; // fraction denominator +} vpx_rational64_t; // alias for struct vpx_rational64_t + +static INLINE int gcd(int64_t a, int b) { + int r; // remainder + while (b > 0) { + r = (int)(a % b); + a = b; + b = r; + } + + return (int)a; +} + +static INLINE void reduce_ratio(vpx_rational64_t *ratio) { + const int denom = gcd(ratio->num, ratio->den); + ratio->num /= denom; + ratio->den /= denom; +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_VPX_UTIL_VPX_TIMESTAMP_H_ diff --git a/libs/libvpx/vpx_util/vpx_util.mk b/libs/libvpx/vpx_util/vpx_util.mk index 86d3ece3c8..1162714956 100644 --- a/libs/libvpx/vpx_util/vpx_util.mk +++ b/libs/libvpx/vpx_util/vpx_util.mk @@ -15,3 +15,6 @@ UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h UTIL_SRCS-yes += vpx_write_yuv_frame.h UTIL_SRCS-yes += vpx_write_yuv_frame.c +UTIL_SRCS-yes += vpx_timestamp.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.c diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.c b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c index ab68558115..4ef57a2fee 100644 --- a/libs/libvpx/vpx_util/vpx_write_yuv_frame.c +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -13,7 +13,7 @@ void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { #if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ - defined(OUTPUT_YUV_SKINMAP) + defined(OUTPUT_YUV_SKINMAP) || defined(OUTPUT_YUV_SVC_SRC) unsigned char *src = s->y_buffer; int h = s->y_crop_height; diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.h b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h index 1cb7029817..ce1102458e 100644 --- a/libs/libvpx/vpx_util/vpx_write_yuv_frame.h +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
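The rational-timebase helpers in vpx_timestamp.h above can be exercised as follows (values are arbitrary; assumes the usual libvpx include paths so INLINE is defined):

    #include <stdio.h>

    #include "vpx_util/vpx_timestamp.h"

    int main(void) {
      vpx_rational64_t tb = { 120000, 48000 };
      reduce_ratio(&tb); /* gcd(120000, 48000) == 24000, so tb becomes 5/2 */
      printf("%lld/%d\n", (long long)tb.num, tb.den);
      return 0;
    }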
*/ -#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ -#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#ifndef VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ #include #include "vpx_scale/yv12config.h" @@ -24,4 +24,4 @@ void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); } // extern "C" #endif -#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#endif // VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/libs/libvpx/vpxdec.c b/libs/libvpx/vpxdec.c index ff20e6a3c9..c60eb5c30b 100644 --- a/libs/libvpx/vpxdec.c +++ b/libs/libvpx/vpxdec.c @@ -98,20 +98,41 @@ static const arg_def_t svcdecodingarg = ARG_DEF( NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); static const arg_def_t framestatsarg = ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9"); +static const arg_def_t lpfoptarg = + ARG_DEF(NULL, "lpf-opt", 1, + "Do loopfilter without waiting for all threads to sync."); -static const arg_def_t *all_args[] = { - &help, &codecarg, &use_yv12, - &use_i420, &flipuvarg, &rawvideo, - &noblitarg, &progressarg, &limitarg, - &skiparg, &postprocarg, &summaryarg, - &outputfile, &threadsarg, &frameparallelarg, - &verbosearg, &scalearg, &fb_arg, - &md5arg, &error_concealment, &continuearg, +static const arg_def_t *all_args[] = { &help, + &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - &svcdecodingarg, &framestatsarg, NULL -}; + &svcdecodingarg, + &framestatsarg, + &rowmtarg, + &lpfoptarg, + NULL }; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -154,7 +175,7 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif -void show_help(FILE *fout, int shorthelp) { +static void show_help(FILE *fout, int shorthelp) { int i; fprintf(fout, "Usage: %s filename\n\n", exec_name); @@ -238,13 +259,14 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, return 1; } *bytes_read = frame_size; + return 0; } - return 0; + return 1; } -static int read_frame(struct VpxDecInputContext *input, uint8_t **buf, - size_t *bytes_in_buffer, size_t *buffer_size) { +static int dec_read_frame(struct VpxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { switch (input->vpx_input_ctx->file_type) { #if CONFIG_WEBM_IO case FILE_TYPE_WEBM: @@ -506,6 +528,8 @@ static int main_loop(int argc, const char **argv_) { int arg_skip = 0; int ec_enabled = 0; int keep_going = 0; + int enable_row_mt = 0; + int enable_lpf_opt = 0; const VpxInterface *interface = NULL; const VpxInterface *fourcc_interface = NULL; uint64_t dx_time = 0; @@ -628,6 +652,10 @@ static int main_loop(int argc, const char **argv_) { die("Error: Could not open --framestats file (%s) for writing.\n", arg.val); } + } else if (arg_match(&arg, &rowmtarg, argi)) { + enable_row_mt = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lpfoptarg, argi)) { + enable_lpf_opt = arg_parse_uint(&arg); } #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { @@ -753,6 +781,18 @@ static int main_loop(int argc, const char **argv_) { goto fail; } } + if (interface->fourcc == 
VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) { + fprintf(stderr, "Failed to set decoder in row multi-thread mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) { + fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER @@ -766,7 +806,7 @@ static int main_loop(int argc, const char **argv_) { if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); while (arg_skip) { - if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + if (dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; arg_skip--; } @@ -797,7 +837,7 @@ static int main_loop(int argc, const char **argv_) { frame_avail = 0; if (!stop_after || frame_in < stop_after) { - if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + if (!dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { frame_avail = 1; frame_in++; diff --git a/libs/libvpx/vpxenc.c b/libs/libvpx/vpxenc.c index 4db7eccc35..50c36bedd5 100644 --- a/libs/libvpx/vpxenc.c +++ b/libs/libvpx/vpxenc.c @@ -50,12 +50,6 @@ #endif #include "./y4minput.h" -/* Swallow warnings about unused results of fread/fwrite */ -static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { - return fread(ptr, size, nmemb, stream); -} -#define fread wrap_fread - static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { return fwrite(ptr, size, nmemb, stream); @@ -95,34 +89,6 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, va_end(ap); } -static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { - FILE *f = input_ctx->file; - y4m_input *y4m = &input_ctx->y4m; - int shortread = 0; - - if (input_ctx->file_type == FILE_TYPE_Y4M) { - if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; - } else { - shortread = read_yuv_frame(input_ctx, img); - } - - return !shortread; -} - -static int file_is_y4m(const char detect[4]) { - if (memcmp(detect, "YUV4", 4) == 0) { - return 1; - } - return 0; -} - -static int fourcc_is_ivf(const char detect[4]) { - if (memcmp(detect, "DKIF", 4) == 0) { - return 1; - } - return 0; -} - static const arg_def_t help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = @@ -326,9 +292,9 @@ static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); static const arg_def_t corpus_complexity = ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); -static const arg_def_t *rc_twopass_args[] = { - &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL -}; +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, + &corpus_complexity, NULL }; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -342,19 +308,19 @@ static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled, static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); static const arg_def_t sharpness = - ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); + ARG_DEF(NULL, "sharpness", 1, + "Increase sharpness at the expense of lower PSNR. 
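The two decoder controls wired up above can equally be set from application code (a sketch; assumes a VP9 decoder context already initialized via vpx_codec_dec_init):

    #include "vpx/vp8dx.h"
    #include "vpx/vpx_decoder.h"

    /* Enable row-based multi-threading and the non-blocking loopfilter
     * on an initialized VP9 decoder. */
    void enable_vp9_decode_speedups(vpx_codec_ctx_t *decoder) {
      vpx_codec_control(decoder, VP9D_SET_ROW_MT, 1);
      vpx_codec_control(decoder, VP9D_SET_LOOP_FILTER_OPT, 1);
    }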
(0..7)"); static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); -static const arg_def_t auto_altref = - ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); -static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type"); -static const struct arg_enum_list tuning_enum[] = { - { "psnr", VP8_TUNE_PSNR }, { "ssim", VP8_TUNE_SSIM }, { NULL, 0 } -}; +static const arg_def_t arnr_type = + ARG_DEF(NULL, "arnr-type", 1, "AltRef filter type (1..3)"); +static const struct arg_enum_list tuning_enum[] = { { "psnr", VP8_TUNE_PSNR }, + { "ssim", VP8_TUNE_SSIM }, + { NULL, 0 } }; static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); static const arg_def_t cq_level = @@ -367,12 +333,14 @@ static const arg_def_t gf_cbr_boost_pct = ARG_DEF( #if CONFIG_VP8_ENCODER static const arg_def_t cpu_used_vp8 = ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-16..16)"); +static const arg_def_t auto_altref_vp8 = ARG_DEF( + NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames. (0..1)"); static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); static const arg_def_t *vp8_args[] = { &cpu_used_vp8, - &auto_altref, + &auto_altref_vp8, &noise_sens, &sharpness, &static_thresh, @@ -405,12 +373,19 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED, #if CONFIG_VP9_ENCODER static const arg_def_t cpu_used_vp9 = - ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-8..8)"); + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-9..9)"); +static const arg_def_t auto_altref_vp9 = ARG_DEF( + NULL, "auto-alt-ref", 1, + "Enable automatic alt reference frames, 2+ enables multi-layer. (0..6)"); static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2 (set to 0 while threads > 1)"); + +static const arg_def_t enable_tpl_model = + ARG_DEF(NULL, "enable-tpl", 1, "Enable temporal dependency model"); + static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); static const arg_def_t frame_parallel_decoding = ARG_DEF( @@ -491,11 +466,12 @@ static const arg_def_t row_mt = #if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, - &auto_altref, + &auto_altref_vp9, &sharpness, &static_thresh, &tile_cols, &tile_rows, + &enable_tpl_model, &arnr_maxframes, &arnr_strength, &arnr_type, @@ -527,6 +503,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_STATIC_THRESHOLD, VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, + VP9E_SET_TPL, VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, @@ -552,7 +529,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, static const arg_def_t *no_args[] = { NULL }; -void show_help(FILE *fout, int shorthelp) { +static void show_help(FILE *fout, int shorthelp) { int i; const int num_encoder = get_vpx_encoder_count(); @@ -603,230 +580,6 @@ void usage_exit(void) { exit(EXIT_FAILURE); } -#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) - -#if CONFIG_VP9_HIGHBITDEPTH -static void find_mismatch_high(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - uint16_t *plane1, *plane2; - uint32_t stride1, stride2; - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; - stride1 = img1->stride[VPX_PLANE_Y] / 2; - stride2 = img2->stride[VPX_PLANE_Y] / 2; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(plane1 + (i + k) * stride1 + j + l); - yloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; - stride1 = img1->stride[VPX_PLANE_U] / 2; - stride2 = img2->stride[VPX_PLANE_U] / 2; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(plane1 + (i + k) * stride1 + j + l); - uloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } - - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; - stride1 = img1->stride[VPX_PLANE_V] / 2; - stride2 = img2->stride[VPX_PLANE_V] / 2; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(plane1 + (i + k) * stride1 + j + l); - vloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } -} -#endif - -static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - for (i = 0, match = 1; match && i < img1->d_h; 
i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != - *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l); - yloc[3] = *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l) != - *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l); - uloc[3] = *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l); - match = 0; - break; - } - } - } - } - } - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l) != - *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l); - vloc[3] = *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l); - match = 0; - break; - } - } - } - } - } -} - -static int compare_img(const vpx_image_t *const img1, - const vpx_image_t *const img2) { - uint32_t l_w = img1->d_w; - uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - uint32_t i; - int match = 1; - - match &= (img1->fmt == img2->fmt); - match &= (img1->d_w == img2->d_w); - match &= (img1->d_h == img2->d_h); -#if CONFIG_VP9_HIGHBITDEPTH - if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { - l_w *= 2; - c_w *= 2; - } -#endif - - for (i = 0; i < img1->d_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - l_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - c_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - c_w) == 0); - - return match; -} - #define NELEMENTS(x) (sizeof(x) / sizeof(x[0])) #if CONFIG_VP9_ENCODER #define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map) @@ -1012,57 +765,6 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { } } -static void 
open_input_file(struct VpxInputContext *input) { - /* Parse certain options from the input file, if possible */ - input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") - : set_binary_mode(stdin); - - if (!input->file) fatal("Failed to open input file"); - - if (!fseeko(input->file, 0, SEEK_END)) { - /* Input file is seekable. Figure out how long it is, so we can get - * progress info. - */ - input->length = ftello(input->file); - rewind(input->file); - } - - /* Default to 1:1 pixel aspect ratio. */ - input->pixel_aspect_ratio.numerator = 1; - input->pixel_aspect_ratio.denominator = 1; - - /* For RAW input sources, these bytes will applied on the first frame - * in read_frame(). - */ - input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); - input->detect.position = 0; - - if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { - if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, - input->only_i420) >= 0) { - input->file_type = FILE_TYPE_Y4M; - input->width = input->y4m.pic_w; - input->height = input->y4m.pic_h; - input->pixel_aspect_ratio.numerator = input->y4m.par_n; - input->pixel_aspect_ratio.denominator = input->y4m.par_d; - input->framerate.numerator = input->y4m.fps_n; - input->framerate.denominator = input->y4m.fps_d; - input->fmt = input->y4m.vpx_fmt; - input->bit_depth = input->y4m.bit_depth; - } else - fatal("Unsupported Y4M stream."); - } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { - fatal("IVF is not supported as input."); - } else { - input->file_type = FILE_TYPE_RAW; - } -} - -static void close_input_file(struct VpxInputContext *input) { - fclose(input->file); - if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); -} - static struct stream_state *new_stream(struct VpxEncoderConfig *global, struct stream_state *prev) { struct stream_state *stream; @@ -1278,8 +980,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, match = 1; /* Point either to the next free element or the first - * instance of this control. - */ + * instance of this control. 
+ */ for (j = 0; j < config->arg_ctrl_cnt; j++) if (ctrl_args_map != NULL && config->arg_ctrls[j][0] == ctrl_args_map[i]) @@ -1614,14 +1316,14 @@ static void encode_frame(struct stream_state *stream, vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); } I420Scale_16( - (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, - (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, - (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, - img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y], + (uint16_t *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, + (uint16_t *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, + (uint16_t *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[VPX_PLANE_Y], stream->img->stride[VPX_PLANE_Y] / 2, - (uint16 *)stream->img->planes[VPX_PLANE_U], + (uint16_t *)stream->img->planes[VPX_PLANE_U], stream->img->stride[VPX_PLANE_U] / 2, - (uint16 *)stream->img->planes[VPX_PLANE_V], + (uint16_t *)stream->img->planes[VPX_PLANE_V], stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; @@ -2215,9 +1917,9 @@ int main(int argc, const char **argv_) { if (!global.quiet) { FOREACH_STREAM(fprintf( - stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 - "b/f %7" PRId64 "b/s" - " %7" PRId64 " %s (%.2f fps)\033[K\n", + stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 + "b/s %7" PRId64 " %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, diff --git a/libs/libvpx/vpxenc.h b/libs/libvpx/vpxenc.h index d867e9d954..b780aedca6 100644 --- a/libs/libvpx/vpxenc.h +++ b/libs/libvpx/vpxenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPXENC_H_ -#define VPXENC_H_ +#ifndef VPX_VPXENC_H_ +#define VPX_VPXENC_H_ #include "vpx/vpx_encoder.h" @@ -61,4 +61,4 @@ struct VpxEncoderConfig { } // extern "C" #endif -#endif // VPXENC_H_ +#endif // VPX_VPXENC_H_ diff --git a/libs/libvpx/vpxstats.h b/libs/libvpx/vpxstats.h index 5c9ea34f71..3625ee3291 100644 --- a/libs/libvpx/vpxstats.h +++ b/libs/libvpx/vpxstats.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPXSTATS_H_ -#define VPXSTATS_H_ +#ifndef VPX_VPXSTATS_H_ +#define VPX_VPXSTATS_H_ #include @@ -40,4 +40,4 @@ vpx_fixed_buf_t stats_get(stats_io_t *stats); } // extern "C" #endif -#endif // VPXSTATS_H_ +#endif // VPX_VPXSTATS_H_ diff --git a/libs/libvpx/warnings.h b/libs/libvpx/warnings.h index 6b8ae6796f..15558c6437 100644 --- a/libs/libvpx/warnings.h +++ b/libs/libvpx/warnings.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WARNINGS_H_ -#define WARNINGS_H_ +#ifndef VPX_WARNINGS_H_ +#define VPX_WARNINGS_H_ #ifdef __cplusplus extern "C" { @@ -30,4 +30,4 @@ void check_encoder_config(int disable_prompt, } // extern "C" #endif -#endif // WARNINGS_H_ +#endif // VPX_WARNINGS_H_ diff --git a/libs/libvpx/webmdec.h b/libs/libvpx/webmdec.h index 7dcb170caf..d8618b07d6 100644 --- a/libs/libvpx/webmdec.h +++ b/libs/libvpx/webmdec.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef WEBMDEC_H_ -#define WEBMDEC_H_ +#ifndef VPX_WEBMDEC_H_ +#define VPX_WEBMDEC_H_ #include "./tools_common.h" @@ -66,4 +66,4 @@ void webm_free(struct WebmInputContext *webm_ctx); } // extern "C" #endif -#endif // WEBMDEC_H_ +#endif // VPX_WEBMDEC_H_ diff --git a/libs/libvpx/webmenc.h b/libs/libvpx/webmenc.h index b4a9e357bb..4176e82081 100644 --- a/libs/libvpx/webmenc.h +++ b/libs/libvpx/webmenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBMENC_H_ -#define WEBMENC_H_ +#ifndef VPX_WEBMENC_H_ +#define VPX_WEBMENC_H_ #include #include @@ -52,4 +52,4 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx); } // extern "C" #endif -#endif // WEBMENC_H_ +#endif // VPX_WEBMENC_H_ diff --git a/libs/libvpx/y4menc.c b/libs/libvpx/y4menc.c index 05018dbc43..02b729e5bb 100644 --- a/libs/libvpx/y4menc.c +++ b/libs/libvpx/y4menc.c @@ -17,11 +17,9 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_444A - ? "C444alpha\n" - : fmt == VPX_IMG_FMT_I444 - ? "C444\n" - : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_I444 + ? "C444\n" + : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n"; break; case 9: color = fmt == VPX_IMG_FMT_I44416 diff --git a/libs/libvpx/y4menc.h b/libs/libvpx/y4menc.h index 69d590413e..9a367e34c6 100644 --- a/libs/libvpx/y4menc.h +++ b/libs/libvpx/y4menc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef Y4MENC_H_ -#define Y4MENC_H_ +#ifndef VPX_Y4MENC_H_ +#define VPX_Y4MENC_H_ #include "./tools_common.h" @@ -30,4 +30,4 @@ int y4m_write_frame_header(char *buf, size_t len); } // extern "C" #endif -#endif // Y4MENC_H_ +#endif // VPX_Y4MENC_H_ diff --git a/libs/libvpx/y4minput.c b/libs/libvpx/y4minput.c index 1de636cc0b..007bd9971b 100644 --- a/libs/libvpx/y4minput.c +++ b/libs/libvpx/y4minput.c @@ -130,8 +130,8 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { The number of taps is intentionally kept small to reduce computational overhead and limit ringing. - The taps from these filters are scaled so that their sum is 1, and the result - is scaled by 128 and rounded to integers to create a filter whose + The taps from these filters are scaled so that their sum is 1, and the + result is scaled by 128 and rounded to integers to create a filter whose intermediate values fit inside 16 bits. Coefficients are rounded in such a way as to ensure their sum is still 128, which is usually equivalent to normal rounding. @@ -139,7 +139,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { Conversions which require both horizontal and vertical filtering could have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ - #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define OC_MAXI(_a, _b) ((_a) < (_b) ? 
(_b) : (_a))
 #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
@@ -976,6 +975,8 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
     _y4m->aux_buf_sz =
         _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
     _y4m->convert = y4m_convert_411_420jpeg;
+    fprintf(stderr, "Unsupported conversion from yuv 411\n");
+    return -1;
   } else if (strcmp(_y4m->chroma_type, "444") == 0) {
     _y4m->src_c_dec_h = 1;
     _y4m->src_c_dec_v = 1;
@@ -1030,30 +1031,6 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
       fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-      /*Chroma filter required: read into the aux buf first.
-        We need to make two filter passes, so we need some extra space in the
-        aux buffer.
-        The extra plane also gets read into the aux buf.
-        It will be discarded.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
-    } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
-      _y4m->bps = 32;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
-      /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
-    }
   } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
     _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
     _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
diff --git a/libs/libvpx/y4minput.h b/libs/libvpx/y4minput.h
index 9e69ceb835..a4a8b18dc5 100644
--- a/libs/libvpx/y4minput.h
+++ b/libs/libvpx/y4minput.h
@@ -11,8 +11,8 @@
  * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
  */

-#ifndef Y4MINPUT_H_
-#define Y4MINPUT_H_
+#ifndef VPX_Y4MINPUT_H_
+#define VPX_Y4MINPUT_H_

 #include <stdio.h>
 #include "vpx/vpx_image.h"
@@ -65,4 +65,4 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);
 } // extern "C"
 #endif

-#endif  // Y4MINPUT_H_
+#endif  // VPX_Y4MINPUT_H_
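With the 4:1:1 and 444alpha conversion paths removed above, y4m_input_open now reports such streams as errors rather than converting them, so callers should treat a negative return as fatal. A caller-side sketch following the signature shown in the hunks above; the wrapper name is hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    #include "./y4minput.h"

    /* Open a Y4M stream; unsupported chroma types (4:1:1, 444alpha after
     * this change) now make y4m_input_open return a negative value. */
    void open_y4m_or_die(y4m_input *y4m, FILE *fin, char *detect, int nbytes,
                         int only_420) {
      if (y4m_input_open(y4m, fin, detect, nbytes, only_420) < 0) {
        fprintf(stderr, "Unsupported or malformed Y4M input\n");
        exit(EXIT_FAILURE);
      }
    }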