Credit Andrew with recommending the libvpx upgrade
[goodguy/cinelerra.git] / cinelerra-5.1 / thirdparty / src / x265_3.5.patch0
1 diff -Naur ./source/CMakeLists.txt ../x265_apple_patch/source/CMakeLists.txt
2 --- ./source/CMakeLists.txt     2021-05-08 13:06:22.000000000 +0100
3 +++ ../x265_apple_patch/source/CMakeLists.txt   2021-05-08 13:08:01.000000000 +0100
4 @@ -40,9 +40,11 @@
5  # System architecture detection
6  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
7  set(X86_ALIASES x86 i386 i686 x86_64 amd64)
8 -set(ARM_ALIASES armv6l armv7l aarch64)
9 +set(ARM_ALIASES armv6l armv7l)
10 +set(ARM64_ALIASES arm64 arm64e aarch64)
11  list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
12  list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
13 +list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
14  set(POWER_ALIASES ppc64 ppc64le)
15  list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
16  if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
17 @@ -79,6 +81,15 @@
18          message(STATUS "Detected ARM target processor")
19          add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
20      endif()
21 +elseif(ARM64MATCH GREATER "-1")
22 +    if(CROSS_COMPILE_ARM64)
23 +        message(STATUS "Cross compiling for ARM64 arch")
24 +    else()
25 +        set(CROSS_COMPILE_ARM64 0)
26 +    endif()
27 +    message(STATUS "Detected ARM64 target processor")
28 +    set(ARM64 1)
29 +    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
30  else()
31      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
32      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
33 @@ -259,6 +270,9 @@
34              endif()
35          endif()
36      endif()
37 +    if(ARM64 OR CROSS_COMPILE_ARM64)
38 +        add_definitions(-DHAVE_NEON)
39 +    endif()
40      add_definitions(${ARM_ARGS})
41      if(FPROFILE_GENERATE)
42          if(INTEL_CXX)
43 @@ -350,7 +364,7 @@
44  endif(GCC)
45  
46  find_package(Nasm)
47 -if(ARM OR CROSS_COMPILE_ARM)
48 +if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
49      option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
50  elseif(NASM_FOUND AND X86)
51      if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
52 @@ -549,6 +563,19 @@
53                  ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
54                  DEPENDS ${ASM_SRC})
55          endforeach()
56 +    elseif(ARM64 OR CROSS_COMPILE_ARM64)
57 +    # compile ARM64 arch asm files here
58 +        enable_language(ASM)
59 +        foreach(ASM ${ARM_ASMS})
60 +            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm64/${ASM})
61 +            list(APPEND ASM_SRCS ${ASM_SRC})
62 +            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
63 +            add_custom_command(
64 +                OUTPUT ${ASM}.${SUFFIX}
65 +                COMMAND ${CMAKE_CXX_COMPILER}
66 +                ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
67 +                DEPENDS ${ASM_SRC})
68 +        endforeach()
82      elseif(X86)
83      # compile X86 arch asm files here
84          foreach(ASM ${MSVC_ASMS})
85 diff -Naur ./source/common/CMakeLists.txt ../x265_apple_patch/source/common/CMakeLists.txt
86 --- ./source/common/CMakeLists.txt      2021-05-08 13:06:22.000000000 +0100
87 +++ ../x265_apple_patch/source/common/CMakeLists.txt    2021-05-08 13:08:01.000000000 +0100
88 @@ -114,6 +114,22 @@
89      source_group(Assembly FILES ${ASM_PRIMITIVES})
90  endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
91  
92 +
93 +if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
94 +    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp  filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h)
95 +    enable_language(ASM)
96 +    # add ARM assembly/intrinsic files here
97 +    #set(A_SRCS )
98 +    #set(VEC_PRIMITIVES)
99 +
100 +    #set(ARM64_ASMS "${A_SRCS}" CACHE INTERNAL "ARM64 Assembly Sources")
101 +    foreach(SRC ${C_SRCS})
102 +        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm64/${SRC})
103 +    endforeach()
104 +    source_group(Assembly FILES ${ASM_PRIMITIVES})
105 +endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
106 +
107 +
108  if(POWER)
109      set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
110      if(ENABLE_ALTIVEC)
111 diff -Naur ./source/common/arm64/arm64-utils.cpp ../x265_apple_patch/source/common/arm64/arm64-utils.cpp
112 --- ./source/common/arm64/arm64-utils.cpp       1970-01-01 01:00:00.000000000 +0100
113 +++ ../x265_apple_patch/source/common/arm64/arm64-utils.cpp     2021-05-08 13:08:01.000000000 +0100
114 @@ -0,0 +1,290 @@
115 +#include "common.h"
116 +#include "x265.h"
117 +#include "arm64-utils.h"
118 +#include <arm_neon.h>
119 +
120 +#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
121 +namespace X265_NS {
122 +
123 +
124 +
125 +void transpose8x8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride)
126 +{
127 +    uint8x8_t a0,a1,a2,a3,a4,a5,a6,a7;
128 +    uint8x8_t b0,b1,b2,b3,b4,b5,b6,b7;
129 +
130 +    a0 = *(uint8x8_t *)(src + 0*sstride);
131 +    a1 = *(uint8x8_t *)(src + 1*sstride);
132 +    a2 = *(uint8x8_t *)(src + 2*sstride);
133 +    a3 = *(uint8x8_t *)(src + 3*sstride);
134 +    a4 = *(uint8x8_t *)(src + 4*sstride);
135 +    a5 = *(uint8x8_t *)(src + 5*sstride);
136 +    a6 = *(uint8x8_t *)(src + 6*sstride);
137 +    a7 = *(uint8x8_t *)(src + 7*sstride);
138 +
139 +    b0 = vtrn1_u32(a0,a4);
140 +    b1 = vtrn1_u32(a1,a5);
141 +    b2 = vtrn1_u32(a2,a6);
142 +    b3 = vtrn1_u32(a3,a7);
143 +    b4 = vtrn2_u32(a0,a4);
144 +    b5 = vtrn2_u32(a1,a5);
145 +    b6 = vtrn2_u32(a2,a6);
146 +    b7 = vtrn2_u32(a3,a7);
147 +
148 +    a0 = vtrn1_u16(b0,b2);
149 +    a1 = vtrn1_u16(b1,b3);
150 +    a2 = vtrn2_u16(b0,b2);
151 +    a3 = vtrn2_u16(b1,b3);
152 +    a4 = vtrn1_u16(b4,b6);
153 +    a5 = vtrn1_u16(b5,b7);
154 +    a6 = vtrn2_u16(b4,b6);
155 +    a7 = vtrn2_u16(b5,b7);
156 +
157 +    b0 = vtrn1_u8(a0,a1);
158 +    b1 = vtrn2_u8(a0,a1);
159 +    b2 = vtrn1_u8(a2,a3);
160 +    b3 = vtrn2_u8(a2,a3);
161 +    b4 = vtrn1_u8(a4,a5);
162 +    b5 = vtrn2_u8(a4,a5);
163 +    b6 = vtrn1_u8(a6,a7);
164 +    b7 = vtrn2_u8(a6,a7);
165 +
166 +    *(uint8x8_t *)(dst + 0*dstride) = b0;
167 +    *(uint8x8_t *)(dst + 1*dstride) = b1;
168 +    *(uint8x8_t *)(dst + 2*dstride) = b2;
169 +    *(uint8x8_t *)(dst + 3*dstride) = b3;
170 +    *(uint8x8_t *)(dst + 4*dstride) = b4;
171 +    *(uint8x8_t *)(dst + 5*dstride) = b5;
172 +    *(uint8x8_t *)(dst + 6*dstride) = b6;
173 +    *(uint8x8_t *)(dst + 7*dstride) = b7;
174 +}
175 +
176 +
177 +
178 +
179 +
180 +
181 +void transpose16x16(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride)
182 +{
183 +    uint16x8_t a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,aA,aB,aC,aD,aE,aF;
184 +    uint16x8_t b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,bA,bB,bC,bD,bE,bF;
185 +    uint16x8_t c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,cA,cB,cC,cD,cE,cF;
186 +    uint16x8_t d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,dA,dB,dC,dD,dE,dF;
187 +    
188 +    a0 = *(uint16x8_t *)(src + 0*sstride);
189 +    a1 = *(uint16x8_t *)(src + 1*sstride);
190 +    a2 = *(uint16x8_t *)(src + 2*sstride);
191 +    a3 = *(uint16x8_t *)(src + 3*sstride);
192 +    a4 = *(uint16x8_t *)(src + 4*sstride);
193 +    a5 = *(uint16x8_t *)(src + 5*sstride);
194 +    a6 = *(uint16x8_t *)(src + 6*sstride);
195 +    a7 = *(uint16x8_t *)(src + 7*sstride);
196 +    a8 = *(uint16x8_t *)(src + 8*sstride);
197 +    a9 = *(uint16x8_t *)(src + 9*sstride);
198 +    aA = *(uint16x8_t *)(src + 10*sstride);
199 +    aB = *(uint16x8_t *)(src + 11*sstride);
200 +    aC = *(uint16x8_t *)(src + 12*sstride);
201 +    aD = *(uint16x8_t *)(src + 13*sstride);
202 +    aE = *(uint16x8_t *)(src + 14*sstride);
203 +    aF = *(uint16x8_t *)(src + 15*sstride);
204 +
205 +    b0 = vtrn1q_u64(a0, a8);
206 +    b1 = vtrn1q_u64(a1, a9);
207 +    b2 = vtrn1q_u64(a2, aA);
208 +    b3 = vtrn1q_u64(a3, aB);
209 +    b4 = vtrn1q_u64(a4, aC);
210 +    b5 = vtrn1q_u64(a5, aD);
211 +    b6 = vtrn1q_u64(a6, aE);
212 +    b7 = vtrn1q_u64(a7, aF);
213 +    b8 = vtrn2q_u64(a0, a8);
214 +    b9 = vtrn2q_u64(a1, a9);
215 +    bA = vtrn2q_u64(a2, aA);
216 +    bB = vtrn2q_u64(a3, aB);
217 +    bC = vtrn2q_u64(a4, aC);
218 +    bD = vtrn2q_u64(a5, aD);
219 +    bE = vtrn2q_u64(a6, aE);
220 +    bF = vtrn2q_u64(a7, aF);
221 +
222 +    c0 = vtrn1q_u32(b0, b4);
223 +    c1 = vtrn1q_u32(b1, b5);
224 +    c2 = vtrn1q_u32(b2, b6);
225 +    c3 = vtrn1q_u32(b3, b7);
226 +    c4 = vtrn2q_u32(b0, b4);
227 +    c5 = vtrn2q_u32(b1, b5);
228 +    c6 = vtrn2q_u32(b2, b6);
229 +    c7 = vtrn2q_u32(b3, b7);
230 +    c8 = vtrn1q_u32(b8, bC);
231 +    c9 = vtrn1q_u32(b9, bD);
232 +    cA = vtrn1q_u32(bA, bE);
233 +    cB = vtrn1q_u32(bB, bF);
234 +    cC = vtrn2q_u32(b8, bC);
235 +    cD = vtrn2q_u32(b9, bD);
236 +    cE = vtrn2q_u32(bA, bE);
237 +    cF = vtrn2q_u32(bB, bF);
238 +    
239 +    d0 = vtrn1q_u16(c0, c2);
240 +    d1 = vtrn1q_u16(c1, c3);
241 +    d2 = vtrn2q_u16(c0, c2);
242 +    d3 = vtrn2q_u16(c1, c3);
243 +    d4 = vtrn1q_u16(c4, c6);
244 +    d5 = vtrn1q_u16(c5, c7);
245 +    d6 = vtrn2q_u16(c4, c6);
246 +    d7 = vtrn2q_u16(c5, c7);
247 +    d8 = vtrn1q_u16(c8, cA);
248 +    d9 = vtrn1q_u16(c9, cB);
249 +    dA = vtrn2q_u16(c8, cA);
250 +    dB = vtrn2q_u16(c9, cB);
251 +    dC = vtrn1q_u16(cC, cE);
252 +    dD = vtrn1q_u16(cD, cF);
253 +    dE = vtrn2q_u16(cC, cE);
254 +    dF = vtrn2q_u16(cD, cF);
255 +    
256 +    *(uint16x8_t *)(dst + 0*dstride)  = vtrn1q_u8(d0, d1);
257 +    *(uint16x8_t *)(dst + 1*dstride)  = vtrn2q_u8(d0, d1);
258 +    *(uint16x8_t *)(dst + 2*dstride)  = vtrn1q_u8(d2, d3);
259 +    *(uint16x8_t *)(dst + 3*dstride)  = vtrn2q_u8(d2, d3);
260 +    *(uint16x8_t *)(dst + 4*dstride)  = vtrn1q_u8(d4, d5);
261 +    *(uint16x8_t *)(dst + 5*dstride)  = vtrn2q_u8(d4, d5);
262 +    *(uint16x8_t *)(dst + 6*dstride)  = vtrn1q_u8(d6, d7);
263 +    *(uint16x8_t *)(dst + 7*dstride)  = vtrn2q_u8(d6, d7);
264 +    *(uint16x8_t *)(dst + 8*dstride)  = vtrn1q_u8(d8, d9);
265 +    *(uint16x8_t *)(dst + 9*dstride)  = vtrn2q_u8(d8, d9);
266 +    *(uint16x8_t *)(dst + 10*dstride)  = vtrn1q_u8(dA, dB);
267 +    *(uint16x8_t *)(dst + 11*dstride)  = vtrn2q_u8(dA, dB);
268 +    *(uint16x8_t *)(dst + 12*dstride)  = vtrn1q_u8(dC, dD);
269 +    *(uint16x8_t *)(dst + 13*dstride)  = vtrn2q_u8(dC, dD);
270 +    *(uint16x8_t *)(dst + 14*dstride)  = vtrn1q_u8(dE, dF);
271 +    *(uint16x8_t *)(dst + 15*dstride)  = vtrn2q_u8(dE, dF);
272 +
273 +    
274 +}
275 +
276 +
277 +void transpose32x32(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride)
278 +{
279 +    //assumption: there is no partial overlap
280 +    transpose16x16(dst,src,dstride,sstride);
281 +    transpose16x16(dst+16*dstride+16,src+16*sstride+16,dstride,sstride);
282 +    if (dst == src)
283 +    {
284 +        uint8_t tmp[16*16] __attribute__((aligned(64)));
285 +        transpose16x16(tmp,src + 16,16,sstride);
286 +        transpose16x16(dst + 16, src + 16*sstride,dstride,sstride);
287 +        for (int i=0;i<16;i++) COPY_16(dst+(16 + i)*dstride,tmp + 16*i);
288 +    }
289 +    else
290 +    {
291 +        transpose16x16(dst+16*dstride,src + 16,dstride,sstride);
292 +        transpose16x16(dst + 16, src + 16*sstride,dstride,sstride);
293 +    }
294 +    
295 +}
296 +
297 +
298 +
299 +void transpose8x8(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride)
300 +{
301 +    uint16x8_t a0,a1,a2,a3,a4,a5,a6,a7;
302 +    uint16x8_t b0,b1,b2,b3,b4,b5,b6,b7;
303 +
304 +    a0 = *(uint16x8_t *)(src + 0*sstride);
305 +    a1 = *(uint16x8_t *)(src + 1*sstride);
306 +    a2 = *(uint16x8_t *)(src + 2*sstride);
307 +    a3 = *(uint16x8_t *)(src + 3*sstride);
308 +    a4 = *(uint16x8_t *)(src + 4*sstride);
309 +    a5 = *(uint16x8_t *)(src + 5*sstride);
310 +    a6 = *(uint16x8_t *)(src + 6*sstride);
311 +    a7 = *(uint16x8_t *)(src + 7*sstride);
312 +
313 +    b0 = vtrn1q_u64(a0,a4);
314 +    b1 = vtrn1q_u64(a1,a5);
315 +    b2 = vtrn1q_u64(a2,a6);
316 +    b3 = vtrn1q_u64(a3,a7);
317 +    b4 = vtrn2q_u64(a0,a4);
318 +    b5 = vtrn2q_u64(a1,a5);
319 +    b6 = vtrn2q_u64(a2,a6);
320 +    b7 = vtrn2q_u64(a3,a7);
321 +
322 +    a0 = vtrn1q_u32(b0,b2);
323 +    a1 = vtrn1q_u32(b1,b3);
324 +    a2 = vtrn2q_u32(b0,b2);
325 +    a3 = vtrn2q_u32(b1,b3);
326 +    a4 = vtrn1q_u32(b4,b6);
327 +    a5 = vtrn1q_u32(b5,b7);
328 +    a6 = vtrn2q_u32(b4,b6);
329 +    a7 = vtrn2q_u32(b5,b7);
330 +
331 +    b0 = vtrn1q_u16(a0,a1);
332 +    b1 = vtrn2q_u16(a0,a1);
333 +    b2 = vtrn1q_u16(a2,a3);
334 +    b3 = vtrn2q_u16(a2,a3);
335 +    b4 = vtrn1q_u16(a4,a5);
336 +    b5 = vtrn2q_u16(a4,a5);
337 +    b6 = vtrn1q_u16(a6,a7);
338 +    b7 = vtrn2q_u16(a6,a7);
339 +
340 +    *(uint16x8_t *)(dst + 0*dstride) = b0;
341 +    *(uint16x8_t *)(dst + 1*dstride) = b1;
342 +    *(uint16x8_t *)(dst + 2*dstride) = b2;
343 +    *(uint16x8_t *)(dst + 3*dstride) = b3;
344 +    *(uint16x8_t *)(dst + 4*dstride) = b4;
345 +    *(uint16x8_t *)(dst + 5*dstride) = b5;
346 +    *(uint16x8_t *)(dst + 6*dstride) = b6;
347 +    *(uint16x8_t *)(dst + 7*dstride) = b7;
348 +}
349 +
350 +void transpose16x16(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride)
351 +{
352 +    //assumption: there is no partial overlap
353 +    transpose8x8(dst,src,dstride,sstride);
354 +    transpose8x8(dst+8*dstride+8,src+8*sstride+8,dstride,sstride);
355 +
356 +    if (dst == src)
357 +    {
358 +        uint16_t tmp[8*8];
359 +        transpose8x8(tmp,src + 8,8,sstride);
360 +        transpose8x8(dst + 8, src + 8*sstride,dstride,sstride);
361 +        for (int i=0;i<8;i++) COPY_16(dst+(8 + i)*dstride,tmp + 8*i);
362 +    }
363 +    else
364 +    {
365 +        transpose8x8(dst+8*dstride,src + 8,dstride,sstride);
366 +        transpose8x8(dst + 8, src + 8*sstride,dstride,sstride);
367 +    }
368 +    
369 +}
370 +
371 +
372 +
373 +void transpose32x32(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride)
374 +{
375 +    //assumption: there is no partial overlap
376 +    for (int i=0;i<4;i++)
377 +    {
378 +        transpose8x8(dst+i*8*(1+dstride),src+i*8*(1+sstride),dstride,sstride);
379 +        for (int j=i+1;j<4;j++)
380 +        {
381 +            if (dst == src)
382 +            {
383 +                uint16_t tmp[8*8] __attribute__((aligned(64)));
384 +                transpose8x8(tmp,src + 8*i + 8*j*sstride,8,sstride);
385 +                transpose8x8(dst + 8*i + 8*j*dstride, src + 8*j + 8*i*sstride,dstride,sstride);
386 +                for (int k=0;k<8;k++) COPY_16(dst+ 8*j + (8*i+k)*dstride,tmp + 8*k);
387 +            }
388 +            else
389 +            {
390 +                transpose8x8(dst + 8*(j + i*dstride),src + 8*(i + j*sstride),dstride,sstride);
391 +                transpose8x8(dst + 8*(i + j*dstride),src + 8*(j + i*sstride),dstride,sstride);
392 +            }
393 +            
394 +        }
395 +    }
396 +}
397 +
398 +
399 +
400 +
401 +}
402 +
403 +
404 +
405 diff -Naur ./source/common/arm64/arm64-utils.h ../x265_apple_patch/source/common/arm64/arm64-utils.h
406 --- ./source/common/arm64/arm64-utils.h 1970-01-01 01:00:00.000000000 +0100
407 +++ ../x265_apple_patch/source/common/arm64/arm64-utils.h       2021-05-08 13:08:01.000000000 +0100
408 @@ -0,0 +1,14 @@
409 +#ifndef __ARM64_UTILS_H__
410 +#define __ARM64_UTILS_H__
411 +
412 +
413 +namespace X265_NS {
414 +void transpose8x8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride);
415 +void transpose16x16(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride);
416 +void transpose32x32(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride);
417 +void transpose8x8(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride);
418 +void transpose16x16(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride);
419 +void transpose32x32(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride);
420 +}
421 +
422 +#endif
423 diff -Naur ./source/common/arm64/asm-primitives.cpp ../x265_apple_patch/source/common/arm64/asm-primitives.cpp
424 --- ./source/common/arm64/asm-primitives.cpp    1970-01-01 01:00:00.000000000 +0100
425 +++ ../x265_apple_patch/source/common/arm64/asm-primitives.cpp  2021-05-08 13:08:01.000000000 +0100
426 @@ -0,0 +1,53 @@
427 +/*****************************************************************************
428 + * Copyright (C) 2013-2017 MulticoreWare, Inc
429 + *
430 + * Authors: Steve Borho <steve@borho.org>
431 + *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
432 + *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
433 + *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
434 + *
435 + * This program is free software; you can redistribute it and/or modify
436 + * it under the terms of the GNU General Public License as published by
437 + * the Free Software Foundation; either version 2 of the License, or
438 + * (at your option) any later version.
439 + *
440 + * This program is distributed in the hope that it will be useful,
441 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
442 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
443 + * GNU General Public License for more details.
444 + *
445 + * You should have received a copy of the GNU General Public License
446 + * along with this program; if not, write to the Free Software
447 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
448 + *
449 + * This program is also available under a commercial proprietary license.
450 + * For more information, contact us at license @ x265.com.
451 + *****************************************************************************/
452 +
453 +#include "common.h"
454 +#include "primitives.h"
455 +#include "x265.h"
456 +#include "cpu.h"
457 +
458 +#include "pixel-prim.h"
459 +#include "filter-prim.h"
460 +#include "dct-prim.h"
461 +#include "loopfilter-prim.h"
462 +#include "intrapred-prim.h"
463 +
464 +namespace X265_NS {
465 +// private x265 namespace
466 +
467 +void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
468 +{
469 +    if (cpuMask & X265_CPU_NEON)
470 +    {
471 +      setupPixelPrimitives_neon(p);
472 +      setupFilterPrimitives_neon(p);
473 +      setupDCTPrimitives_neon(p);
474 +      setupLoopFilterPrimitives_neon(p);
475 +      setupIntraPrimitives_neon(p);
476 +    }
477 +}
478 +
479 +} // namespace X265_NS
480 diff -Naur ./source/common/arm64/dct-prim.cpp ../x265_apple_patch/source/common/arm64/dct-prim.cpp
481 --- ./source/common/arm64/dct-prim.cpp  1970-01-01 01:00:00.000000000 +0100
482 +++ ../x265_apple_patch/source/common/arm64/dct-prim.cpp        2021-05-08 13:08:01.000000000 +0100
483 @@ -0,0 +1,933 @@
484 +#include "dct-prim.h"
485 +
486 +
487 +#if HAVE_NEON
488 +
489 +#include <arm_neon.h>
490 +
491 +
492 +namespace {
493 +using namespace X265_NS;
494 +
495 +
496 +static int16x8_t rev16(const int16x8_t a)
497 +{
498 +    static const int8x16_t tbl = {14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1};
499 +    return vqtbx1q_u8(a,a,tbl);
500 +}
501 +
502 +static int32x4_t rev32(const int32x4_t a)
503 +{
504 +    static const int8x16_t tbl = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
505 +    return vqtbx1q_u8(a,a,tbl);
506 +}
507 +    
508 +static void transpose_4x4x16(int16x4_t& x0,int16x4_t& x1,int16x4_t& x2,int16x4_t& x3)
509 +{
510 +    int16x4_t s0,s1,s2,s3;
511 +    s0 = vtrn1_s32(x0,x2);
512 +    s1 = vtrn1_s32(x1,x3);
513 +    s2 = vtrn2_s32(x0,x2);
514 +    s3 = vtrn2_s32(x1,x3);
515 +    
516 +    x0 = vtrn1_s16(s0,s1);
517 +    x1 = vtrn2_s16(s0,s1);
518 +    x2 = vtrn1_s16(s2,s3);
519 +    x3 = vtrn2_s16(s2,s3);
520 +}
521 +
522 +
523 +
524 +static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
525 +{
526 +
527 +    // This is an optimized function for scanPosLast, which removes the rmw dependency, once integrated into mainline x265, should replace reference implementation
528 +    // For clarity, left the original reference code in comments
529 +    int scanPosLast = 0;
530 +    
531 +    uint16_t cSign = 0;
532 +    uint16_t cFlag = 0;
533 +    uint8_t cNum = 0;
534 +    
535 +    uint32_t prevcgIdx = 0;
536 +    do
537 +    {
538 +        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
539 +
540 +        const uint32_t posLast = scan[scanPosLast];
541 +
542 +        const int curCoeff = coeff[posLast];
543 +        const uint32_t isNZCoeff = (curCoeff != 0);
544 +        /*
545 +        NOTE: the new algorithm is complicated, so I keep reference code here
546 +        uint32_t posy   = posLast >> log2TrSize;
547 +        uint32_t posx   = posLast - (posy << log2TrSize);
548 +        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
549 +        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
550 +        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
551 +        */
552 +
553 +        // get L1 sig map
554 +        numSig -= isNZCoeff;
555 +
556 +        if (scanPosLast % (1<<MLS_CG_SIZE) == 0)
557 +        {
558 +            coeffSign[prevcgIdx] = cSign;
559 +            coeffFlag[prevcgIdx] = cFlag;
560 +            coeffNum[prevcgIdx] = cNum;
561 +            cSign = 0;
562 +            cFlag = 0;
563 +            cNum = 0;
564 +        }
565 +        // TODO: optimize by instruction BTS
566 +       cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
567 +       cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
568 +       cNum += (uint8_t)isNZCoeff;
569 +       prevcgIdx = cgIdx;
570 +        scanPosLast++;
571 +    }
572 +    while (numSig > 0);
573 +
574 +    coeffSign[prevcgIdx] = cSign;
575 +    coeffFlag[prevcgIdx] = cFlag;
576 +    coeffNum[prevcgIdx] = cNum;
577 +    return scanPosLast - 1;
578 +}
579 +
580 +
581 +#if (MLS_CG_SIZE == 4)
582 +template<int log2TrSize>
583 +static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
584 +{
585 +    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
586 +    const int scaleBits = SCALE_BITS - 2 * transformShift;
587 +    const uint32_t trSize = 1 << log2TrSize;
588 +
589 +    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
590 +    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
591 +    for (int y = 0; y < MLS_CG_SIZE; y++)
592 +    {
593 +      int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
594 +      int32x4_t mul = vmull_s16(in,in);
595 +      int64x2_t cost0, cost1;
596 +      cost0 = vshll_n_s32(vget_low_s32(mul),scaleBits);
597 +      cost1 = vshll_high_n_s32(mul,scaleBits);
598 +      *(int64x2_t *)&costUncoded[blkPos+0] = cost0;
599 +      *(int64x2_t *)&costUncoded[blkPos+2] = cost1;
600 +      vcost_sum_0 = vaddq_s64(vcost_sum_0,cost0);
601 +      vcost_sum_1 = vaddq_s64(vcost_sum_1,cost1);
602 +      blkPos += trSize;
603 +    }
604 +    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0,vcost_sum_1));
605 +    *totalUncodedCost += sum;
606 +    *totalRdCost += sum;
607 +}
608 +
609 +template<int log2TrSize>
610 +static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
611 +{
612 +    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613 +    const int scaleBits = SCALE_BITS - 2 * transformShift;
614 +    const uint32_t trSize = 1 << log2TrSize;
615 +    //using preprocessor to bypass clang bug
616 +    const int max = X265_MAX(0, (2 * transformShift + 1));
617 +
618 +    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
619 +    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
620 +    int32x4_t vpsy = vdupq_n_s32(*psyScale);
621 +    for (int y = 0; y < MLS_CG_SIZE; y++)
622 +    {
623 +      int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
624 +      int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]),signCoef);
625 +      int64x2_t cost0, cost1;
626 +      cost0 = vmull_s32(vget_low_s32(signCoef),vget_low_s32(signCoef));
627 +      cost1 = vmull_high_s32(signCoef,signCoef);
628 +      cost0 = vshlq_n_s64(cost0,scaleBits);
629 +      cost1 = vshlq_n_s64(cost1,scaleBits);
630 +      int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef),vget_low_s32(vpsy));
631 +      int64x2_t neg1 = vmull_high_s32(predictedCoef,vpsy);
632 +      if (max > 0) {
633 +        int64x2_t shift = vdupq_n_s64(-max);
634 +        neg0 = vshlq_s64(neg0,shift);
635 +        neg1 = vshlq_s64(neg1,shift);
636 +      }
637 +      cost0 = vsubq_s64(cost0,neg0);
638 +      cost1 = vsubq_s64(cost1,neg1);
639 +      *(int64x2_t *)&costUncoded[blkPos+0] = cost0;
640 +      *(int64x2_t *)&costUncoded[blkPos+2] = cost1;
641 +      vcost_sum_0 = vaddq_s64(vcost_sum_0,cost0);
642 +      vcost_sum_1 = vaddq_s64(vcost_sum_1,cost1);
643 +     
644 +        blkPos += trSize;
645 +    }
646 +  int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0,vcost_sum_1));
647 +  *totalUncodedCost += sum;
648 +  *totalRdCost += sum;
649 +}
650 +
651 +#else
652 +      #error "MLS_CG_SIZE must be 4 for neon version"
653 +#endif
654 +
655 +
656 +
657 +template<int trSize>
658 +int  count_nonzero_neon(const int16_t* quantCoeff)
659 +{
660 +  X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
661 +  int count = 0;
662 +  int16x8_t vcount = vdupq_n_s16(0);
663 +  const int numCoeff = trSize * trSize;
664 +  int i = 0;
665 +  for (; (i + 8) <= numCoeff; i+=8)
666 +  {
667 +    int16x8_t in = *(int16x8_t*)&quantCoeff[i];
668 +    vcount = vaddq_s16(vcount,vtstq_s16(in,in));
669 +  }
670 +  for (; i < numCoeff; i++)
671 +  {
672 +      count += quantCoeff[i] != 0;
673 +  }
674 +
675 +  return count - vaddvq_s16(vcount);
676 +}
677 +
678 +template<int trSize>
679 +uint32_t copy_count_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
680 +{
681 +  uint32_t numSig = 0;
682 +  int16x8_t vcount = vdupq_n_s16(0);
683 +  for (int k = 0; k < trSize; k++)
684 +  {
685 +    int j = 0;
686 +    for (; (j + 8) <= trSize; j+=8)
687 +    {
688 +      int16x8_t in = *(int16x8_t*)&residual[j];
689 +      *(int16x8_t*)&coeff[j] = in;
690 +      vcount = vaddq_s16(vcount,vtstq_s16(in,in));
691 +    }
692 +    for (; j < trSize; j++)
693 +    {
694 +      coeff[j] = residual[j];
695 +      numSig += (residual[j] != 0);
696 +    }
697 +    residual += resiStride;
698 +    coeff += trSize;
699 +  }
700 +
701 +  return numSig - vaddvq_s16(vcount);
702 +}
703 +
704 +
705 +static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
706 +{
707 +    int j, k;
708 +    int32x4_t E[2], O[2];
709 +    int32x4_t EE, EO;
710 +    int32x2_t EEE, EEO;
711 +    const int add = 1 << (shift - 1);
712 +    const int32x4_t _vadd = {add,0};
713 +    
714 +    for (j = 0; j < line; j++)
715 +    {
716 +        int16x8_t in0 = *(int16x8_t *)src;
717 +        int16x8_t in1 = rev16(*(int16x8_t *)&src[8]);
718 +        
719 +        E[0] = vaddl_s16(vget_low_s16(in0),vget_low_s16(in1));
720 +        O[0] = vsubl_s16(vget_low_s16(in0),vget_low_s16(in1));
721 +        E[1] = vaddl_high_s16(in0,in1);
722 +        O[1] = vsubl_high_s16(in0,in1);
723 +
724 +        for (k = 1; k < 16; k += 2)
725 +        {
726 +            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16[k][0]);
727 +            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16[k][4]);
728 +            
729 +            int32x4_t res = _vadd;
730 +            res = vmlaq_s32(res,c0,O[0]);
731 +            res = vmlaq_s32(res,c1,O[1]);
732 +            dst[k * line] = (int16_t)(vaddvq_s32(res) >> shift);
733 +        }
734 +
735 +        /* EE and EO */
736 +        EE = vaddq_s32(E[0],rev32(E[1]));
737 +        EO = vsubq_s32(E[0],rev32(E[1]));
738 +        
739 +        for (k = 2; k < 16; k += 4)
740 +        {
741 +            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16[k][0]);
742 +            int32x4_t res = _vadd;
743 +            res = vmlaq_s32(res,c0,EO);
744 +            dst[k * line] = (int16_t)(vaddvq_s32(res) >> shift);
745 +        }
746 +
747 +        /* EEE and EEO */
748 +        EEE[0] = EE[0] + EE[3];
749 +        EEO[0] = EE[0] - EE[3];
750 +        EEE[1] = EE[1] + EE[2];
751 +        EEO[1] = EE[1] - EE[2];
752 +        
753 +        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
754 +        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
755 +        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
756 +        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
757 +        
758 +  
759 +        src += 16;
760 +        dst++;
761 +    }
762 +}
763 +
764 +    
765 +static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
766 +{
767 +    int j, k;
768 +    const int add = 1 << (shift - 1);
769 +    
770 +    
771 +    for (j = 0; j < line; j++)
772 +    {
773 +        int32x4_t VE[4], VO0,VO1,VO2,VO3;
774 +        int32x4_t VEE[2], VEO[2];
775 +        int32x4_t VEEE, VEEO;
776 +        int EEEE[2], EEEO[2];
777 +
778 +        int16x8x4_t inputs;
779 +        inputs = *(int16x8x4_t *)&src[0];
780 +        int16x8x4_t in_rev;
781 +        
782 +        in_rev.val[1] = rev16(inputs.val[2]);
783 +        in_rev.val[0] = rev16(inputs.val[3]);
784 +        
785 +        VE[0] = vaddl_s16(vget_low_s16(inputs.val[0]),vget_low_s16(in_rev.val[0]));
786 +        VE[1] = vaddl_high_s16(inputs.val[0],in_rev.val[0]);
787 +        VO0 = vsubl_s16(vget_low_s16(inputs.val[0]),vget_low_s16(in_rev.val[0]));
788 +        VO1 = vsubl_high_s16(inputs.val[0],in_rev.val[0]);
789 +        VE[2] = vaddl_s16(vget_low_s16(inputs.val[1]),vget_low_s16(in_rev.val[1]));
790 +        VE[3] = vaddl_high_s16(inputs.val[1],in_rev.val[1]);
791 +        VO2 = vsubl_s16(vget_low_s16(inputs.val[1]),vget_low_s16(in_rev.val[1]));
792 +        VO3 = vsubl_high_s16(inputs.val[1],in_rev.val[1]);
793 +
794 +        for (k = 1; k < 32; k += 2)
795 +        {
796 +            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
797 +            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32[k][4]);
798 +            int32x4_t c2 = vmovl_s16(*(int16x4_t *)&g_t32[k][8]);
799 +            int32x4_t c3 = vmovl_s16(*(int16x4_t *)&g_t32[k][12]);
800 +            int32x4_t s = vmulq_s32(c0,VO0);
801 +            s = vmlaq_s32(s,c1,VO1);
802 +            s = vmlaq_s32(s,c2,VO2);
803 +            s = vmlaq_s32(s,c3,VO3);
804 +            
805 +            dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
806 +            
807 +        }
808 +        
809 +        int32x4_t rev_VE[2];
810 +        
811 +        
812 +        rev_VE[0] = rev32(VE[3]);
813 +        rev_VE[1] = rev32(VE[2]);
814 +        
815 +        /* EE and EO */
816 +        for (k = 0; k < 2; k++)
817 +        {
818 +            VEE[k] = vaddq_s32(VE[k],rev_VE[k]);
819 +            VEO[k] = vsubq_s32(VE[k],rev_VE[k]);
820 +        }
821 +        for (k = 2; k < 32; k += 4)
822 +        {
823 +            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
824 +            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32[k][4]);
825 +            int32x4_t s = vmulq_s32(c0,VEO[0]);
826 +            s = vmlaq_s32(s,c1,VEO[1]);
827 +            
828 +            dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
829 +            
830 +        }
831 +        
832 +        int32x4_t tmp = rev32(VEE[1]);
833 +        VEEE = vaddq_s32(VEE[0],tmp);
834 +        VEEO = vsubq_s32(VEE[0],tmp);
835 +        for (k = 4; k < 32; k += 8)
836 +        {
837 +            int32x4_t c = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
838 +            int32x4_t s = vmulq_s32(c,VEEO);
839 +            
840 +            dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
841 +        }
842 +        
843 +        /* EEEE and EEEO */
844 +        EEEE[0] = VEEE[0] + VEEE[3];
845 +        EEEO[0] = VEEE[0] - VEEE[3];
846 +        EEEE[1] = VEEE[1] + VEEE[2];
847 +        EEEO[1] = VEEE[1] - VEEE[2];
848 +        
849 +        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
850 +        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
851 +        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
852 +        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
853 +        
854 +        
855 +        
856 +        src += 32;
857 +        dst++;
858 +    }
859 +}
860 +
861 +static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)  /* scalar 8-point forward partial butterfly; one 1-D pass of the 2-D transform (called twice by dct8_neon) */
862 +{
863 +    int j, k;
864 +    int E[4], O[4];    /* sums/differences of mirrored input pairs src[k] and src[7-k] */
865 +    int EE[2], EO[2];  /* second-level even/odd split of E[] */
866 +    int add = 1 << (shift - 1);  /* rounding offset applied before the final >> shift */
867 +
868 +    for (j = 0; j < line; j++)  /* one input row per iteration; outputs written column-wise (stride = line) */
869 +    {
870 +        /* E and O*/
871 +        for (k = 0; k < 4; k++)
872 +        {
873 +            E[k] = src[k] + src[7 - k];
874 +            O[k] = src[k] - src[7 - k];
875 +        }
876 +
877 +        /* EE and EO */
878 +        EE[0] = E[0] + E[3];
879 +        EO[0] = E[0] - E[3];
880 +        EE[1] = E[1] + E[2];
881 +        EO[1] = E[1] - E[2];
882 +
883 +        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);  /* even-index outputs come from EE/EO */
884 +        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
885 +        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
886 +        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
887 +
888 +        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);  /* odd-index outputs come from O[] */
889 +        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
890 +        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
891 +        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);
892 +
893 +        src += 8;
894 +        dst++;
895 +    }
896 +}
897 +
898 +static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)  /* scalar 4-point inverse partial butterfly; one 1-D pass (called twice by idct4_neon) */
899 +{
900 +    int j;
901 +    int E[2], O[2];  /* even/odd partial sums */
902 +    int add = 1 << (shift - 1);  /* rounding offset applied before the final >> shift */
903 +
904 +    for (j = 0; j < line; j++)  /* inputs read column-wise (stride = line), outputs written contiguously */
905 +    {
906 +        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
907 +        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
908 +        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
909 +        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
910 +        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
911 +
912 +        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
913 +        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));  /* saturate into int16 range */
914 +        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
915 +        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
916 +        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
917 +
918 +        src++;
919 +        dst += 4;
920 +    }
921 +}
922 +
923
924 +    
925 +static void partialButterflyInverse16_neon(const int16_t* src, int16_t* orig_dst, int shift, int line)  /* NEON 16-point inverse butterfly: 4 columns per outer iteration, results transposed into row order */
926 +{
927 +#define FMAK(x,l) s[l] = vmlal_lane_s16(s[l],*(int16x4_t*)&src[(x)*line],*(int16x4_t *)&g_t16[x][k],l)
928 +#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&src[x*line],*(int16x4_t *)&g_t16[x][k],l);
929 +#define ODD3_15(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);
930 +#define EVEN6_14_STEP4(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);
931 +
932 +
933 +    int j, k;
934 +    int32x4_t E[8], O[8];        /* butterfly hierarchy: O from odd rows, E combines EE/EO */
935 +    int32x4_t EE[4], EO[4];
936 +    int32x4_t EEE[2], EEO[2];
937 +    const int add = 1 << (shift - 1);  /* rounding offset applied before the final shift */
938 +
939 +    
940 +#pragma unroll(4)
941 +    for (j = 0; j < line; j+=4)  /* NOTE(review): assumes line is a multiple of 4 — confirm with callers (idct16_neon passes 16) */
942 +    {
943 +        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
944 +        
945 +#pragma unroll(2)
946 +        for (k=0;k<2;k++) {
947 +            int32x4_t s;
948 +            s = vmull_s16(vdup_n_s16(g_t16[4][k]),*(int16x4_t*)&src[4*line]);;
949 +            EEO[k] = vmlal_s16(s,vdup_n_s16(g_t16[12][k]),*(int16x4_t*)&src[(12)*line]);
950 +            s = vmull_s16(vdup_n_s16(g_t16[0][k]),*(int16x4_t*)&src[0*line]);;
951 +            EEE[k] = vmlal_s16(s,vdup_n_s16(g_t16[8][k]),*(int16x4_t*)&src[(8)*line]);
952 +        }
953 +        
954 +        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
955 +        EE[0] = vaddq_s32(EEE[0] , EEO[0]);
956 +        EE[2] = vsubq_s32(EEE[1] , EEO[1]);
957 +        EE[1] = vaddq_s32(EEE[1] , EEO[1]);
958 +        EE[3] = vsubq_s32(EEE[0] , EEO[0]);
959 +
960 +
961 +#pragma unroll(1)
962 +        for (k = 0; k < 4; k+=4)  /* single pass: computes EO[0..3] from rows 2,6,10,14 */
963 +        {
964 +            int32x4_t s[4];
965 +            s[0] = MULK(2,0);
966 +            s[1] = MULK(2,1);
967 +            s[2] = MULK(2,2);
968 +            s[3] = MULK(2,3);
969 +            
970 +            EVEN6_14_STEP4(0);
971 +            EVEN6_14_STEP4(1);
972 +            EVEN6_14_STEP4(2);
973 +            EVEN6_14_STEP4(3);
974 +            
975 +            EO[k] = s[0];
976 +            EO[k+1] = s[1];
977 +            EO[k+2] = s[2];
978 +            EO[k+3] = s[3];
979 +         }
980 +        
981 +
982 +        
983 +        static const int32x4_t min = vdupq_n_s32(-32768);  /* int16 saturation bounds for the narrowing store */
984 +        static const int32x4_t max = vdupq_n_s32(32767);
985 +        const int32x4_t minus_shift = vdupq_n_s32(-shift);  /* negative count: vshlq_s32 shifts right */
986 +
987 +#pragma unroll(4)
988 +        for (k = 0; k < 4; k++)
989 +        {
990 +            E[k] = vaddq_s32(EE[k] , EO[k]);
991 +            E[k + 4] = vsubq_s32(EE[3 - k] , EO[3 - k]);
992 +        }
993 +        
994 +#pragma unroll(2)
995 +        for (k = 0; k < 8; k+=4)  /* odd terms O[] from rows 1,3,...,15, then outputs 0..7 of each column */
996 +        {
997 +            int32x4_t s[4];
998 +            s[0] = MULK(1,0);
999 +            s[1] = MULK(1,1);
1000 +            s[2] = MULK(1,2);
1001 +            s[3] = MULK(1,3);
1002 +            ODD3_15(0);
1003 +            ODD3_15(1);
1004 +            ODD3_15(2);
1005 +            ODD3_15(3);
1006 +            O[k] = s[0];
1007 +            O[k+1] = s[1];
1008 +            O[k+2] = s[2];
1009 +            O[k+3] = s[3];
1010 +            int32x4_t t;
1011 +            int16x4_t x0,x1,x2,x3;
1012 +            
1013 +            E[k] = vaddq_s32(vdupq_n_s32(add),E[k]);  /* fold the rounding offset into E once; reused by the E-O loop below */
1014 +            t = vaddq_s32(E[k],O[k]);
1015 +            t = vshlq_s32(t,minus_shift);
1016 +            t = vmaxq_s32(t,min);
1017 +            t = vminq_s32(t,max);
1018 +            x0 = vmovn_s32(t);
1019 +
1020 +            E[k+1] = vaddq_s32(vdupq_n_s32(add),E[k+1]);
1021 +            t = vaddq_s32(E[k+1],O[k+1]);
1022 +            t = vshlq_s32(t,minus_shift);
1023 +            t = vmaxq_s32(t,min);
1024 +            t = vminq_s32(t,max);
1025 +            x1 = vmovn_s32(t);
1026 +
1027 +            E[k+2] = vaddq_s32(vdupq_n_s32(add),E[k+2]);
1028 +            t = vaddq_s32(E[k+2],O[k+2]);
1029 +            t = vshlq_s32(t,minus_shift);
1030 +            t = vmaxq_s32(t,min);
1031 +            t = vminq_s32(t,max);
1032 +            x2 = vmovn_s32(t);
1033 +
1034 +            E[k+3] = vaddq_s32(vdupq_n_s32(add),E[k+3]);
1035 +            t = vaddq_s32(E[k+3],O[k+3]);
1036 +            t = vshlq_s32(t,minus_shift);
1037 +            t = vmaxq_s32(t,min);
1038 +            t = vminq_s32(t,max);
1039 +            x3 = vmovn_s32(t);
1040 +            
1041 +            transpose_4x4x16(x0,x1,x2,x3);  /* column results -> row-major layout before store */
1042 +            *(int16x4_t*)&orig_dst[0*16+k] = x0;
1043 +            *(int16x4_t*)&orig_dst[1*16+k] = x1;
1044 +            *(int16x4_t*)&orig_dst[2*16+k] = x2;
1045 +            *(int16x4_t*)&orig_dst[3*16+k] = x3;
1046 +        }
1047 +
1048 +
1049 +#pragma unroll(2)
1050 +        for (k = 0; k < 8; k+=4)  /* outputs 8..15: mirrored E-O terms (E[] already carries the rounding offset) */
1051 +        {
1052 +            int32x4_t t;
1053 +            int16x4_t x0,x1,x2,x3;
1054 +            
1055 +            t = vsubq_s32(E[7-k],O[7-k]);
1056 +            t = vshlq_s32(t,minus_shift);
1057 +            t = vmaxq_s32(t,min);
1058 +            t = vminq_s32(t,max);
1059 +            x0 = vmovn_s32(t);
1060 +            
1061 +            t = vsubq_s32(E[6-k],O[6-k]);
1062 +            t = vshlq_s32(t,minus_shift);
1063 +            t = vmaxq_s32(t,min);
1064 +            t = vminq_s32(t,max);
1065 +            x1 = vmovn_s32(t);
1066 +            
1067 +            t = vsubq_s32(E[5-k],O[5-k]);
1068
1069 +            t = vshlq_s32(t,minus_shift);
1070 +            t = vmaxq_s32(t,min);
1071 +            t = vminq_s32(t,max);
1072 +            x2 = vmovn_s32(t);
1073 +            
1074 +            t = vsubq_s32(E[4-k],O[4-k]);
1075 +            t = vshlq_s32(t,minus_shift);
1076 +            t = vmaxq_s32(t,min);
1077 +            t = vminq_s32(t,max);
1078 +            x3 = vmovn_s32(t);
1079 +            
1080 +            transpose_4x4x16(x0,x1,x2,x3);
1081 +            *(int16x4_t*)&orig_dst[0*16+k+8] = x0;
1082 +            *(int16x4_t*)&orig_dst[1*16+k+8] = x1;
1083 +            *(int16x4_t*)&orig_dst[2*16+k+8] = x2;
1084 +            *(int16x4_t*)&orig_dst[3*16+k+8] = x3;
1085 +        }
1086 +        orig_dst += 4*16;  /* advance four output rows */
1087 +        src+=4;            /* advance four input columns */
1088 +    }
1089 +    
1090 +#undef MUL
1091 +#undef FMA
1092 +#undef FMAK
1093 +#undef MULK
1094 +#undef ODD3_15
1095 +#undef EVEN6_14_STEP4
1096 +
1097 +    
1098 +}
1099 +
1100 +
1101 +
1102 +static void partialButterflyInverse32_neon(const int16_t* src, int16_t* orig_dst, int shift, int line)  /* NEON 32-point inverse butterfly: 4 columns per outer iteration, results transposed into row order */
1103 +{
1104 +#define MUL(x) vmull_s16(vdup_n_s16(g_t32[x][k]),*(int16x4_t*)&src[x*line]);
1105 +#define FMA(x) s = vmlal_s16(s,vdup_n_s16(g_t32[x][k]),*(int16x4_t*)&src[(x)*line])
1106 +#define FMAK(x,l) s[l] = vmlal_lane_s16(s[l],*(int16x4_t*)&src[(x)*line],*(int16x4_t *)&g_t32[x][k],l)
1107 +#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&src[x*line],*(int16x4_t *)&g_t32[x][k],l);
1108 +#define ODD31(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);FMAK(17,k);FMAK(19,k);FMAK(21,k);FMAK(23,k);FMAK(25,k);FMAK(27,k);FMAK(29,k);FMAK(31,k);
1109 +
1110 +#define ODD15(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);FMAK(18,k);FMAK(22,k);FMAK(26,k);FMAK(30,k);
1111 +#define ODD7(k) FMAK(12,k);FMAK(20,k);FMAK(28,k);
1112 +
1113 +
1114 +    int j, k;
1115 +    int32x4_t E[16], O[16];      /* butterfly hierarchy, one level per array */
1116 +    int32x4_t EE[8], EO[8];
1117 +    int32x4_t EEE[4], EEO[4];
1118 +    int32x4_t EEEE[2], EEEO[2];
1119 +    int16x4_t dst[32];           /* per-column results staged before the 4x4 transpose stores */
1120 +    int add = 1 << (shift - 1);  /* rounding offset applied before the final shift */
1121 +    
1122 +#pragma unroll (8)
1123 +    for (j = 0; j < line; j+=4)  /* NOTE(review): assumes line is a multiple of 4 — confirm with callers (idct32_neon passes 32) */
1124 +    {
1125 +#pragma unroll (4)
1126 +        for (k = 0; k < 16; k+=4)  /* O[] from all odd input rows 1,3,...,31 */
1127 +        {
1128 +            int32x4_t s[4];
1129 +            s[0] = MULK(1,0);
1130 +            s[1] = MULK(1,1);
1131 +            s[2] = MULK(1,2);
1132 +            s[3] = MULK(1,3);
1133 +            ODD31(0);
1134 +            ODD31(1);
1135 +            ODD31(2);
1136 +            ODD31(3);
1137 +            O[k] = s[0];
1138 +            O[k+1] = s[1];
1139 +            O[k+2] = s[2];
1140 +            O[k+3] = s[3];
1141 +            
1142 +            
1143 +        }
1144 +        
1145 +        
1146 +#pragma unroll (2)
1147 +        for (k = 0; k < 8; k+=4)  /* EO[] from rows 2,6,...,30 */
1148 +        {
1149 +            int32x4_t s[4];
1150 +            s[0] = MULK(2,0);
1151 +            s[1] = MULK(2,1);
1152 +            s[2] = MULK(2,2);
1153 +            s[3] = MULK(2,3);
1154 +            
1155 +            ODD15(0);
1156 +            ODD15(1);
1157 +            ODD15(2);
1158 +            ODD15(3);
1159 +            
1160 +            EO[k] = s[0];
1161 +            EO[k+1] = s[1];
1162 +            EO[k+2] = s[2];
1163 +            EO[k+3] = s[3];
1164 +        }
1165 +        
1166 +        
1167 +        for (k = 0; k < 4; k+=4)  /* single pass: EEO[] from rows 4,12,20,28 */
1168 +        {
1169 +            int32x4_t s[4];
1170 +            s[0] = MULK(4,0);
1171 +            s[1] = MULK(4,1);
1172 +            s[2] = MULK(4,2);
1173 +            s[3] = MULK(4,3);
1174 +            
1175 +            ODD7(0);
1176 +            ODD7(1);
1177 +            ODD7(2);
1178 +            ODD7(3);
1179 +            
1180 +            EEO[k] = s[0];
1181 +            EEO[k+1] = s[1];
1182 +            EEO[k+2] = s[2];
1183 +            EEO[k+3] = s[3];
1184 +        }
1185 +        
1186 +#pragma unroll (2)
1187 +        for (k=0;k<2;k++) {  /* EEEE/EEEO from rows 0,16 and 8,24 */
1188 +            int32x4_t s;
1189 +            s = MUL(8);
1190 +            EEEO[k] = FMA(24);
1191 +            s = MUL(0);
1192 +            EEEE[k] = FMA(16);
1193 +        }
1194 +        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
1195 +        EEE[0] = vaddq_s32(EEEE[0],EEEO[0]);
1196 +        EEE[3] = vsubq_s32(EEEE[0],EEEO[0]);
1197 +        EEE[1] = vaddq_s32(EEEE[1],EEEO[1]);
1198 +        EEE[2] = vsubq_s32(EEEE[1],EEEO[1]);
1199 +        
1200 +#pragma unroll (4)
1201 +        for (k = 0; k < 4; k++)
1202 +        {
1203 +            EE[k] = vaddq_s32(EEE[k],EEO[k]);
1204 +            EE[k + 4] = vsubq_s32((EEE[3 - k]), (EEO[3 - k]));
1205 +        }
1206 +        
1207 +#pragma unroll (8)
1208 +        for (k = 0; k < 8; k++)
1209 +        {
1210 +            E[k] = vaddq_s32(EE[k],EO[k]);
1211 +            E[k + 8] = vsubq_s32((EE[7 - k]),(EO[7 - k]));
1212 +        }
1213 +        
1214 +        static const int32x4_t min = vdupq_n_s32(-32768);  /* int16 saturation bounds for the narrowing store */
1215 +        static const int32x4_t max = vdupq_n_s32(32767);
1216 +        
1217 +        
1218 +        
1219 +#pragma unroll (16)
1220 +        for (k = 0; k < 16; k++)  /* outputs 0..15 = E+O, mirrored outputs 16..31 = E-O, rounded, shifted and clamped */
1221 +        {
1222 +            int32x4_t adde = vaddq_s32(vdupq_n_s32(add),E[k]);
1223 +            int32x4_t s = vaddq_s32(adde,O[k]);
1224 +            s = vshlq_s32(s,vdupq_n_s32(-shift));  /* negative count: shifts right */
1225 +            s = vmaxq_s32(s,min);
1226 +            s = vminq_s32(s,max);
1227 +            
1228 +            
1229 +            
1230 +            dst[k] = vmovn_s32(s);
1231 +            adde = vaddq_s32(vdupq_n_s32(add),(E[15-k]));
1232 +            s  =vsubq_s32(adde,(O[15-k]));
1233 +            s = vshlq_s32(s,vdupq_n_s32(-shift));
1234 +            s = vmaxq_s32(s,min);
1235 +            s = vminq_s32(s,max);
1236 +            
1237 +            dst[k+16] = vmovn_s32(s);
1238 +        }
1239 +        
1240 +
1241 +#pragma unroll (8)
1242 +        for (k = 0; k < 32; k+=4)  /* transpose staged column results into row-major output */
1243 +        {
1244 +            int16x4_t x0 = dst[k+0];
1245 +            int16x4_t x1 = dst[k+1];
1246 +            int16x4_t x2 = dst[k+2];
1247 +            int16x4_t x3 = dst[k+3];
1248 +            transpose_4x4x16(x0,x1,x2,x3);
1249 +            *(int16x4_t*)&orig_dst[0*32+k] = x0;
1250 +            *(int16x4_t*)&orig_dst[1*32+k] = x1;
1251 +            *(int16x4_t*)&orig_dst[2*32+k] = x2;
1252 +            *(int16x4_t*)&orig_dst[3*32+k] = x3;
1253 +        }
1254 +        orig_dst += 4*32;  /* advance four output rows */
1255 +        src += 4;          /* advance four input columns */
1256 +    }
1257 +#undef MUL
1258 +#undef FMA
1259 +#undef FMAK
1260 +#undef MULK
1261 +#undef ODD31
1262 +#undef ODD15
1263 +#undef ODD7
1264 +
1265 +}
1266 +
1267 +
1268 +static void dct8_neon(const int16_t* src, int16_t* dst, intptr_t srcStride)  /* 8x8 forward transform: copy to aligned block, then two 1-D butterfly passes */
1269 +{
1270 +    const int shift_1st = 2 + X265_DEPTH - 8;  /* first-pass shift depends on bit depth */
1271 +    const int shift_2nd = 9;
1272 +
1273 +    ALIGN_VAR_32(int16_t, coef[8 * 8]);   /* intermediate after the first pass */
1274 +    ALIGN_VAR_32(int16_t, block[8 * 8]);  /* contiguous copy of the strided input */
1275 +
1276 +    for (int i = 0; i < 8; i++)
1277 +    {
1278 +        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
1279 +    }
1280 +
1281 +    partialButterfly8(block, coef, shift_1st, 8);
1282 +    partialButterfly8(coef, dst, shift_2nd, 8);
1283 +}
1284 +
1285 +static void dct16_neon(const int16_t* src, int16_t* dst, intptr_t srcStride)  /* 16x16 forward transform: copy to aligned block, then two 1-D butterfly passes */
1286 +{
1287 +    const int shift_1st = 3 + X265_DEPTH - 8;  /* first-pass shift depends on bit depth */
1288 +    const int shift_2nd = 10;
1289 +
1290 +    ALIGN_VAR_32(int16_t, coef[16 * 16]);   /* intermediate after the first pass */
1291 +    ALIGN_VAR_32(int16_t, block[16 * 16]);  /* contiguous copy of the strided input */
1292 +
1293 +    for (int i = 0; i < 16; i++)
1294 +    {
1295 +        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
1296 +    }
1297 +
1298 +    partialButterfly16(block, coef, shift_1st, 16);
1299 +    partialButterfly16(coef, dst, shift_2nd, 16);
1300 +}
1301 +
1302 +static void dct32_neon(const int16_t* src, int16_t* dst, intptr_t srcStride)  /* 32x32 forward transform: copy to aligned block, then two 1-D butterfly passes */
1303 +{
1304 +    const int shift_1st = 4 + X265_DEPTH - 8;  /* first-pass shift depends on bit depth */
1305 +    const int shift_2nd = 11;
1306 +
1307 +    ALIGN_VAR_32(int16_t, coef[32 * 32]);   /* intermediate after the first pass */
1308 +    ALIGN_VAR_32(int16_t, block[32 * 32]);  /* contiguous copy of the strided input */
1309 +
1310 +    for (int i = 0; i < 32; i++)
1311 +    {
1312 +        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
1313 +    }
1314 +
1315 +    partialButterfly32(block, coef, shift_1st, 32);
1316 +    partialButterfly32(coef, dst, shift_2nd, 32);
1317 +}
1318 +
1319 +static void idct4_neon(const int16_t* src, int16_t* dst, intptr_t dstStride)  /* 4x4 inverse transform: two 1-D inverse butterfly passes, then copy out with stride */
1320 +{
1321 +    const int shift_1st = 7;
1322 +    const int shift_2nd = 12 - (X265_DEPTH - 8);  /* second-pass shift depends on bit depth */
1323 +
1324 +    ALIGN_VAR_32(int16_t, coef[4 * 4]);
1325 +    ALIGN_VAR_32(int16_t, block[4 * 4]);
1326 +
1327 +    partialButterflyInverse4(src, coef, shift_1st, 4); // inverse transform first pass, coef output
1328 +    partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse transform second pass, residual block output
1329 +
1330 +    for (int i = 0; i < 4; i++)
1331 +    {
1332 +        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
1333 +    }
1334 +}
1335 +
1336 +static void idct16_neon(const int16_t* src, int16_t* dst, intptr_t dstStride)  /* 16x16 inverse transform: two NEON 1-D inverse butterfly passes, then copy out with stride */
1337 +{
1338 +    const int shift_1st = 7;
1339 +    const int shift_2nd = 12 - (X265_DEPTH - 8);  /* second-pass shift depends on bit depth */
1340 +
1341 +    ALIGN_VAR_32(int16_t, coef[16 * 16]);   /* intermediate after the first pass */
1342 +    ALIGN_VAR_32(int16_t, block[16 * 16]);  /* contiguous result before the strided copy-out */
1343 +
1344 +    partialButterflyInverse16_neon(src, coef, shift_1st, 16);
1345 +    partialButterflyInverse16_neon(coef, block, shift_2nd, 16);
1346 +
1347 +    for (int i = 0; i < 16; i++)
1348 +    {
1349 +        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
1350 +    }
1351 +}
1352 +
1353 +static void idct32_neon(const int16_t* src, int16_t* dst, intptr_t dstStride)  /* 32x32 inverse transform: two NEON 1-D inverse butterfly passes, then copy out with stride */
1354 +{
1355 +    const int shift_1st = 7;
1356 +    const int shift_2nd = 12 - (X265_DEPTH - 8);  /* second-pass shift depends on bit depth */
1357 +
1358 +    ALIGN_VAR_32(int16_t, coef[32 * 32]);   /* intermediate after the first pass */
1359 +    ALIGN_VAR_32(int16_t, block[32 * 32]);  /* contiguous result before the strided copy-out */
1360 +
1361 +    partialButterflyInverse32_neon(src, coef, shift_1st, 32);
1362 +    partialButterflyInverse32_neon(coef, block, shift_2nd, 32);
1363 +
1364 +    for (int i = 0; i < 32; i++)
1365 +    {
1366 +        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
1367 +    }
1368 +}
1369 +
1370 +
1371 +
1372 +}
1373 +
1374 +namespace X265_NS {
1375 +// x265 private namespace
1376 +void setupDCTPrimitives_neon(EncoderPrimitives& p) {  // install the NEON DCT/quant implementations into the primitive dispatch table
1377 +    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_neon<2>;
1378 +    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_neon<3>;
1379 +    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_neon<4>;
1380 +    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_neon<5>;
1381 +    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_neon<2>;
1382 +    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_neon<3>;
1383 +    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;
1384 +    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;
1385 +    p.cu[BLOCK_8x8].dct   = dct8_neon;
1386 +    p.cu[BLOCK_16x16].dct = dct16_neon;
1387 +    p.cu[BLOCK_32x32].dct = dct32_neon;
1388 +    p.cu[BLOCK_4x4].idct   = idct4_neon;
1389 +    p.cu[BLOCK_16x16].idct = idct16_neon;
1390 +    p.cu[BLOCK_32x32].idct = idct32_neon;  // NOTE(review): BLOCK_8x8 idct is not overridden here — confirm this is intentional
1391 +    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_neon<4>;
1392 +    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_neon<8>;
1393 +    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_neon<16>;
1394 +    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_neon<32>;
1395 +
1396 +    p.cu[BLOCK_4x4].copy_cnt   = copy_count_neon<4>;
1397 +    p.cu[BLOCK_8x8].copy_cnt   = copy_count_neon<8>;
1398 +    p.cu[BLOCK_16x16].copy_cnt = copy_count_neon<16>;
1399 +    p.cu[BLOCK_32x32].copy_cnt = copy_count_neon<32>;
1400 +    p.cu[BLOCK_4x4].psyRdoQuant_1p = nonPsyRdoQuant_neon<2>;
1401 +    p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_neon<2>;
1402 +    p.cu[BLOCK_8x8].psyRdoQuant_1p = nonPsyRdoQuant_neon<3>;
1403 +    p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_neon<3>;
1404 +    p.cu[BLOCK_16x16].psyRdoQuant_1p = nonPsyRdoQuant_neon<4>;
1405 +    p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_neon<4>;
1406 +    p.cu[BLOCK_32x32].psyRdoQuant_1p = nonPsyRdoQuant_neon<5>;
1407 +    p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
1408 +    
1409 +    p.scanPosLast  =scanPosLast_opt;
1410 +
1411 +}
1412 +};
1413 +
1414 +
1415 +
1416 +#endif
1417 diff -Naur ./source/common/arm64/dct-prim.h ../x265_apple_patch/source/common/arm64/dct-prim.h
1418 --- ./source/common/arm64/dct-prim.h    1970-01-01 01:00:00.000000000 +0100
1419 +++ ../x265_apple_patch/source/common/arm64/dct-prim.h  2021-05-08 13:08:01.000000000 +0100
1420 @@ -0,0 +1,18 @@
1421 +#ifndef __DCT_PRIM_NEON_H__
1422 +#define __DCT_PRIM_NEON_H__
1423 +
1424 +
1425 +#include "common.h"
1426 +#include "primitives.h"
1427 +#include "contexts.h"   // costCoeffNxN_c
1428 +#include "threading.h"  // CLZ
1429 +
1430 +namespace X265_NS {
1431 +// x265 private namespace
1432 +void setupDCTPrimitives_neon(EncoderPrimitives& p);  // registers the NEON DCT/quant primitives in p
1433 +};
1434 +
1435 +
1436 +
1437 +#endif
1438 +
1439 diff -Naur ./source/common/arm64/filter-prim.cpp ../x265_apple_patch/source/common/arm64/filter-prim.cpp
1440 --- ./source/common/arm64/filter-prim.cpp       1970-01-01 01:00:00.000000000 +0100
1441 +++ ../x265_apple_patch/source/common/arm64/filter-prim.cpp     2021-05-08 13:08:01.000000000 +0100
1442 @@ -0,0 +1,797 @@
1443 +
1444 +#if HAVE_NEON
1445 +
1446 +#include "filter-prim.h"
1447 +#include <arm_neon.h>
1448 +
1449 +namespace {
1450 +
1451 +using namespace X265_NS;
1452 +
1453 +
1454 +template<int width, int height>
1455 +void filterPixelToShort_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)  /* convert pixels to the int16 internal representation: << shift then subtract IF_INTERNAL_OFFS, 8 pixels at a time */
1456 +{
1457 +    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
1458 +    int row, col;
1459 +    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
1460 +    for (row = 0; row < height; row++)
1461 +    {
1462 +      
1463 +        for (col = 0; col < width; col+=8)  /* NOTE(review): assumes width is a multiple of 8 — confirm with template instantiations */
1464 +        {
1465 +          int16x8_t in;
1466 +          
1467 +#if HIGH_BIT_DEPTH
1468 +          in = *(int16x8_t *)&src[col];  /* pixels are already 16-bit */
1469 +#else
1470 +          in = vmovl_u8(*(uint8x8_t *)&src[col]);  /* widen 8-bit pixels to 16-bit lanes */
1471 +#endif
1472 +          
1473 +          int16x8_t tmp = vshlq_n_s16(in,shift);
1474 +          tmp = vsubq_s16(tmp,off);
1475 +          *(int16x8_t *)&dst[col] = tmp;
1476 +          
1477 +        }
1478 +
1479 +        src += srcStride;
1480 +        dst += dstStride;
1481 +    }
1482 +}
1483 +
1484 +
1485 +template<int N, int width, int height>
1486 +void interp_horiz_pp_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)  /* horizontal N-tap interpolation, pixel in / pixel out, 8 results per iteration */
1487 +{
1488 +    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];  /* 4 taps = chroma, 8 taps = luma */
1489 +    int headRoom = IF_FILTER_PREC;
1490 +    int offset =  (1 << (headRoom - 1));  /* rounding offset folded into the accumulators */
1491 +    uint16_t maxVal = (1 << X265_DEPTH) - 1;  /* upper clamp for the output pixel range */
1492 +    int cStride = 1;
1493 +
1494 +    src -= (N / 2 - 1) * cStride;  /* rewind so the filter window is centered */
1495 +    int16x8_t vc;
1496 +    vc = *(int16x8_t *)coeff;
1497 +    int16x4_t low_vc = vget_low_s16(vc);    /* taps 0..3 */
1498 +    int16x4_t high_vc = vget_high_s16(vc);  /* taps 4..7 (luma only) */
1499 +
1500 +    const int32x4_t voffset = vdupq_n_s32(offset);
1501 +    const int32x4_t vhr = vdupq_n_s32(-headRoom);  /* negative count: vshlq_s32 shifts right */
1502 +    
1503 +    int row, col;
1504 +    for (row = 0; row < height; row++)
1505 +    {
1506 +        for (col = 0; col < width; col+=8)  /* NOTE(review): assumes width is a multiple of 8 — confirm with template instantiations */
1507 +        {
1508 +            int32x4_t vsum1,vsum2;  /* 32-bit accumulators for low/high halves of the 8 lanes */
1509 +            
1510 +            int16x8_t input[N];
1511 +            
1512 +            for (int i=0;i<N;i++)
1513 +            {
1514 +#if HIGH_BIT_DEPTH
1515 +                input[i] = *(int16x8_t *)&src[col+i];
1516 +#else
1517 +                input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i]);  /* widen 8-bit pixels */
1518 +#endif
1519 +            }
1520 +            vsum1 = voffset;
1521 +            vsum2 = voffset;
1522 +            
1523 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[0]),low_vc,0);  /* multiply-accumulate one tap per pair of lines */
1524 +            vsum2 = vmlal_high_lane_s16(vsum2,input[0],low_vc,0);
1525 +            
1526 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[1]),low_vc,1);
1527 +            vsum2 = vmlal_high_lane_s16(vsum2,input[1],low_vc,1);
1528 +
1529 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[2]),low_vc,2);
1530 +            vsum2 = vmlal_high_lane_s16(vsum2,input[2],low_vc,2);
1531 +
1532 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[3]),low_vc,3);
1533 +            vsum2 = vmlal_high_lane_s16(vsum2,input[3],low_vc,3);
1534 +
1535 +            if (N == 8)  /* remaining four luma taps */
1536 +            {
1537 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[4]),high_vc,0);
1538 +                vsum2 = vmlal_high_lane_s16(vsum2,input[4],high_vc,0);
1539 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[5]),high_vc,1);
1540 +                vsum2 = vmlal_high_lane_s16(vsum2,input[5],high_vc,1);
1541 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[6]),high_vc,2);
1542 +                vsum2 = vmlal_high_lane_s16(vsum2,input[6],high_vc,2);
1543 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[7]),high_vc,3);
1544 +                vsum2 = vmlal_high_lane_s16(vsum2,input[7],high_vc,3);
1545 +                
1546 +            }
1547 +            
1548 +            vsum1 = vshlq_s32(vsum1, vhr);  /* round-shift back to pixel precision */
1549 +            vsum2 = vshlq_s32(vsum2, vhr);
1550 +            
1551 +            int16x8_t vsum = vuzp1q_s16(vsum1,vsum2);  /* narrow: keep the low int16 of each 32-bit lane */
1552 +            vsum = vminq_s16(vsum,vdupq_n_s16(maxVal));  /* clamp to [0, maxVal] */
1553 +            vsum = vmaxq_s16(vsum,vdupq_n_s16(0));
1554 +#if HIGH_BIT_DEPTH
1555 +            *(int16x8_t *)&dst[col] = vsum;
1556 +#else
1557 +            uint8x16_t usum = vuzp1q_u8(vsum,vsum);  /* narrow again to 8-bit pixels */
1558 +            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
1559 +#endif
1560 +          
1561 +        }
1562 +        
1563 +        src += srcStride;
1564 +        dst += dstStride;
1565 +    }
1566 +}
1567 +
1568 +#if HIGH_BIT_DEPTH
1569 +
1570 +template<int N, int width, int height>
1571 +void interp_horiz_ps_neon(const uint16_t * src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)  /* horizontal N-tap interpolation, high-bit-depth pixel in / int16 intermediate out, 4 results per iteration */
1572 +{
1573 +    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];  /* 4 taps = chroma, 8 taps = luma */
1574 +    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1575 +    const int shift = IF_FILTER_PREC - headRoom;
1576 +    const int offset = (unsigned)-IF_INTERNAL_OFFS << shift;  /* negative internal offset pre-scaled by the shift */
1577 +
1578 +    int blkheight = height;
1579 +    src -= N / 2 - 1;  /* rewind so the filter window is centered */
1580 +
1581 +    if (isRowExt)  /* extend rows above/below for a following vertical pass */
1582 +    {
1583 +        src -= (N / 2 - 1) * srcStride;
1584 +        blkheight += N - 1;
1585 +    }
1586 +    int32x4_t vc0 = vmovl_s16(*(int16x4_t *)coeff);  /* taps 0..3 widened to 32-bit */
1587 +    int32x4_t vc1;
1588 +    
1589 +    if (N ==8) {
1590 +        vc1 = vmovl_s16(*(int16x4_t *)(coeff + 4));  /* taps 4..7 (luma only) */
1591 +    }
1592 +    
1593 +    const int32x4_t voffset = vdupq_n_s32(offset);
1594 +    const int32x4_t vhr = vdupq_n_s32(-shift);  /* negative count: vshlq_s32 shifts right */
1595 +    
1596 +    int row, col;
1597 +    for (row = 0; row < blkheight; row++)
1598 +    {
1599 +        for (col = 0; col < width; col+=4)  /* NOTE(review): assumes width is a multiple of 4 — confirm with template instantiations */
1600 +        {
1601 +            int32x4_t vsum;
1602 +            
1603 +            int32x4_t input[N];
1604 +            
1605 +            for (int i=0;i<N;i++)
1606 +            {
1607 +                input[i] = vmovl_s16(*(int16x4_t *)&src[col+i]);  /* widen samples for 32-bit accumulation */
1608 +            }
1609 +            vsum = voffset;
1610 +            vsum = vmlaq_laneq_s32(vsum,(input[0]),vc0,0);  /* one multiply-accumulate per tap */
1611 +            vsum = vmlaq_laneq_s32(vsum,(input[1]),vc0,1);
1612 +            vsum = vmlaq_laneq_s32(vsum,(input[2]),vc0,2);
1613 +            vsum = vmlaq_laneq_s32(vsum,(input[3]),vc0,3);
1614 +
1615 +
1616 +            if (N == 8)  /* remaining four luma taps */
1617 +            {
1618 +                vsum = vmlaq_laneq_s32(vsum,(input[4]),vc1,0);
1619 +                vsum = vmlaq_laneq_s32(vsum,(input[5]),vc1,1);
1620 +                vsum = vmlaq_laneq_s32(vsum,(input[6]),vc1,2);
1621 +                vsum = vmlaq_laneq_s32(vsum,(input[7]),vc1,3);
1622 +                
1623 +            }
1624 +            
1625 +            vsum = vshlq_s32(vsum, vhr);
1626 +            *(int16x4_t *)&dst[col] = vmovn_u32(vsum);  /* NOTE(review): vmovn_u32 on a signed vector — likely meant vmovn_s32; same bits, but confirm signedness */
1627 +        }
1628 +        
1629 +        src += srcStride;
1630 +        dst += dstStride;
1631 +    }
1632 +  }
1633 +
1634 +
1635 +#else
1636 +
1637 +template<int N, int width, int height>
1638 +void interp_horiz_ps_neon(const uint8_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)  /* horizontal N-tap interpolation, 8-bit pixel in / int16 intermediate out, 8 results per iteration */
1639 +{
1640 +    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];  /* 4 taps = chroma, 8 taps = luma */
1641 +    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1642 +    const int shift = IF_FILTER_PREC - headRoom;
1643 +    const int offset = (unsigned)-IF_INTERNAL_OFFS << shift;  /* negative internal offset pre-scaled by the shift */
1644 +
1645 +    int blkheight = height;
1646 +    src -= N / 2 - 1;  /* rewind so the filter window is centered */
1647 +
1648 +    if (isRowExt)  /* extend rows above/below for a following vertical pass */
1649 +    {
1650 +        src -= (N / 2 - 1) * srcStride;
1651 +        blkheight += N - 1;
1652 +    }
1653 +    int16x8_t vc;
1654 +    vc = *(int16x8_t *)coeff;
1655 +
1656 +    const int16x8_t voffset = vdupq_n_s16(offset);
1657 +    const int16x8_t vhr = vdupq_n_s16(-shift);  /* negative count: vshlq_s16 shifts right */
1658 +    
1659 +    int row, col;
1660 +    for (row = 0; row < blkheight; row++)
1661 +    {
1662 +        for (col = 0; col < width; col+=8)  /* NOTE(review): assumes width is a multiple of 8 — confirm with template instantiations */
1663 +        {
1664 +            int16x8_t vsum;  /* NOTE(review): accumulation stays in 16 bits — relies on 8-bit input range times tap magnitudes not overflowing int16; confirm */
1665 +            
1666 +            int16x8_t input[N];
1667 +            
1668 +            for (int i=0;i<N;i++)
1669 +            {
1670 +                input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i]);  /* widen 8-bit pixels to 16-bit lanes */
1671 +            }
1672 +            vsum = voffset;
1673 +            vsum = vmlaq_laneq_s16(vsum,(input[0]),vc,0);  /* one multiply-accumulate per tap */
1674 +            vsum = vmlaq_laneq_s16(vsum,(input[1]),vc,1);
1675 +            vsum = vmlaq_laneq_s16(vsum,(input[2]),vc,2);
1676 +            vsum = vmlaq_laneq_s16(vsum,(input[3]),vc,3);
1677 +
1678 +
1679 +            if (N == 8)  /* remaining four luma taps */
1680 +            {
1681 +                vsum = vmlaq_laneq_s16(vsum,(input[4]),vc,4);
1682 +                vsum = vmlaq_laneq_s16(vsum,(input[5]),vc,5);
1683 +                vsum = vmlaq_laneq_s16(vsum,(input[6]),vc,6);
1684 +                vsum = vmlaq_laneq_s16(vsum,(input[7]),vc,7);
1685 +                
1686 +            }
1687 +            
1688 +            vsum = vshlq_s16(vsum, vhr);
1689 +            *(int16x8_t *)&dst[col] = vsum;
1690 +        }
1691 +        
1692 +        src += srcStride;
1693 +        dst += dstStride;
1694 +    }
1695 +  }
1696 +
1697 +#endif
1698 +
1699 +
1700 +template<int N, int width, int height>
1701 +void interp_vert_ss_neon(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1702 +{
1703 +    const int16_t* c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
1704 +    int shift = IF_FILTER_PREC;
1705 +    src -= (N / 2 - 1) * srcStride;
1706 +      int16x8_t vc;
1707 +      vc = *(int16x8_t *)c;
1708 +      int16x4_t low_vc = vget_low_s16(vc);
1709 +      int16x4_t high_vc = vget_high_s16(vc);
1710 +
1711 +      const int32x4_t vhr = vdupq_n_s32(-shift);
1712 +      
1713 +      int row, col;
1714 +      for (row = 0; row < height; row++)
1715 +      {
1716 +          for (col = 0; col < width; col+=8)
1717 +          {
1718 +              int32x4_t vsum1,vsum2;
1719 +              
1720 +              int16x8_t input[N];
1721 +              
1722 +              for (int i=0;i<N;i++)
1723 +              {
1724 +                  input[i] = *(int16x8_t *)&src[col+i*srcStride];
1725 +              }
1726 +              
1727 +              vsum1 = vmull_lane_s16(vget_low_s16(input[0]),low_vc,0);
1728 +              vsum2 = vmull_high_lane_s16(input[0],low_vc,0);
1729 +              
1730 +              vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[1]),low_vc,1);
1731 +              vsum2 = vmlal_high_lane_s16(vsum2,input[1],low_vc,1);
1732 +
1733 +              vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[2]),low_vc,2);
1734 +              vsum2 = vmlal_high_lane_s16(vsum2,input[2],low_vc,2);
1735 +
1736 +              vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[3]),low_vc,3);
1737 +              vsum2 = vmlal_high_lane_s16(vsum2,input[3],low_vc,3);
1738 +
1739 +              if (N == 8)
1740 +              {
1741 +                  vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[4]),high_vc,0);
1742 +                  vsum2 = vmlal_high_lane_s16(vsum2,input[4],high_vc,0);
1743 +                  vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[5]),high_vc,1);
1744 +                  vsum2 = vmlal_high_lane_s16(vsum2,input[5],high_vc,1);
1745 +                  vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[6]),high_vc,2);
1746 +                  vsum2 = vmlal_high_lane_s16(vsum2,input[6],high_vc,2);
1747 +                  vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[7]),high_vc,3);
1748 +                  vsum2 = vmlal_high_lane_s16(vsum2,input[7],high_vc,3);
1749 +                  
1750 +              }
1751 +              
1752 +              vsum1 = vshlq_s32(vsum1, vhr);
1753 +              vsum2 = vshlq_s32(vsum2, vhr);
1754 +              
1755 +              int16x8_t vsum = vuzp1q_s16(vsum1,vsum2);
1756 +              *(int16x8_t *)&dst[col] = vsum;
1757 +          }
1758 +          
1759 +          src += srcStride;
1760 +          dst += dstStride;
1761 +      }
1762 +
1763 +}
1764 +
1765 +
1766 +#if HIGH_BIT_DEPTH
1767 +
1768 +template<int N, int width, int height>
1769 +void interp_vert_pp_neon(const uint16_t* src, intptr_t srcStride, uint16_t* dst, intptr_t dstStride, int coeffIdx)
1770 +{
1771 +    
1772 +    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1773 +    int shift = IF_FILTER_PREC;
1774 +    int offset = 1 << (shift - 1);
1775 +    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
1776 +
1777 +    src -= (N / 2 - 1) * srcStride;
1778 +    int16x8_t vc;
1779 +    vc = *(int16x8_t *)c;
1780 +    int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
1781 +    int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
1782 +
1783 +    const int32x4_t voffset = vdupq_n_s32(offset);
1784 +    const int32x4_t vhr = vdupq_n_s32(-shift);
1785 +    
1786 +    int row, col;
1787 +    for (row = 0; row < height; row++)
1788 +    {
1789 +        for (col = 0; col < width; col+=4)
1790 +        {
1791 +            int32x4_t vsum;
1792 +            
1793 +            int32x4_t input[N];
1794 +            
1795 +            for (int i=0;i<N;i++)
1796 +            {
1797 +                input[i] = vmovl_u16(*(uint16x4_t *)&src[col+i*srcStride]);
1798 +            }
1799 +            vsum = voffset;
1800 +            
1801 +            vsum = vmlaq_laneq_s32(vsum,(input[0]),low_vc,0);
1802 +            vsum = vmlaq_laneq_s32(vsum,(input[1]),low_vc,1);
1803 +            vsum = vmlaq_laneq_s32(vsum,(input[2]),low_vc,2);
1804 +            vsum = vmlaq_laneq_s32(vsum,(input[3]),low_vc,3);
1805 +
1806 +            if (N == 8)
1807 +            {
1808 +              vsum = vmlaq_laneq_s32(vsum,(input[4]),high_vc,0);
1809 +              vsum = vmlaq_laneq_s32(vsum,(input[5]),high_vc,1);
1810 +              vsum = vmlaq_laneq_s32(vsum,(input[6]),high_vc,2);
1811 +              vsum = vmlaq_laneq_s32(vsum,(input[7]),high_vc,3);
1812 +            }
1813 +            
1814 +            vsum = vshlq_s32(vsum, vhr);
1815 +            vsum = vminq_s32(vsum,vdupq_n_s32(maxVal));
1816 +            vsum = vmaxq_s32(vsum,vdupq_n_s32(0));
1817 +            *(uint16x4_t *)&dst[col] = vmovn_u32(vsum);
1818 +        }
1819 +        src += srcStride;
1820 +        dst += dstStride;
1821 +    }
1822 +}
1823 +
1824 +
1825 +
1826 +
1827 +#else
1828 +
1829 +template<int N, int width, int height>
1830 +void interp_vert_pp_neon(const uint8_t* src, intptr_t srcStride, uint8_t* dst, intptr_t dstStride, int coeffIdx)
1831 +{
1832 +    
1833 +    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1834 +    int shift = IF_FILTER_PREC;
1835 +    int offset = 1 << (shift - 1);
1836 +    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
1837 +
1838 +    src -= (N / 2 - 1) * srcStride;
1839 +    int16x8_t vc;
1840 +    vc = *(int16x8_t *)c;
1841 +
1842 +    const int16x8_t voffset = vdupq_n_s16(offset);
1843 +    const int16x8_t vhr = vdupq_n_s16(-shift);
1844 +    
1845 +    int row, col;
1846 +    for (row = 0; row < height; row++)
1847 +    {
1848 +        for (col = 0; col < width; col+=8)
1849 +        {
1850 +            int16x8_t vsum;
1851 +            
1852 +            int16x8_t input[N];
1853 +            
1854 +            for (int i=0;i<N;i++)
1855 +            {
1856 +                input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i*srcStride]);
1857 +            }
1858 +            vsum = voffset;
1859 +            
1860 +            vsum = vmlaq_laneq_s16(vsum,(input[0]),vc,0);
1861 +            vsum = vmlaq_laneq_s16(vsum,(input[1]),vc,1);
1862 +            vsum = vmlaq_laneq_s16(vsum,(input[2]),vc,2);
1863 +            vsum = vmlaq_laneq_s16(vsum,(input[3]),vc,3);
1864 +
1865 +            if (N == 8)
1866 +            {
1867 +              vsum = vmlaq_laneq_s16(vsum,(input[4]),vc,4);
1868 +              vsum = vmlaq_laneq_s16(vsum,(input[5]),vc,5);
1869 +              vsum = vmlaq_laneq_s16(vsum,(input[6]),vc,6);
1870 +              vsum = vmlaq_laneq_s16(vsum,(input[7]),vc,7);
1871 +
1872 +            }
1873 +            
1874 +            vsum = vshlq_s16(vsum, vhr);
1875 +            
1876 +            vsum = vminq_s16(vsum,vdupq_n_s16(maxVal));
1877 +            vsum = vmaxq_s16(vsum,vdupq_n_s16(0));
1878 +            uint8x16_t usum = vuzp1q_u8(vsum,vsum);
1879 +            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
1880 +          
1881 +        }
1882 +        
1883 +        src += srcStride;
1884 +        dst += dstStride;
1885 +    }
1886 +}
1887 +
1888 +
1889 +#endif
1890 +
1891 +
1892 +#if HIGH_BIT_DEPTH
1893 +
1894 +template<int N, int width, int height>
1895 +void interp_vert_ps_neon(const uint16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1896 +{
1897 +    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1898 +    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1899 +    int shift = IF_FILTER_PREC - headRoom;
1900 +    int offset = (unsigned)-IF_INTERNAL_OFFS << shift;
1901 +    src -= (N / 2 - 1) * srcStride;
1902 +
1903 +    int16x8_t vc;
1904 +    vc = *(int16x8_t *)c;
1905 +    int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
1906 +    int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
1907 +
1908 +    const int32x4_t voffset = vdupq_n_s32(offset);
1909 +    const int32x4_t vhr = vdupq_n_s32(-shift);
1910 +    
1911 +    int row, col;
1912 +    for (row = 0; row < height; row++)
1913 +    {
1914 +        for (col = 0; col < width; col+=4)
1915 +        {
1916 +            int32x4_t vsum;
1917 +            
1918 +            int32x4_t input[N];
1919 +            
1920 +            for (int i=0;i<N;i++)
1921 +            {
1922 +                input[i] = vmovl_u16(*(uint16x4_t *)&src[col+i*srcStride]);
1923 +            }
1924 +            vsum = voffset;
1925 +            
1926 +            vsum = vmlaq_laneq_s32(vsum,(input[0]),low_vc,0);
1927 +            vsum = vmlaq_laneq_s32(vsum,(input[1]),low_vc,1);
1928 +            vsum = vmlaq_laneq_s32(vsum,(input[2]),low_vc,2);
1929 +            vsum = vmlaq_laneq_s32(vsum,(input[3]),low_vc,3);
1930 +
1931 +            if (N == 8)
1932 +            {
1933 +                int32x4_t  vsum1 = vmulq_laneq_s32((input[4]),high_vc,0);
1934 +                vsum1 = vmlaq_laneq_s32(vsum1,(input[5]),high_vc,1);
1935 +                vsum1 = vmlaq_laneq_s32(vsum1,(input[6]),high_vc,2);
1936 +                vsum1 = vmlaq_laneq_s32(vsum1,(input[7]),high_vc,3);
1937 +                vsum = vaddq_s32(vsum,vsum1);
1938 +            }
1939 +            
1940 +            vsum = vshlq_s32(vsum, vhr);
1941 +            
1942 +            *(uint16x4_t *)&dst[col] = vmovn_s32(vsum);
1943 +        }
1944 +        
1945 +        src += srcStride;
1946 +        dst += dstStride;
1947 +    }
1948 +}
1949 +
1950 +#else
1951 +
1952 +template<int N, int width, int height>
1953 +void interp_vert_ps_neon(const uint8_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1954 +{
1955 +    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1956 +    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1957 +    int shift = IF_FILTER_PREC - headRoom;
1958 +    int offset = (unsigned)-IF_INTERNAL_OFFS << shift;
1959 +    src -= (N / 2 - 1) * srcStride;
1960 +
1961 +    int16x8_t vc;
1962 +    vc = *(int16x8_t *)c;
1963 +
1964 +    const int16x8_t voffset = vdupq_n_s16(offset);
1965 +    const int16x8_t vhr = vdupq_n_s16(-shift);
1966 +    
1967 +    int row, col;
1968 +    for (row = 0; row < height; row++)
1969 +    {
1970 +        for (col = 0; col < width; col+=8)
1971 +        {
1972 +            int16x8_t vsum;
1973 +            
1974 +            int16x8_t input[N];
1975 +            
1976 +            for (int i=0;i<N;i++)
1977 +            {
1978 +                input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i*srcStride]);
1979 +            }
1980 +            vsum = voffset;
1981 +            
1982 +            vsum = vmlaq_laneq_s16(vsum,(input[0]),vc,0);
1983 +            vsum = vmlaq_laneq_s16(vsum,(input[1]),vc,1);
1984 +            vsum = vmlaq_laneq_s16(vsum,(input[2]),vc,2);
1985 +            vsum = vmlaq_laneq_s16(vsum,(input[3]),vc,3);
1986 +
1987 +            if (N == 8)
1988 +            {
1989 +                int16x8_t  vsum1 = vmulq_laneq_s16((input[4]),vc,4);
1990 +                vsum1 = vmlaq_laneq_s16(vsum1,(input[5]),vc,5);
1991 +                vsum1 = vmlaq_laneq_s16(vsum1,(input[6]),vc,6);
1992 +                vsum1 = vmlaq_laneq_s16(vsum1,(input[7]),vc,7);
1993 +                vsum = vaddq_s16(vsum,vsum1);
1994 +            }
1995 +            
1996 +            vsum = vshlq_s16(vsum, vhr);
1997 +            *(int16x8_t *)&dst[col] = vsum;
1998 +        }
1999 +        
2000 +        src += srcStride;
2001 +        dst += dstStride;
2002 +    }
2003 +}
2004 +
2005 +#endif
2006 +
2007 +
2008 +
2009 +template<int N, int width, int height>
2010 +void interp_vert_sp_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
2011 +{
2012 +    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
2013 +    int shift = IF_FILTER_PREC + headRoom;
2014 +    int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
2015 +    uint16_t maxVal = (1 << X265_DEPTH) - 1;
2016 +    const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
2017 +
2018 +    src -= (N / 2 - 1) * srcStride;
2019 +
2020 +    int16x8_t vc;
2021 +    vc = *(int16x8_t *)coeff;
2022 +    int16x4_t low_vc = vget_low_s16(vc);
2023 +    int16x4_t high_vc = vget_high_s16(vc);
2024 +
2025 +    const int32x4_t voffset = vdupq_n_s32(offset);
2026 +    const int32x4_t vhr = vdupq_n_s32(-shift);
2027 +
2028 +    int row, col;
2029 +    for (row = 0; row < height; row++)
2030 +    {
2031 +        for (col = 0; col < width; col+=8)
2032 +        {
2033 +            int32x4_t vsum1,vsum2;
2034 +
2035 +            int16x8_t input[N];
2036 +
2037 +            for (int i=0;i<N;i++)
2038 +            {
2039 +                input[i] = *(int16x8_t *)&src[col+i*srcStride];
2040 +            }
2041 +            vsum1 = voffset;
2042 +            vsum2 = voffset;
2043 +
2044 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[0]),low_vc,0);
2045 +            vsum2 = vmlal_high_lane_s16(vsum2,input[0],low_vc,0);
2046 +
2047 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[1]),low_vc,1);
2048 +            vsum2 = vmlal_high_lane_s16(vsum2,input[1],low_vc,1);
2049 +
2050 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[2]),low_vc,2);
2051 +            vsum2 = vmlal_high_lane_s16(vsum2,input[2],low_vc,2);
2052 +
2053 +            vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[3]),low_vc,3);
2054 +            vsum2 = vmlal_high_lane_s16(vsum2,input[3],low_vc,3);
2055 +
2056 +            if (N == 8)
2057 +            {
2058 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[4]),high_vc,0);
2059 +                vsum2 = vmlal_high_lane_s16(vsum2,input[4],high_vc,0);
2060 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[5]),high_vc,1);
2061 +                vsum2 = vmlal_high_lane_s16(vsum2,input[5],high_vc,1);
2062 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[6]),high_vc,2);
2063 +                vsum2 = vmlal_high_lane_s16(vsum2,input[6],high_vc,2);
2064 +                vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[7]),high_vc,3);
2065 +                vsum2 = vmlal_high_lane_s16(vsum2,input[7],high_vc,3);
2066 +            }
2067 +
2068 +            vsum1 = vshlq_s32(vsum1, vhr);
2069 +            vsum2 = vshlq_s32(vsum2, vhr);
2070 +
2071 +            int16x8_t vsum = vuzp1q_s16(vsum1,vsum2);
2072 +            vsum = vminq_s16(vsum,vdupq_n_s16(maxVal));
2073 +            vsum = vmaxq_s16(vsum,vdupq_n_s16(0));
2074 +#if HIGH_BIT_DEPTH
2075 +            *(int16x8_t *)&dst[col] = vsum;
2076 +#else
2077 +            uint8x16_t usum = vuzp1q_u8(vsum,vsum);
2078 +            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
2079 +#endif
2080 +
2081 +        }
2082 +          
2083 +        src += srcStride;
2084 +        dst += dstStride;
2085 +    }
2086 +}
2087 +
2088 +
2089 +
2090 +
2091 +
2092 +
2093 +template<int N, int width, int height>
2094 +void interp_hv_pp_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
2095 +{
2096 +    ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
2097 +
2098 +    interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
2099 +    interp_vert_sp_neon<N,width,height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
2100 +}
2101 +
2102 +
2103 +
2104 +}
2105 +
2106 +
2107 +
2108 +
2109 +namespace X265_NS {
2110 +   #define CHROMA_420(W, H) \
2111 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
2112 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
2113 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
2114 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
2115 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
2116 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
2117 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2118 +       p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2119 +
2120 +   #define CHROMA_422(W, H) \
2121 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
2122 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
2123 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
2124 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
2125 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
2126 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
2127 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2128 +       p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2129 +
2130 +   #define CHROMA_444(W, H) \
2131 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
2132 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
2133 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
2134 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
2135 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
2136 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
2137 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2138 +       p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2139 +
2140 +   #define LUMA(W, H) \
2141 +       p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_neon<8, W, H>; \
2142 +       p.pu[LUMA_ ## W ## x ## H].luma_hps     = interp_horiz_ps_neon<8, W, H>; \
2143 +       p.pu[LUMA_ ## W ## x ## H].luma_vpp     = interp_vert_pp_neon<8, W, H>;  \
2144 +       p.pu[LUMA_ ## W ## x ## H].luma_vps     = interp_vert_ps_neon<8, W, H>;  \
2145 +       p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_neon<8, W, H>;  \
2146 +       p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_neon<8, W, H>;  \
2147 +       p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_neon<8, W, H>; \
2148 +       p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2149 +       p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2150 +
2151 +  
2152 +void setupFilterPrimitives_neon(EncoderPrimitives &p)
2153 +{
2154 +  
2155 +  // All neon functions assume width of multiple of 8, (2,4,12 variants are not optimized)
2156 +  
2157 +  LUMA(8, 8);
2158 +  LUMA(8, 4);
2159 +  LUMA(16, 16);
2160 +  CHROMA_420(8,  8);
2161 +  LUMA(16,  8);
2162 +  CHROMA_420(8,  4);
2163 +  LUMA(8, 16);
2164 +  LUMA(16, 12);
2165 +  CHROMA_420(8,  6);
2166 +  LUMA(16,  4);
2167 +  CHROMA_420(8,  2);
2168 +  LUMA(32, 32);
2169 +  CHROMA_420(16, 16);
2170 +  LUMA(32, 16);
2171 +  CHROMA_420(16, 8);
2172 +  LUMA(16, 32);
2173 +  CHROMA_420(8,  16);
2174 +  LUMA(32, 24);
2175 +  CHROMA_420(16, 12);
2176 +  LUMA(24, 32);
2177 +  LUMA(32,  8);
2178 +  CHROMA_420(16, 4);
2179 +  LUMA(8, 32);
2180 +  LUMA(64, 64);
2181 +  CHROMA_420(32, 32);
2182 +  LUMA(64, 32);
2183 +  CHROMA_420(32, 16);
2184 +  LUMA(32, 64);
2185 +  CHROMA_420(16, 32);
2186 +  LUMA(64, 48);
2187 +  CHROMA_420(32, 24);
2188 +  LUMA(48, 64);
2189 +  CHROMA_420(24, 32);
2190 +  LUMA(64, 16);
2191 +  CHROMA_420(32, 8);
2192 +  LUMA(16, 64);
2193 +  CHROMA_420(8,  32);
2194 +  CHROMA_422(8,  16);
2195 +  CHROMA_422(8,  8);
2196 +  CHROMA_422(8,  12);
2197 +  CHROMA_422(8,  4);
2198 +  CHROMA_422(16, 32);
2199 +  CHROMA_422(16, 16);
2200 +  CHROMA_422(8,  32);
2201 +  CHROMA_422(16, 24);
2202 +  CHROMA_422(16, 8);
2203 +  CHROMA_422(32, 64);
2204 +  CHROMA_422(32, 32);
2205 +  CHROMA_422(16, 64);
2206 +  CHROMA_422(32, 48);
2207 +  CHROMA_422(24, 64);
2208 +  CHROMA_422(32, 16);
2209 +  CHROMA_422(8,  64);
2210 +  CHROMA_444(8,  8);
2211 +  CHROMA_444(8,  4);
2212 +  CHROMA_444(16, 16);
2213 +  CHROMA_444(16, 8);
2214 +  CHROMA_444(8,  16);
2215 +  CHROMA_444(16, 12);
2216 +  CHROMA_444(16, 4);
2217 +  CHROMA_444(32, 32);
2218 +  CHROMA_444(32, 16);
2219 +  CHROMA_444(16, 32);
2220 +  CHROMA_444(32, 24);
2221 +  CHROMA_444(24, 32);
2222 +  CHROMA_444(32, 8);
2223 +  CHROMA_444(8,  32);
2224 +  CHROMA_444(64, 64);
2225 +  CHROMA_444(64, 32);
2226 +  CHROMA_444(32, 64);
2227 +  CHROMA_444(64, 48);
2228 +  CHROMA_444(48, 64);
2229 +  CHROMA_444(64, 16);
2230 +  CHROMA_444(16, 64);
2231 +
2232 +}
2233 +
2234 +};
2235 +
2236 +
2237 +#endif
2238 +
2239 +
2240 diff -Naur ./source/common/arm64/filter-prim.h ../x265_apple_patch/source/common/arm64/filter-prim.h
2241 --- ./source/common/arm64/filter-prim.h 1970-01-01 01:00:00.000000000 +0100
2242 +++ ../x265_apple_patch/source/common/arm64/filter-prim.h       2021-05-08 13:08:01.000000000 +0100
2243 @@ -0,0 +1,20 @@
2244 +#ifndef X265_FILTER_PRIM_ARM64_H
2245 +#define X265_FILTER_PRIM_ARM64_H
2246 +
2247 +
2248 +#include "common.h"
2249 +#include "slicetype.h"      // LOWRES_COST_MASK
2250 +#include "primitives.h"
2251 +#include "x265.h"
2252 +
2253 +
2254 +namespace X265_NS {
2255 +   
2256 +  
2257 +void setupFilterPrimitives_neon(EncoderPrimitives &p);
2258 +
2259 +};
2260 +
2261 +
2262 +#endif
2263 +
2264 diff -Naur ./source/common/arm64/intrapred-prim.cpp ../x265_apple_patch/source/common/arm64/intrapred-prim.cpp
2265 --- ./source/common/arm64/intrapred-prim.cpp    1970-01-01 01:00:00.000000000 +0100
2266 +++ ../x265_apple_patch/source/common/arm64/intrapred-prim.cpp  2021-05-08 13:08:01.000000000 +0100
2267 @@ -0,0 +1,266 @@
2268 +/*****************************************************************************
2269 + * Copyright (C) 2013-2017 MulticoreWare, Inc
2270 + *
2271 + * Authors: Min Chen <chenm003@163.com>
2272 + *
2273 + * This program is free software; you can redistribute it and/or modify
2274 + * it under the terms of the GNU General Public License as published by
2275 + * the Free Software Foundation; either version 2 of the License, or
2276 + * (at your option) any later version.
2277 + *
2278 + * This program is distributed in the hope that it will be useful,
2279 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2280 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
2281 + * GNU General Public License for more details.
2282 + *
2283 + * You should have received a copy of the GNU General Public License
2284 + * along with this program; if not, write to the Free Software
2285 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
2286 + *
2287 + * This program is also available under a commercial proprietary license.
2288 + * For more information, contact us at license @ x265.com.
2289 + *****************************************************************************/
2290 +
2291 +
2292 +#include "common.h"
2293 +#include "primitives.h"
2294 +
2295 +
2296 +#if 1
2297 +#include "arm64-utils.h"
2298 +#include <arm_neon.h>
2299 +
2300 +using namespace X265_NS;
2301 +
2302 +namespace {
2303 +
2304 +
2305 +
2306 +template<int width>
2307 +void intra_pred_ang_neon(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
2308 +{
2309 +    int width2 = width << 1;
2310 +    // Flip the neighbours in the horizontal case.
2311 +    int horMode = dirMode < 18;
2312 +    pixel neighbourBuf[129];
2313 +    const pixel *srcPix = srcPix0;
2314 +
2315 +    if (horMode)
2316 +    {
2317 +        neighbourBuf[0] = srcPix[0];
2318 +        //for (int i = 0; i < width << 1; i++)
2319 +        //{
2320 +        //    neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
2321 +        //    neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
2322 +        //}
2323 +        memcpy(&neighbourBuf[1],&srcPix[width2+1],sizeof(pixel)*(width << 1));
2324 +        memcpy(&neighbourBuf[width2 + 1],&srcPix[1],sizeof(pixel)*(width << 1));
2325 +        srcPix = neighbourBuf;
2326 +    }
2327 +
2328 +    // Intra prediction angle and inverse angle tables.
2329 +    const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
2330 +    const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
2331 +
2332 +    // Get the prediction angle.
2333 +    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
2334 +    int angle = angleTable[8 + angleOffset];
2335 +
2336 +    // Vertical Prediction.
2337 +    if (!angle)
2338 +    {
2339 +        for (int y = 0; y < width; y++) {
2340 +            memcpy(&dst[y * dstStride],srcPix + 1,sizeof(pixel)*width);
2341 +        }
2342 +        if (bFilter)
2343 +        {
2344 +            int topLeft = srcPix[0], top = srcPix[1];
2345 +            for (int y = 0; y < width; y++)
2346 +                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
2347 +        }
2348 +    }
2349 +    else // Angular prediction.
2350 +    {
2351 +        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
2352 +        pixel refBuf[64];
2353 +        const pixel *ref;
2354 +
2355 +        // Use the projected left neighbours and the top neighbours.
2356 +        if (angle < 0)
2357 +        {
2358 +            // Number of neighbours projected.
2359 +            int nbProjected = -((width * angle) >> 5) - 1;
2360 +            pixel *ref_pix = refBuf + nbProjected + 1;
2361 +
2362 +            // Project the neighbours.
2363 +            int invAngle = invAngleTable[- angleOffset - 1];
2364 +            int invAngleSum = 128;
2365 +            for (int i = 0; i < nbProjected; i++)
2366 +            {
2367 +                invAngleSum += invAngle;
2368 +                ref_pix[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)];
2369 +            }
2370 +
2371 +            // Copy the top-left and top pixels.
2372 +            //for (int i = 0; i < width + 1; i++)
2373 +                //ref_pix[-1 + i] = srcPix[i];
2374 +            
2375 +            memcpy(&ref_pix[-1],srcPix,(width+1)*sizeof(pixel));
2376 +            ref = ref_pix;
2377 +        }
2378 +        else // Use the top and top-right neighbours.
2379 +            ref = srcPix + 1;
2380 +
2381 +        // Pass every row.
2382 +        int angleSum = 0;
2383 +        for (int y = 0; y < width; y++)
2384 +        {
2385 +            angleSum += angle;
2386 +            int offset = angleSum >> 5;
2387 +            int fraction = angleSum & 31;
2388 +
2389 +            if (fraction) // Interpolate
2390 +            {
2391 +                if (width >= 8 && sizeof(pixel) == 1)
2392 +                {
2393 +                    const int16x8_t f0 = vdupq_n_s16(32-fraction);
2394 +                    const int16x8_t f1 = vdupq_n_s16(fraction);
2395 +                    for (int x = 0;x<width;x+=8) {
2396 +                        uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
2397 +                        uint8x8_t in1 = *(uint8x8_t *)&ref[offset+ x + 1];
2398 +                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16),vmovl_u8(in0),f0);
2399 +                        lo = vmlaq_s16(lo,vmovl_u8(in1),f1);
2400 +                        lo = vshrq_n_s16(lo,5);
2401 +                        *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
2402 +                    }
2403 +                }
2404 +                else if (width >= 4 && sizeof(pixel) == 2)
2405 +                {
2406 +                    const int32x4_t f0 = vdupq_n_s32(32-fraction);
2407 +                    const int32x4_t f1 = vdupq_n_s32(fraction);
2408 +                    for (int x = 0;x<width;x+=4) {
2409 +                        uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
2410 +                        uint16x4_t in1 = *(uint16x4_t *)&ref[offset+ x + 1];
2411 +                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16),vmovl_u16(in0),f0);
2412 +                        lo = vmlaq_s32(lo,vmovl_u16(in1),f1);
2413 +                        lo = vshrq_n_s32(lo,5);
2414 +                        *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
2415 +                    }
2416 +                }
2417 +                else {
2418 +                    for (int x = 0; x < width; x++)
2419 +                        dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
2420 +                }
2421 +            }
2422 +            else // Copy.
2423 +            {
2424 +                memcpy(&dst[y * dstStride],&ref[offset],sizeof(pixel)*width);
2425 +            }
2426 +        }
2427 +    }
2428 +
2429 +    // Flip for horizontal.
2430 +    if (horMode)
2431 +    {
2432 +        if (width == 8)  transpose8x8(dst,dst,dstStride,dstStride);
2433 +        else if (width == 16) transpose16x16(dst,dst,dstStride,dstStride);
2434 +        else if (width == 32) transpose32x32(dst,dst,dstStride,dstStride);
2435 +        else {
2436 +            for (int y = 0; y < width - 1; y++)
2437 +            {
2438 +                for (int x = y + 1; x < width; x++)
2439 +                {
2440 +                    pixel tmp              = dst[y * dstStride + x];
2441 +                    dst[y * dstStride + x] = dst[x * dstStride + y];
2442 +                    dst[x * dstStride + y] = tmp;
2443 +                }
2444 +            }
2445 +        }
2446 +    }
2447 +}
2448 +
2449 +template<int log2Size>
2450 +void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
2451 +{
2452 +    const int size = 1 << log2Size;
2453 +    for (int mode = 2; mode <= 34; mode++)
2454 +    {
2455 +        pixel *srcPix  = (g_intraFilterFlags[mode] & size ? filtPix  : refPix);
2456 +        pixel *out = dest + ((mode - 2) << (log2Size * 2));
2457 +
2458 +        intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
2459 +
2460 +        // Optimize code don't flip buffer
2461 +        bool modeHor = (mode < 18);
2462 +
2463 +        // transpose the block if this is a horizontal mode
2464 +        if (modeHor)
2465 +        {
2466 +            if (size == 8) transpose8x8(out,out,size,size);
2467 +            else if (size == 16) transpose16x16(out,out,size,size);
2468 +            else if (size == 32) transpose32x32(out,out,size,size);
2469 +            else {
2470 +                for (int k = 0; k < size - 1; k++)
2471 +                {
2472 +                    for (int l = k + 1; l < size; l++)
2473 +                    {
2474 +                        pixel tmp         = out[k * size + l];
2475 +                        out[k * size + l] = out[l * size + k];
2476 +                        out[l * size + k] = tmp;
2477 +                    }
2478 +                }
2479 +            }
2480 +        }
2481 +    }
2482 +}
2483 +}
2484 +
2485 +namespace X265_NS {
2486 +// x265 private namespace
2487 +
2488 +void setupIntraPrimitives_neon(EncoderPrimitives& p)
2489 +{
2490 +//    p.cu[BLOCK_4x4].intra_filter = intraFilter<4>;
2491 +//    p.cu[BLOCK_8x8].intra_filter = intraFilter<8>;
2492 +//    p.cu[BLOCK_16x16].intra_filter = intraFilter<16>;
2493 +//    p.cu[BLOCK_32x32].intra_filter = intraFilter<32>;
2494 +
2495 +//    p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_neon<2>;
2496 +//    p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_neon<3>;
2497 +//    p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_neon<4>;
2498 +//    p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = planar_pred_neon<5>;
2499 +//
2500 +//    p.cu[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_neon<4>;
2501 +//    p.cu[BLOCK_8x8].intra_pred[DC_IDX] = intra_pred_dc_neon<8>;
2502 +//    p.cu[BLOCK_16x16].intra_pred[DC_IDX] = intra_pred_dc_neon<16>;
2503 +//    p.cu[BLOCK_32x32].intra_pred[DC_IDX] = intra_pred_dc_neon<32>;
2504 +
2505 +    for (int i = 2; i < NUM_INTRA_MODE; i++)
2506 +    {
2507 +        p.cu[BLOCK_4x4].intra_pred[i] = intra_pred_ang_neon<4>;
2508 +        p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_neon<8>;
2509 +        p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_neon<16>;
2510 +        p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_neon<32>;
2511 +    }
2512 +
2513 +    p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_neon<2>;
2514 +    p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_neon<3>;
2515 +    p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_neon<4>;
2516 +    p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
2517 +}
2518 +}
2519 +
2520 +
2521 +
2522 +#else
2523 +
2524 +namespace X265_NS {
2525 +// x265 private namespace
2526 +void setupIntraPrimitives_neon(EncoderPrimitives& p)
2527 +{}
2528 +}
2529 +
2530 +#endif
2531 +
2532 +
2533 +
2534 diff -Naur ./source/common/arm64/intrapred-prim.h ../x265_apple_patch/source/common/arm64/intrapred-prim.h
2535 --- ./source/common/arm64/intrapred-prim.h      1970-01-01 01:00:00.000000000 +0100
2536 +++ ../x265_apple_patch/source/common/arm64/intrapred-prim.h    2021-05-08 13:08:01.000000000 +0100
2537 @@ -0,0 +1,14 @@
2538 +#ifndef INTRAPRED_PRIM_H__
2539 +
2540 +#if defined(__aarch64__)
2541 +
2542 +namespace X265_NS {
2543 +// x265 private namespace
2544 +
2545 +void setupIntraPrimitives_neon(EncoderPrimitives& p);
2546 +}
2547 +
2548 +#endif
2549 +
2550 +#endif
2551 +
2552 diff -Naur ./source/common/arm64/loopfilter-prim.cpp ../x265_apple_patch/source/common/arm64/loopfilter-prim.cpp
2553 --- ./source/common/arm64/loopfilter-prim.cpp   1970-01-01 01:00:00.000000000 +0100
2554 +++ ../x265_apple_patch/source/common/arm64/loopfilter-prim.cpp 2021-05-08 13:08:01.000000000 +0100
2555 @@ -0,0 +1,305 @@
2556 +/*****************************************************************************
2557 +* Copyright (C) 2013-2017 MulticoreWare, Inc
2558 +*
2559 +* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
2560 +*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
2561 +*          Min Chen <chenm003@163.com>
2562 +*
2563 +* This program is free software; you can redistribute it and/or modify
2564 +* it under the terms of the GNU General Public License as published by
2565 +* the Free Software Foundation; either version 2 of the License, or
2566 +* (at your option) any later version.
2567 +*
2568 +* This program is distributed in the hope that it will be useful,
2569 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
2570 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
2571 +* GNU General Public License for more details.
2572 +*
2573 +* You should have received a copy of the GNU General Public License
2574 +* along with this program; if not, write to the Free Software
2575 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
2576 +*
2577 +* This program is also available under a commercial proprietary license.
2578 +* For more information, contact us at license @ x265.com.
2579 +*****************************************************************************/
2580 +#include "loopfilter-prim.h"
2581 +
2582 +#define PIXEL_MIN 0
2583 +
2584 +
2585 +
2586 +#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
2587 +#include<arm_neon.h>
2588 +
2589 +namespace {
2590 +
2591 +
2592 +/* get the sign of input variable (TODO: this is a dup, make common) */
2593 +static inline int8_t signOf(int x)
2594 +{
2595 +    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
2596 +}
2597 +
2598 +static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
2599 +{
2600 +  int16x8_t in = vsubl_u8(in0,in1);
2601 +  return vmovn_s16(vmaxq_s16(vminq_s16(in,vdupq_n_s16(1)),vdupq_n_s16(-1)));
2602 +}
2603 +
2604 +static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
2605 +{
2606 +  int x = 0;
2607 +  for (; (x + 8) <= endX; x += 8) {
2608 +    *(int8x8_t *)&dst[x]  = sign_diff_neon(*(uint8x8_t *)&src1[x],*(uint8x8_t *)&src2[x]);
2609 +  }
2610 +
2611 +    for (; x < endX; x++)
2612 +        dst[x] = signOf(src1[x] - src2[x]);
2613 +}
2614 +
2615 +static void processSaoCUE0_neon(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
2616 +{
2617 +
2618 +  
2619 +    int y;
2620 +    int8_t signRight, signLeft0;
2621 +    int8_t edgeType;
2622 +
2623 +    for (y = 0; y < 2; y++)
2624 +    {
2625 +        signLeft0 = signLeft[y];
2626 +        int x = 0;
2627 +
2628 +        if (width >= 8) {
2629 +            int8x8_t vsignRight;
2630 +            int8x8x2_t shifter;
2631 +            shifter.val[1][0] = signLeft0;
2632 +            static const int8x8_t index = {8,0,1,2,3,4,5,6};
2633 +            int8x8_t tbl = *(int8x8_t *)offsetEo;
2634 +            for (; (x+8) <= width; x+=8)
2635 +            {
2636 +                uint8x8_t in = *(uint8x8_t *)&rec[x];
2637 +                vsignRight = sign_diff_neon(in,*(uint8x8_t *)&rec[x+1]);
2638 +                shifter.val[0] = vneg_s8(vsignRight);
2639 +                int8x8_t tmp = shifter.val[0];
2640 +                int8x8_t edge = vtbl2_s8(shifter,index);
2641 +                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight,edge),vdup_n_s8(2));
2642 +                shifter.val[1][0] = tmp[7];
2643 +                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2644 +                t1 = vaddw_u8(t1,in);
2645 +                t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2646 +                t1 = vminq_s16(t1,vdupq_n_s16(255));
2647 +                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2648 +            }
2649 +            signLeft0 = shifter.val[1][0];
2650 +        }
2651 +        for (; x < width; x++)
2652 +        {
2653 +            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
2654 +            edgeType = signRight + signLeft0 + 2;
2655 +            signLeft0 = -signRight;
2656 +            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2657 +        }
2658 +        rec += stride;
2659 +    }
2660 +}
2661 +
2662 +static void processSaoCUE1_neon(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
2663 +{
2664 +    int x = 0;
2665 +    int8_t signDown;
2666 +    int edgeType;
2667 +
2668 +    if (width >= 8) {
2669 +        int8x8_t tbl = *(int8x8_t *)offsetEo;
2670 +        for (; (x+8) <= width; x+=8)
2671 +        {
2672 +            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2673 +            uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride];
2674 +            int8x8_t vsignDown = sign_diff_neon(in0,in1);
2675 +            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&upBuff1[x]),vdup_n_s8(2));
2676 +            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
2677 +            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2678 +            t1 = vaddw_u8(t1,in0);
2679 +            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
2680 +      }
2681 +    }
2682 +    for (; x < width; x++)
2683 +    {
2684 +        signDown = signOf(rec[x] - rec[x + stride]);
2685 +        edgeType = signDown + upBuff1[x] + 2;
2686 +        upBuff1[x] = -signDown;
2687 +        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2688 +    }
2689 +}
2690 +
2691 +static void processSaoCUE1_2Rows_neon(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
2692 +{
2693 +    int y;
2694 +    int8_t signDown;
2695 +    int edgeType;
2696 +
2697 +    for (y = 0; y < 2; y++)
2698 +    {
2699 +      int x=0;
2700 +      if (width >= 8) {
2701 +        int8x8_t tbl = *(int8x8_t *)offsetEo;
2702 +        for (; (x+8) <= width; x+=8)
2703 +        {
2704 +          uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2705 +          uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride];
2706 +          int8x8_t vsignDown = sign_diff_neon(in0,in1);
2707 +          int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&upBuff1[x]),vdup_n_s8(2));
2708 +          *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
2709 +          int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2710 +          t1 = vaddw_u8(t1,in0);
2711 +          t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2712 +          t1 = vminq_s16(t1,vdupq_n_s16(255));
2713 +          *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2714 +
2715 +        }
2716 +      }
2717 +      for (; x < width; x++)
2718 +      {
2719 +          signDown = signOf(rec[x] - rec[x + stride]);
2720 +          edgeType = signDown + upBuff1[x] + 2;
2721 +          upBuff1[x] = -signDown;
2722 +          rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2723 +      }
2724 +      rec += stride;
2725 +  }
2726 +}
2727 +
2728 +static void processSaoCUE2_neon(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
2729 +{
2730 +    int x;
2731 +  
2732 +    if (abs(buff1-bufft) < 16)
2733 +    {
2734 +      for (x = 0; x < width; x++)
2735 +      {
2736 +          int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
2737 +          int edgeType = signDown + buff1[x] + 2;
2738 +          bufft[x + 1] = -signDown;
2739 +          rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
2740 +      }
2741 +    }
2742 +    else
2743 +    {
2744 +      int8x8_t tbl = *(int8x8_t *)offsetEo;
2745 +      x=0;
2746 +      for (; (x + 8) <= width; x+=8)
2747 +      {
2748 +          uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2749 +          uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride+1];
2750 +          int8x8_t vsignDown = sign_diff_neon(in0,in1);
2751 +          int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&buff1[x]),vdup_n_s8(2));
2752 +          *(int8x8_t *)&bufft[x+1] = vneg_s8(vsignDown);
2753 +          int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2754 +          t1 = vaddw_u8(t1,in0);
2755 +          t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2756 +          t1 = vminq_s16(t1,vdupq_n_s16(255));
2757 +          *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2758 +      }
2759 +      for (; x < width; x++)
2760 +      {
2761 +          int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
2762 +          int edgeType = signDown + buff1[x] + 2;
2763 +          bufft[x + 1] = -signDown;
2764 +          rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
2765 +      }
2766 +
2767 +    }
2768 +}
2769 +
2770 +
2771 +static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
2772 +{
2773 +    int8_t signDown;
2774 +    int8_t edgeType;
2775 +  int8x8_t tbl = *(int8x8_t *)offsetEo;
2776 +
2777 +    int x = startX + 1;
2778 +  for (; (x+8) <= endX; x+=8 )
2779 +  {
2780 +    uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2781 +    uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride];
2782 +    int8x8_t vsignDown = sign_diff_neon(in0,in1);
2783 +    int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&upBuff1[x]),vdup_n_s8(2));
2784 +    *(int8x8_t *)&upBuff1[x-1] = vneg_s8(vsignDown);
2785 +    int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2786 +    t1 = vaddw_u8(t1,in0);
2787 +    t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2788 +    t1 = vminq_s16(t1,vdupq_n_s16(255));
2789 +    *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2790 +
2791 +  }
2792 +    for (; x < endX; x++)
2793 +    {
2794 +        signDown = signOf(rec[x] - rec[x + stride]);
2795 +        edgeType = signDown + upBuff1[x] + 2;
2796 +        upBuff1[x - 1] = -signDown;
2797 +        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2798 +    }
2799 +}
2800 +
2801 +static void processSaoCUB0_neon(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
2802 +{
2803 +    #define SAO_BO_BITS 5
2804 +    const int boShift = X265_DEPTH - SAO_BO_BITS;
2805 +    int x, y;
2806 +  int8x8x4_t table;
2807 +  table = *(int8x8x4_t *)offset;
2808 +  
2809 +    for (y = 0; y < ctuHeight; y++)
2810 +    {
2811 +      
2812 +        for (x = 0; (x+8) <= ctuWidth; x+=8)
2813 +        {
2814 +          int8x8_t in = *(int8x8_t*)&rec[x];
2815 +          int8x8_t offsets = vtbl4_s8(table,vshr_n_u8(in,boShift));
2816 +          int16x8_t tmp = vmovl_s8(offsets);
2817 +          tmp = vaddw_u8(tmp,in);
2818 +          tmp = vmaxq_s16(tmp,vdupq_n_s16(0));
2819 +          tmp = vminq_s16(tmp,vdupq_n_s16(255));
2820 +          *(uint8x8_t *)&rec[x] = vmovn_u16(tmp);
2821 +        }
2822 +        for (; x < ctuWidth; x++)
2823 +        {
2824 +            rec[x] = x265_clip(rec[x] + offset[rec[x] >> boShift]);
2825 +        }
2826 +        rec += stride;
2827 +    }
2828 +}
2829 +
2830 +}
2831 +
2832 +
2833 +
2834 +namespace X265_NS {
2835 +void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
2836 +{
2837 +    p.saoCuOrgE0 = processSaoCUE0_neon;
2838 +    p.saoCuOrgE1 = processSaoCUE1_neon;
2839 +    p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows_neon;
2840 +    p.saoCuOrgE2[0] = processSaoCUE2_neon;
2841 +    p.saoCuOrgE2[1] = processSaoCUE2_neon;
2842 +    p.saoCuOrgE3[0] = processSaoCUE3_neon;
2843 +    p.saoCuOrgE3[1] = processSaoCUE3_neon;
2844 +    p.saoCuOrgB0 = processSaoCUB0_neon;
2845 +    p.sign = calSign_neon;
2846 +
2847 +}
2848 +
2849 +#else // HIGH_BIT_DEPTH || !HAVE_NEON
2850 +
2851 +
2852 +namespace X265_NS {
2853 +void setupLoopFilterPrimitives_neon(EncoderPrimitives &)
2854 +{
2855 +}
2856 +
2857 +#endif
2858 +
2859 +
2860 +}
2861 diff -Naur ./source/common/arm64/loopfilter-prim.h ../x265_apple_patch/source/common/arm64/loopfilter-prim.h
2862 --- ./source/common/arm64/loopfilter-prim.h     1970-01-01 01:00:00.000000000 +0100
2863 +++ ../x265_apple_patch/source/common/arm64/loopfilter-prim.h   2021-05-08 13:08:01.000000000 +0100
2864 @@ -0,0 +1,43 @@
2865 +#ifndef _LOOPFILTER_NEON_H__
2866 +#define _LOOPFILTER_NEON_H__
2867 +
2868 +
2869 +/*****************************************************************************
2870 +* Copyright (C) 2013-2017 MulticoreWare, Inc
2871 +*
2872 +* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
2873 +*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
2874 +*          Min Chen <chenm003@163.com>
2875 +*
2876 +* This program is free software; you can redistribute it and/or modify
2877 +* it under the terms of the GNU General Public License as published by
2878 +* the Free Software Foundation; either version 2 of the License, or
2879 +* (at your option) any later version.
2880 +*
2881 +* This program is distributed in the hope that it will be useful,
2882 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
2883 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
2884 +* GNU General Public License for more details.
2885 +*
2886 +* You should have received a copy of the GNU General Public License
2887 +* along with this program; if not, write to the Free Software
2888 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
2889 +*
2890 +* This program is also available under a commercial proprietary license.
2891 +* For more information, contact us at license @ x265.com.
2892 +*****************************************************************************/
2893 +
2894 +
2895 +
2896 +#include "common.h"
2897 +#include "primitives.h"
2898 +
2899 +#define PIXEL_MIN 0
2900 +
2901 +namespace X265_NS {
2902 +void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
2903 +
2904 +};
2905 +
2906 +
2907 +#endif
2908 diff -Naur ./source/common/arm64/pixel-prim.cpp ../x265_apple_patch/source/common/arm64/pixel-prim.cpp
2909 --- ./source/common/arm64/pixel-prim.cpp        1970-01-01 01:00:00.000000000 +0100
2910 +++ ../x265_apple_patch/source/common/arm64/pixel-prim.cpp      2021-05-08 13:08:01.000000000 +0100
2911 @@ -0,0 +1,1940 @@
2912 +#include "common.h"
2913 +#include "slicetype.h"      // LOWRES_COST_MASK
2914 +#include "primitives.h"
2915 +#include "x265.h"
2916 +
2917 +#include "pixel-prim.h"
2918 +#include "arm64-utils.h"
2919 +#if HAVE_NEON
2920 +
2921 +#include <arm_neon.h>
2922 +
2923 +using namespace X265_NS;
2924 +
2925 +
2926 +
2927 +namespace  {
2928 +
2929 +
2930 +/* SATD SA8D variants - based on x264 */
2931 +static inline void SUMSUB_AB(int16x8_t& sum, int16x8_t& sub, const int16x8_t a, const int16x8_t b)
2932 +{
2933 +    sum = vaddq_s16(a,b);
2934 +    sub = vsubq_s16(a,b);
2935 +}
2936 +
2937 +static inline void transpose_8h(int16x8_t& t1, int16x8_t& t2, const int16x8_t s1, const int16x8_t s2)
2938 +{
2939 +    t1 = vtrn1q_s16(s1, s2);
2940 +    t2 = vtrn2q_s16(s1, s2);
2941 +}
2942 +
2943 +static inline void transpose_4s(int16x8_t& t1, int16x8_t& t2, const int16x8_t s1, const int16x8_t s2)
2944 +{
2945 +    t1 = vtrn1q_s32(s1, s2);
2946 +    t2 = vtrn2q_s32(s1, s2);
2947 +}
2948 +
2949 +#if (X265_DEPTH <= 10)
2950 +static inline void transpose_2d(int16x8_t& t1, int16x8_t& t2, const int16x8_t s1, const int16x8_t s2)
2951 +{
2952 +    t1 = vtrn1q_s64(s1, s2);
2953 +    t2 = vtrn2q_s64(s1, s2);
2954 +}
2955 +#endif
2956 +
2957 +
2958 +static inline void SUMSUB_ABCD(int16x8_t& s1, int16x8_t& d1, int16x8_t& s2, int16x8_t& d2,
2959 +                        int16x8_t a,int16x8_t  b,int16x8_t  c,int16x8_t  d)
2960 +{
2961 +    SUMSUB_AB(s1,d1,a,b);
2962 +    SUMSUB_AB(s2,d2,c,d);
2963 +}
2964 +
2965 +static inline void HADAMARD4_V(int16x8_t& r1,int16x8_t& r2,int16x8_t& r3,int16x8_t& r4,
2966 +                        int16x8_t& t1,int16x8_t& t2,int16x8_t& t3,int16x8_t& t4)
2967 +{
2968 +    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
2969 +    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
2970 +}
2971 +
2972 +
2973 +static int _satd_4x8_8x4_end_neon(int16x8_t v0,int16x8_t v1,int16x8_t v2, int16x8_t v3)
2974 +                                 
2975 +{
2976 +    
2977 +    int16x8_t v4,v5,v6,v7,v16,v17,v18,v19;
2978 +
2979 +    
2980 +    SUMSUB_AB   (v16, v17, v0,  v1);
2981 +    SUMSUB_AB   (v18, v19, v2,  v3);
2982 +
2983 +    SUMSUB_AB   (v4 , v6 , v16, v18);
2984 +    SUMSUB_AB   (v5 , v7 , v17, v19);
2985 +
2986 +    v0 = vtrn1q_s16(v4, v5);
2987 +    v1 = vtrn2q_s16(v4, v5);
2988 +    v2 = vtrn1q_s16(v6, v7);
2989 +    v3 = vtrn2q_s16(v6, v7);
2990 +
2991 +    SUMSUB_AB   (v16, v17, v0,  v1);
2992 +    SUMSUB_AB   (v18, v19, v2,  v3);
2993 +
2994 +    v0 = vtrn1q_s32(v16, v18);
2995 +    v1 = vtrn2q_s32(v16, v18);
2996 +    v2 = vtrn1q_s32(v17, v19);
2997 +    v3 = vtrn2q_s32(v17, v19);
2998 +
2999 +    v0 = vabsq_s16(v0);
3000 +    v1 = vabsq_s16(v1);
3001 +    v2 = vabsq_s16(v2);
3002 +    v3 = vabsq_s16(v3);
3003 +    
3004 +    v0 = vmaxq_u16(v0, v1);
3005 +    v1 = vmaxq_u16(v2, v3);
3006 +
3007 +    v0 = vaddq_u16(v0, v1);
3008 +    return vaddlvq_u16(v0);
3009 +}
3010 +
3011 +static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
3012 +{
3013 +    int16x8_t v2,v3;
3014 +    SUMSUB_AB   (v2,  v3,  v0,  v1);
3015 +
3016 +    v0 = vzip1q_s64(v2,v3);
3017 +    v1 = vzip2q_s64(v2,v3);
3018 +    SUMSUB_AB   (v2,  v3,  v0,  v1);
3019 +
3020 +    v0 = vtrn1q_s16(v2,v3);
3021 +    v1 = vtrn2q_s16(v2,v3);
3022 +    SUMSUB_AB   (v2,  v3,  v0,  v1);
3023 +
3024 +    v0 = vtrn1q_s32(v2,v3);
3025 +    v1 = vtrn2q_s32(v2,v3);
3026 +
3027 +    v0 = vabsq_s16(v0);
3028 +    v1 = vabsq_s16(v1);
3029 +    v0 = vmaxq_u16(v0, v1);
3030 +    
3031 +    return vaddlvq_s16(v0);
3032 +}
3033 +
3034 +static void _satd_8x4v_8x8h_neon(int16x8_t& v0,int16x8_t& v1, int16x8_t&v2,int16x8_t& v3,int16x8_t& v20,int16x8_t& v21, int16x8_t&v22,int16x8_t& v23)
3035 +{
3036 +    int16x8_t v16,v17,v18,v19,v4,v5,v6,v7;
3037 +    
3038 +    SUMSUB_AB(v16, v18, v0,  v2);
3039 +    SUMSUB_AB(v17, v19, v1,  v3);
3040 +
3041 +    HADAMARD4_V (v20, v21, v22, v23, v0,  v1, v2, v3);
3042 +
3043 +    transpose_8h(   v0,  v1,  v16, v17);
3044 +    transpose_8h(   v2,  v3,  v18, v19);
3045 +    transpose_8h(   v4,  v5,  v20, v21);
3046 +    transpose_8h(   v6,  v7,  v22, v23);
3047 +
3048 +    SUMSUB_AB   (v16, v17, v0,  v1);
3049 +    SUMSUB_AB   (v18, v19, v2,  v3);
3050 +    SUMSUB_AB   (v20, v21, v4,  v5);
3051 +    SUMSUB_AB   (v22, v23, v6,  v7);
3052 +
3053 +    transpose_4s(   v0,  v2,  v16, v18);
3054 +    transpose_4s(   v1,  v3,  v17, v19);
3055 +    transpose_4s(   v4,  v6,  v20, v22);
3056 +    transpose_4s(   v5,  v7,  v21, v23);
3057 +
3058 +    v0 = vabsq_s16(v0);
3059 +    v1 = vabsq_s16(v1);
3060 +    v2 = vabsq_s16(v2);
3061 +    v3 = vabsq_s16(v3);
3062 +    v4 = vabsq_s16(v4);
3063 +    v5 = vabsq_s16(v5);
3064 +    v6 = vabsq_s16(v6);
3065 +    v7 = vabsq_s16(v7);
3066 +
3067 +    v0 = vmaxq_u16(v0,v2);
3068 +    v1 = vmaxq_u16(v1,v3);
3069 +    v2 = vmaxq_u16(v4,v6);
3070 +    v3 = vmaxq_u16(v5,v7);
3071 +
3072 +}
3073 +
3074 +#if HIGH_BIT_DEPTH
3075 +
3076 +#if (X265_DEPTH > 10)
3077 +static inline void transpose_2d(int32x4_t& t1, int32x4_t& t2, const int32x4_t s1, const int32x4_t s2)
3078 +{
3079 +    t1 = vtrn1q_s64(s1, s2);
3080 +    t2 = vtrn2q_s64(s1, s2);
3081 +}
3082 +
3083 +static inline void ISUMSUB_AB(int32x4_t& sum, int32x4_t& sub, const int32x4_t a, const int32x4_t b)
3084 +{
3085 +    sum = vaddq_s32(a,b);
3086 +    sub = vsubq_s32(a,b);
3087 +}
3088 +
3089 +static inline void ISUMSUB_AB_FROM_INT16(int32x4_t& suml, int32x4_t& sumh, int32x4_t& subl, int32x4_t& subh, const int16x8_t a, const int16x8_t b)
3090 +{
3091 +    suml = vaddl_s16(vget_low_s16(a),vget_low_s16(b));
3092 +    sumh = vaddl_high_s16(a,b);
3093 +    subl = vsubl_s16(vget_low_s16(a),vget_low_s16(b));
3094 +    subh = vsubl_high_s16(a, b);
3095 +}
3096 +
3097 +#endif
3098 +
3099 +static inline void _sub_8x8_fly(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2,
3100 +                            int16x8_t& v0,int16x8_t& v1, int16x8_t& v2,int16x8_t& v3,
3101 +                            int16x8_t& v20,int16x8_t& v21, int16x8_t& v22,int16x8_t& v23)
3102 +{
3103 +    uint16x8_t r0,r1,r2,r3;
3104 +    uint16x8_t t0,t1,t2,t3;
3105 +    int16x8_t v16,v17;
3106 +    int16x8_t v18,v19;
3107 +    
3108 +    r0 = *(uint16x8_t*)(pix1 + 0*stride_pix1);
3109 +    r1 = *(uint16x8_t*)(pix1 + 1*stride_pix1);
3110 +    r2 = *(uint16x8_t*)(pix1 + 2*stride_pix1);
3111 +    r3 = *(uint16x8_t*)(pix1 + 3*stride_pix1);
3112 +
3113 +    t0 = *(uint16x8_t*)(pix2 + 0*stride_pix2);
3114 +    t1 = *(uint16x8_t*)(pix2 + 1*stride_pix2);
3115 +    t2 = *(uint16x8_t*)(pix2 + 2*stride_pix2);
3116 +    t3 = *(uint16x8_t*)(pix2 + 3*stride_pix2);
3117 +
3118 +    v16 = vsubq_u16(r0,t0);
3119 +    v17 = vsubq_u16(r1,t1);
3120 +    v18 = vsubq_u16(r2,t2);
3121 +    v19 = vsubq_u16(r3,t3);
3122 +
3123 +    r0 = *(uint16x8_t*)(pix1 + 4*stride_pix1);
3124 +    r1 = *(uint16x8_t*)(pix1 + 5*stride_pix1);
3125 +    r2 = *(uint16x8_t*)(pix1 + 6*stride_pix1);
3126 +    r3 = *(uint16x8_t*)(pix1 + 7*stride_pix1);
3127 +
3128 +    t0 = *(uint16x8_t*)(pix2 + 4*stride_pix2);
3129 +    t1 = *(uint16x8_t*)(pix2 + 5*stride_pix2);
3130 +    t2 = *(uint16x8_t*)(pix2 + 6*stride_pix2);
3131 +    t3 = *(uint16x8_t*)(pix2 + 7*stride_pix2);
3132 +    
3133 +    v20 = vsubq_u16(r0,t0);
3134 +    v21 = vsubq_u16(r1,t1);
3135 +    v22 = vsubq_u16(r2,t2);
3136 +    v23 = vsubq_u16(r3,t3);
3137 +
3138 +    SUMSUB_AB   (v0,  v1,  v16, v17);
3139 +    SUMSUB_AB   (v2,  v3,  v18, v19);
3140 +
3141 +}
3142 +
3143 +
3144 +
3145 +
3146 +static void _satd_16x4_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2,
3147 +                            int16x8_t& v0,int16x8_t&v1, int16x8_t&v2,int16x8_t&v3)
3148 +{
3149 +    uint8x16_t r0,r1,r2,r3;
3150 +    uint8x16_t t0,t1,t2,t3;
3151 +    int16x8_t v16,v17,v20,v21;
3152 +    int16x8_t v18,v19,v22,v23;
3153 +    
3154 +    r0 = *(int16x8_t*)(pix1 + 0*stride_pix1);
3155 +    r1 = *(int16x8_t*)(pix1 + 1*stride_pix1);
3156 +    r2 = *(int16x8_t*)(pix1 + 2*stride_pix1);
3157 +    r3 = *(int16x8_t*)(pix1 + 3*stride_pix1);
3158 +
3159 +    t0 = *(int16x8_t*)(pix2 + 0*stride_pix2);
3160 +    t1 = *(int16x8_t*)(pix2 + 1*stride_pix2);
3161 +    t2 = *(int16x8_t*)(pix2 + 2*stride_pix2);
3162 +    t3 = *(int16x8_t*)(pix2 + 3*stride_pix2);
3163 +  
3164 +    
3165 +    v16 = vsubq_u16((r0),(t0) );
3166 +    v17 = vsubq_u16((r1),(t1) );
3167 +    v18 = vsubq_u16((r2),(t2) );
3168 +    v19 = vsubq_u16((r3),(t3) );
3169 +
3170 +    r0 = *(int16x8_t*)(pix1 + 0*stride_pix1 + 8);
3171 +    r1 = *(int16x8_t*)(pix1 + 1*stride_pix1 + 8);
3172 +    r2 = *(int16x8_t*)(pix1 + 2*stride_pix1 + 8);
3173 +    r3 = *(int16x8_t*)(pix1 + 3*stride_pix1 + 8);
3174 +
3175 +    t0 = *(int16x8_t*)(pix2 + 0*stride_pix2 + 8);
3176 +    t1 = *(int16x8_t*)(pix2 + 1*stride_pix2 + 8);
3177 +    t2 = *(int16x8_t*)(pix2 + 2*stride_pix2 + 8);
3178 +    t3 = *(int16x8_t*)(pix2 + 3*stride_pix2 + 8);
3179 +
3180 +    
3181 +    v20 = vsubq_u16(r0,t0);
3182 +    v21 = vsubq_u16(r1,t1);
3183 +    v22 = vsubq_u16(r2,t2);
3184 +    v23 = vsubq_u16(r3,t3);
3185 +
3186 +    SUMSUB_AB   (v0,  v1,  v16, v17);
3187 +    SUMSUB_AB   (v2,  v3,  v18, v19);
3188 +    
3189 +    _satd_8x4v_8x8h_neon(v0,v1,v2,v3,v20,v21,v22,v23);
3190 +    
3191 +}
3192 +
3193 +
3194 +int pixel_satd_4x4_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
3195 +{
3196 +    uint64x2_t t0,t1,r0,r1;
3197 +    t0[0] = *(uint64_t *)(pix1 + 0*stride_pix1);
3198 +    t1[0] = *(uint64_t *)(pix1 + 1*stride_pix1);
3199 +    t0[1] = *(uint64_t *)(pix1 + 2*stride_pix1);
3200 +    t1[1] = *(uint64_t *)(pix1 + 3*stride_pix1);
3201 +
3202 +    r0[0] = *(uint64_t *)(pix2 + 0*stride_pix1);
3203 +    r1[0] = *(uint64_t *)(pix2 + 1*stride_pix2);
3204 +    r0[1] = *(uint64_t *)(pix2 + 2*stride_pix2);
3205 +    r1[1] = *(uint64_t *)(pix2 + 3*stride_pix2);
3206 +    
3207 +    return _satd_4x4_neon(vsubq_u16(t0,r0), vsubq_u16(r1,t1));
3208 +}
3209 +
3210 +
3211 +
3212 +
3213 +
3214 +
3215 +int pixel_satd_8x4_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
3216 +{
3217 +    uint16x8_t i0,i1,i2,i3,i4,i5,i6,i7;
3218 +    
3219 +    i0 = *(uint16x8_t *)(pix1 + 0*stride_pix1);
3220 +    i1 = *(uint16x8_t *)(pix2 + 0*stride_pix2);
3221 +    i2 = *(uint16x8_t *)(pix1 + 1*stride_pix1);
3222 +    i3 = *(uint16x8_t *)(pix2 + 1*stride_pix2);
3223 +    i4 = *(uint16x8_t *)(pix1 + 2*stride_pix1);
3224 +    i5 = *(uint16x8_t *)(pix2 + 2*stride_pix2);
3225 +    i6 = *(uint16x8_t *)(pix1 + 3*stride_pix1);
3226 +    i7 = *(uint16x8_t *)(pix2 + 3*stride_pix2);
3227 +
3228 +    int16x8_t v0 = vsubq_u16(i0,i1);
3229 +    int16x8_t v1 = vsubq_u16(i2,i3);
3230 +    int16x8_t v2 = vsubq_u16(i4,i5);
3231 +    int16x8_t v3 = vsubq_u16(i6,i7);
3232 +
3233 +    return _satd_4x8_8x4_end_neon(v0,v1,v2,v3);
3234 +}
3235 +
3236 +
3237 +int pixel_satd_16x16_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
3238 +{
3239 +    int32x4_t v30 = vdupq_n_u32(0),v31= vdupq_n_u32(0);
3240 +    int16x8_t v0,v1,v2,v3;
3241 +    
3242 +    _satd_16x4_neon(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3);
3243 +    v30 = vpadalq_u16(v30,v0);
3244 +    v30 = vpadalq_u16(v30,v1);
3245 +    v31 = vpadalq_u16(v31,v2);
3246 +    v31 = vpadalq_u16(v31,v3);
3247 +
3248 +    _satd_16x4_neon(pix1 + 4*stride_pix1,stride_pix1,pix2+4*stride_pix2,stride_pix2,v0,v1,v2,v3);
3249 +    v30 = vpadalq_u16(v30,v0);
3250 +    v30 = vpadalq_u16(v30,v1);
3251 +    v31 = vpadalq_u16(v31,v2);
3252 +    v31 = vpadalq_u16(v31,v3);
3253 +
3254 +    _satd_16x4_neon(pix1 + 8*stride_pix1,stride_pix1,pix2+8*stride_pix2,stride_pix2,v0,v1,v2,v3);
3255 +    v30 = vpadalq_u16(v30,v0);
3256 +    v30 = vpadalq_u16(v30,v1);
3257 +    v31 = vpadalq_u16(v31,v2);
3258 +    v31 = vpadalq_u16(v31,v3);
3259 +
3260 +    _satd_16x4_neon(pix1 + 12*stride_pix1,stride_pix1,pix2+12*stride_pix2,stride_pix2,v0,v1,v2,v3);
3261 +    v30 = vpadalq_u16(v30,v0);
3262 +    v30 = vpadalq_u16(v30,v1);
3263 +    v31 = vpadalq_u16(v31,v2);
3264 +    v31 = vpadalq_u16(v31,v3);
3265 +
3266 +    return vaddvq_s32(vaddq_s32(v30,v31));
3267 +    
3268 +}
3269 +
3270 +#else       //HIGH_BIT_DEPTH
3271 +
3272 +static void _satd_16x4_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2,
3273 +                            int16x8_t& v0,int16x8_t&v1, int16x8_t&v2,int16x8_t&v3)
3274 +{
3275 +    uint8x16_t r0,r1,r2,r3;
3276 +    uint8x16_t t0,t1,t2,t3;
3277 +    int16x8_t v16,v17,v20,v21;
3278 +    int16x8_t v18,v19,v22,v23;
3279 +    
3280 +    r0 = *(uint8x16_t*)(pix1 + 0*stride_pix1);
3281 +    r1 = *(uint8x16_t*)(pix1 + 1*stride_pix1);
3282 +    r2 = *(uint8x16_t*)(pix1 + 2*stride_pix1);
3283 +    r3 = *(uint8x16_t*)(pix1 + 3*stride_pix1);
3284 +
3285 +    t0 = *(uint8x16_t*)(pix2 + 0*stride_pix2);
3286 +    t1 = *(uint8x16_t*)(pix2 + 1*stride_pix2);
3287 +    t2 = *(uint8x16_t*)(pix2 + 2*stride_pix2);
3288 +    t3 = *(uint8x16_t*)(pix2 + 3*stride_pix2);
3289 +
3290 +    
3291 +    
3292 +    v16 = vsubl_u8(vget_low_u8(r0),vget_low_u8(t0) );
3293 +    v20 = vsubl_high_u8(r0,t0);
3294 +    v17 = vsubl_u8(vget_low_u8(r1),vget_low_u8(t1) );
3295 +    v21 = vsubl_high_u8(r1,t1);
3296 +    v18 = vsubl_u8(vget_low_u8(r2),vget_low_u8(t2) );
3297 +    v22 = vsubl_high_u8(r2,t2);
3298 +    v19 = vsubl_u8(vget_low_u8(r3),vget_low_u8(t3) );
3299 +    v23 = vsubl_high_u8(r3,t3);
3300 +
3301 +    SUMSUB_AB   (v0,  v1,  v16, v17);
3302 +    SUMSUB_AB   (v2,  v3,  v18, v19);
3303 +    
3304 +    _satd_8x4v_8x8h_neon(v0,v1,v2,v3,v20,v21,v22,v23);
3305 +    
3306 +}
3307 +
3308 +
3309 +static inline void _sub_8x8_fly(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2,
3310 +                            int16x8_t& v0,int16x8_t& v1, int16x8_t& v2,int16x8_t& v3,
3311 +                            int16x8_t& v20,int16x8_t& v21, int16x8_t& v22,int16x8_t& v23)
3312 +{
3313 +    uint8x8_t r0,r1,r2,r3;
3314 +    uint8x8_t t0,t1,t2,t3;
3315 +    int16x8_t v16,v17;
3316 +    int16x8_t v18,v19;
3317 +    
3318 +    r0 = *(uint8x8_t*)(pix1 + 0*stride_pix1);
3319 +    r1 = *(uint8x8_t*)(pix1 + 1*stride_pix1);
3320 +    r2 = *(uint8x8_t*)(pix1 + 2*stride_pix1);
3321 +    r3 = *(uint8x8_t*)(pix1 + 3*stride_pix1);
3322 +
3323 +    t0 = *(uint8x8_t*)(pix2 + 0*stride_pix2);
3324 +    t1 = *(uint8x8_t*)(pix2 + 1*stride_pix2);
3325 +    t2 = *(uint8x8_t*)(pix2 + 2*stride_pix2);
3326 +    t3 = *(uint8x8_t*)(pix2 + 3*stride_pix2);
3327 +
3328 +    v16 = vsubl_u8(r0,t0);
3329 +    v17 = vsubl_u8(r1,t1);
3330 +    v18 = vsubl_u8(r2,t2);
3331 +    v19 = vsubl_u8(r3,t3);
3332 +
3333 +    r0 = *(uint8x8_t*)(pix1 + 4*stride_pix1);
3334 +    r1 = *(uint8x8_t*)(pix1 + 5*stride_pix1);
3335 +    r2 = *(uint8x8_t*)(pix1 + 6*stride_pix1);
3336 +    r3 = *(uint8x8_t*)(pix1 + 7*stride_pix1);
3337 +
3338 +    t0 = *(uint8x8_t*)(pix2 + 4*stride_pix2);
3339 +    t1 = *(uint8x8_t*)(pix2 + 5*stride_pix2);
3340 +    t2 = *(uint8x8_t*)(pix2 + 6*stride_pix2);
3341 +    t3 = *(uint8x8_t*)(pix2 + 7*stride_pix2);
3342 +    
3343 +    v20 = vsubl_u8(r0,t0);
3344 +    v21 = vsubl_u8(r1,t1);
3345 +    v22 = vsubl_u8(r2,t2);
3346 +    v23 = vsubl_u8(r3,t3);
3347 +
3348 +
3349 +    SUMSUB_AB   (v0,  v1,  v16, v17);
3350 +    SUMSUB_AB   (v2,  v3,  v18, v19);
3351 +    
3352 +}
3353 +
3354 +int pixel_satd_4x4_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2)
3355 +{
3356 +    uint32x2_t t0,t1,r0,r1;
3357 +    t0[0] = *(uint32_t *)(pix1 + 0*stride_pix1);
3358 +    t1[0] = *(uint32_t *)(pix1 + 1*stride_pix1);
3359 +    t0[1] = *(uint32_t *)(pix1 + 2*stride_pix1);
3360 +    t1[1] = *(uint32_t *)(pix1 + 3*stride_pix1);
3361 +
3362 +    r0[0] = *(uint32_t *)(pix2 + 0*stride_pix1);
3363 +    r1[0] = *(uint32_t *)(pix2 + 1*stride_pix2);
3364 +    r0[1] = *(uint32_t *)(pix2 + 2*stride_pix2);
3365 +    r1[1] = *(uint32_t *)(pix2 + 3*stride_pix2);
3366 +    
3367 +    return _satd_4x4_neon(vsubl_u8(t0,r0), vsubl_u8(r1,t1));
3368 +}
3369 +
3370 +
3371 +int pixel_satd_8x4_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2)
3372 +{
3373 +    uint8x8_t i0,i1,i2,i3,i4,i5,i6,i7;
3374 +    
3375 +    i0 = *(uint8x8_t *)(pix1 + 0*stride_pix1);
3376 +    i1 = *(uint8x8_t *)(pix2 + 0*stride_pix2);
3377 +    i2 = *(uint8x8_t *)(pix1 + 1*stride_pix1);
3378 +    i3 = *(uint8x8_t *)(pix2 + 1*stride_pix2);
3379 +    i4 = *(uint8x8_t *)(pix1 + 2*stride_pix1);
3380 +    i5 = *(uint8x8_t *)(pix2 + 2*stride_pix2);
3381 +    i6 = *(uint8x8_t *)(pix1 + 3*stride_pix1);
3382 +    i7 = *(uint8x8_t *)(pix2 + 3*stride_pix2);
3383 +
3384 +    int16x8_t v0 = vsubl_u8(i0,i1);
3385 +    int16x8_t v1 = vsubl_u8(i2,i3);
3386 +    int16x8_t v2 = vsubl_u8(i4,i5);
3387 +    int16x8_t v3 = vsubl_u8(i6,i7);
3388 +
3389 +    return _satd_4x8_8x4_end_neon(v0,v1,v2,v3);
3390 +}
3391 +
3392 +int pixel_satd_16x16_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2)
3393 +{
3394 +    int16x8_t v30,v31;
3395 +    int16x8_t v0,v1,v2,v3;
3396 +    
3397 +    _satd_16x4_neon(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3);
3398 +    v30 = vaddq_s16(v0,v1);
3399 +    v31 = vaddq_s16(v2,v3);
3400 +    
3401 +    _satd_16x4_neon(pix1 + 4*stride_pix1,stride_pix1,pix2+4*stride_pix2,stride_pix2,v0,v1,v2,v3);
3402 +    v0 = vaddq_s16(v0,v1);
3403 +    v1 = vaddq_s16(v2,v3);
3404 +    v30 = vaddq_s16(v30, v0);
3405 +    v31 = vaddq_s16(v31, v1);
3406 +
3407 +    _satd_16x4_neon(pix1 + 8*stride_pix1,stride_pix1,pix2+8*stride_pix2,stride_pix2,v0,v1,v2,v3);
3408 +    v0 = vaddq_s16(v0,v1);
3409 +    v1 = vaddq_s16(v2,v3);
3410 +    v30 = vaddq_s16(v30, v0);
3411 +    v31 = vaddq_s16(v31, v1);
3412 +
3413 +    _satd_16x4_neon(pix1 + 12*stride_pix1,stride_pix1,pix2+12*stride_pix2,stride_pix2,v0,v1,v2,v3);
3414 +    v0 = vaddq_s16(v0,v1);
3415 +    v1 = vaddq_s16(v2,v3);
3416 +    v30 = vaddq_s16(v30, v0);
3417 +    v31 = vaddq_s16(v31, v1);
3418 +
3419 +    int32x4_t sum0 = vpaddlq_u16(v30);
3420 +    int32x4_t sum1 = vpaddlq_u16(v31);
3421 +    sum0 = vaddq_s32(sum0,sum1);
3422 +    return vaddvq_s32(sum0);
3423 +    
3424 +}
3425 +#endif      //HIGH_BIT_DEPTH
3426 +
3427 +
3428 +static inline void _sa8d_8x8_neon_end(int16x8_t& v0,int16x8_t& v1,int16x8_t v2,int16x8_t v3,
3429 +                                     int16x8_t v20,int16x8_t v21,int16x8_t v22,int16x8_t v23)
3430 +{
3431 +    int16x8_t v16,v17,v18,v19;
3432 +    int16x8_t v4,v5,v6,v7;
3433 +    
3434 +    SUMSUB_AB   (v16, v18, v0,  v2);
3435 +    SUMSUB_AB   (v17, v19, v1,  v3);
3436 +
3437 +    HADAMARD4_V (v20, v21, v22, v23, v0,  v1, v2, v3);
3438 +
3439 +    SUMSUB_AB   (v0,  v16, v16, v20);
3440 +    SUMSUB_AB   (v1,  v17, v17, v21);
3441 +    SUMSUB_AB   (v2,  v18, v18, v22);
3442 +    SUMSUB_AB   (v3,  v19, v19, v23);
3443 +
3444 +    transpose_8h   (v20, v21, v16, v17);
3445 +    transpose_8h   (v4,  v5,  v0,  v1);
3446 +    transpose_8h   (v22, v23, v18, v19);
3447 +    transpose_8h   (v6,  v7,  v2,  v3);
3448 +    
3449 +#if (X265_DEPTH <= 10)
3450 +
3451 +    int16x8_t v24,v25;
3452 +
3453 +    SUMSUB_AB   (v2,  v3,  v20, v21);
3454 +    SUMSUB_AB   (v24, v25, v4,  v5);
3455 +    SUMSUB_AB   (v0,  v1,  v22, v23);
3456 +    SUMSUB_AB   (v4,  v5,  v6,  v7);
3457 +
3458 +    transpose_4s   (v20, v22, v2,  v0);
3459 +    transpose_4s   (v21, v23, v3,  v1);
3460 +    transpose_4s   (v16, v18, v24, v4);
3461 +    transpose_4s   (v17, v19, v25, v5);
3462 +
3463 +    SUMSUB_AB   (v0,  v2,  v20, v22);
3464 +    SUMSUB_AB   (v1,  v3,  v21, v23);
3465 +    SUMSUB_AB   (v4,  v6,  v16, v18);
3466 +    SUMSUB_AB   (v5,  v7,  v17, v19);
3467 +
3468 +    transpose_2d   (v16, v20,  v0,  v4);
3469 +    transpose_2d   (v17, v21,  v1,  v5);
3470 +    transpose_2d   (v18, v22,  v2,  v6);
3471 +    transpose_2d   (v19, v23,  v3,  v7);
3472 +
3473 +    
3474 +    v16 = vabsq_s16(v16);
3475 +    v17 = vabsq_s16(v17);
3476 +    v18 = vabsq_s16(v18);
3477 +    v19 = vabsq_s16(v19);
3478 +    v20 = vabsq_s16(v20);
3479 +    v21 = vabsq_s16(v21);
3480 +    v22 = vabsq_s16(v22);
3481 +    v23 = vabsq_s16(v23);
3482 +
3483 +    v16 = vmaxq_u16(v16,v20);
3484 +    v17 = vmaxq_u16(v17,v21);
3485 +    v18 = vmaxq_u16(v18,v22);
3486 +    v19 = vmaxq_u16(v19,v23);
3487 +
3488 +#if HIGH_BIT_DEPTH
3489 +    v0 = vpaddlq_u16(v16);
3490 +    v1 = vpaddlq_u16(v17);
3491 +    v0 = vpadalq_u16(v0,v18);
3492 +    v1 = vpadalq_u16(v1,v19);
3493 +    
3494 +#else //HIGH_BIT_DEPTH
3495 +    
3496 +    v0 = vaddq_u16(v16,v17);
3497 +    v1 = vaddq_u16(v18,v19);
3498 +
3499 +#endif //HIGH_BIT_DEPTH
3500 +    
3501 +#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32, each int16x8 is up-convreted to 2 int32x4 (low and high)
3502 +
3503 +    int32x4_t v2l,v2h,v3l,v3h,v24l,v24h,v25l,v25h,v0l,v0h,v1l,v1h;
3504 +    int32x4_t v22l,v22h,v23l,v23h;
3505 +    int32x4_t v4l,v4h,v5l,v5h;
3506 +    int32x4_t v6l,v6h,v7l,v7h;
3507 +    int32x4_t v16l,v16h,v17l,v17h;
3508 +    int32x4_t v18l,v18h,v19l,v19h;
3509 +    int32x4_t v20l,v20h,v21l,v21h;
3510 +
3511 +    ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
3512 +    ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
3513 +
3514 +    v22l = vmovl_s16(vget_low_s16(v22));
3515 +    v22h = vmovl_high_s16(v22);
3516 +    v23l = vmovl_s16(vget_low_s16(v23));
3517 +    v23h = vmovl_high_s16(v23);
3518 +    
3519 +    ISUMSUB_AB(v0l,  v1l,  v22l, v23l);
3520 +    ISUMSUB_AB(v0h,  v1h,  v22h, v23h);
3521 +
3522 +    v6l = vmovl_s16(vget_low_s16(v6));
3523 +    v6h = vmovl_high_s16(v6);
3524 +    v7l = vmovl_s16(vget_low_s16(v7));
3525 +    v7h = vmovl_high_s16(v7);
3526 +
3527 +    ISUMSUB_AB   (v4l,  v5l,  v6l,  v7l);
3528 +    ISUMSUB_AB   (v4h,  v5h,  v6h,  v7h);
3529 +
3530 +    transpose_2d   (v20l, v22l, v2l,  v0l);
3531 +    transpose_2d   (v21l, v23l, v3l,  v1l);
3532 +    transpose_2d   (v16l, v18l, v24l, v4l);
3533 +    transpose_2d   (v17l, v19l, v25l, v5l);
3534 +
3535 +    transpose_2d   (v20h, v22h, v2h,  v0h);
3536 +    transpose_2d   (v21h, v23h, v3h,  v1h);
3537 +    transpose_2d   (v16h, v18h, v24h, v4h);
3538 +    transpose_2d   (v17h, v19h, v25h, v5h);
3539 +
3540 +    ISUMSUB_AB   (v0l,  v2l,  v20l, v22l);
3541 +    ISUMSUB_AB   (v1l,  v3l,  v21l, v23l);
3542 +    ISUMSUB_AB   (v4l,  v6l,  v16l, v18l);
3543 +    ISUMSUB_AB   (v5l,  v7l,  v17l, v19l);
3544 +
3545 +    ISUMSUB_AB   (v0h,  v2h,  v20h, v22h);
3546 +    ISUMSUB_AB   (v1h,  v3h,  v21h, v23h);
3547 +    ISUMSUB_AB   (v4h,  v6h,  v16h, v18h);
3548 +    ISUMSUB_AB   (v5h,  v7h,  v17h, v19h);
3549 +
3550 +    v16l = v0l;
3551 +    v16h = v4l;
3552 +    v20l = v0h;
3553 +    v20h = v4h;
3554 +    
3555 +    v17l = v1l;
3556 +    v17h = v5l;
3557 +    v21l = v1h;
3558 +    v21h = v5h;
3559 +    
3560 +    v18l = v2l;
3561 +    v18h = v6l;
3562 +    v22l = v2h;
3563 +    v22h = v6h;
3564 +    
3565 +    v19l = v3l;
3566 +    v19h = v7l;
3567 +    v23l = v3h;
3568 +    v23h = v7h;
3569 +
3570 +    v16l = vabsq_s32(v16l);
3571 +    v17l = vabsq_s32(v17l);
3572 +    v18l = vabsq_s32(v18l);
3573 +    v19l = vabsq_s32(v19l);
3574 +    v20l = vabsq_s32(v20l);
3575 +    v21l = vabsq_s32(v21l);
3576 +    v22l = vabsq_s32(v22l);
3577 +    v23l = vabsq_s32(v23l);
3578 +
3579 +    v16h = vabsq_s32(v16h);
3580 +    v17h = vabsq_s32(v17h);
3581 +    v18h = vabsq_s32(v18h);
3582 +    v19h = vabsq_s32(v19h);
3583 +    v20h = vabsq_s32(v20h);
3584 +    v21h = vabsq_s32(v21h);
3585 +    v22h = vabsq_s32(v22h);
3586 +    v23h = vabsq_s32(v23h);
3587 +
3588 +    v16l = vmaxq_u32(v16l,v20l);
3589 +    v17l = vmaxq_u32(v17l,v21l);
3590 +    v18l = vmaxq_u32(v18l,v22l);
3591 +    v19l = vmaxq_u32(v19l,v23l);
3592 +
3593 +    v16h = vmaxq_u32(v16h,v20h);
3594 +    v17h = vmaxq_u32(v17h,v21h);
3595 +    v18h = vmaxq_u32(v18h,v22h);
3596 +    v19h = vmaxq_u32(v19h,v23h);
3597 +
3598 +    v16l = vaddq_u32(v16l,v16h);
3599 +    v17l = vaddq_u32(v17l,v17h);
3600 +    v18l = vaddq_u32(v18l,v18h);
3601 +    v19l = vaddq_u32(v19l,v19h);
3602 +
3603 +    v0 = vaddq_u32(v16l, v17l);
3604 +    v1 = vaddq_u32(v18l,v19l);
3605 +    
3606 +    
3607 +#endif
3608 +    
3609 +}
3610 +
3611 +
3612 +
3613 +static inline void _satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2,
3614 +                            int16x8_t& v0,int16x8_t&v1, int16x8_t&v2,int16x8_t&v3)
3615 +{
3616 +    
3617 +    int16x8_t v20,v21,v22,v23;
3618 +    _sub_8x8_fly(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3,v20,v21,v22,v23);
3619 +    _satd_8x4v_8x8h_neon(v0,v1,v2,v3,v20,v21,v22,v23);
3620 +    
3621 +}
3622 +
3623 +
3624 +
3625 +int pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3626 +{
3627 +    int16x8_t v30,v31;
3628 +    int16x8_t v0,v1,v2,v3;
3629 +    
3630 +    _satd_8x8_neon(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3);
3631 +#if !(HIGH_BIT_DEPTH)
3632 +    v30 = vaddq_u16(v0,v1);
3633 +    v31 = vaddq_u16(v2,v3);
3634 +    
3635 +    uint16x8_t sum = vaddq_u16(v30,v31);
3636 +    return vaddvq_s32(vpaddlq_u16(sum));
3637 +#else
3638 +    
3639 +    v30 = vaddq_u16(v0,v1);
3640 +    v31 = vaddq_u16(v2,v3);
3641 +
3642 +    int32x4_t sum = vpaddlq_u16(v30);
3643 +    sum = vpadalq_u16(sum, v31);
3644 +    return vaddvq_s32(sum);
3645 +#endif
3646 +}
3647 +
3648 +
3649 +int pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3650 +{
3651 +    int16x8_t v0,v1,v2,v3;
3652 +    int16x8_t v20,v21,v22,v23;
3653 +    
3654 +    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3655 +    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3656 +
3657 +#if HIGH_BIT_DEPTH
3658 +//#if 1//HIGH_BIT_DEPTH
3659 +    int32x4_t s = vaddq_u32(v0,v1);
3660 +    return (vaddvq_u32(s) + 1) >> 1;
3661 +#else
3662 +    return (vaddlvq_s16(vaddq_u16(v0, v1)) + 1) >> 1;
3663 +#endif
3664 +}
3665 +
3666 +
3667 +
3668 +
3669 +
3670 +int pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3671 +{
3672 +    int16x8_t v0,v1,v2,v3;
3673 +    int16x8_t v20,v21,v22,v23;
3674 +    int32x4_t v30,v31;
3675 +    
3676 +    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3677 +    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3678 +
3679 +#if !(HIGH_BIT_DEPTH)
3680 +    v30 = vpaddlq_u16(v0);
3681 +    v31 = vpaddlq_u16(v1);
3682 +#else
3683 +    v30 = vaddq_s32(v0,v1);
3684 +#endif
3685 +    
3686 +    _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3687 +    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3688 +
3689 +#if !(HIGH_BIT_DEPTH)
3690 +     v30 = vpadalq_u16(v30,v0);
3691 +     v31 = vpadalq_u16(v31,v1);
3692 +#else
3693 +     v31 = vaddq_s32(v0,v1);
3694 +#endif
3695 +
3696 +
3697 +    _sub_8x8_fly(pix1 + 8*stride_pix1, stride_pix1, pix2 + 8*stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3698 +    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3699 +
3700 +#if !(HIGH_BIT_DEPTH)
3701 +    v30 = vpadalq_u16(v30,v0);
3702 +    v31 = vpadalq_u16(v31,v1);
3703 +#else
3704 +    v30 = vaddq_s32(v30,v0);
3705 +    v31 = vaddq_s32(v31,v1);
3706 +#endif
3707 +
3708 +    _sub_8x8_fly(pix1 + 8*stride_pix1 + 8, stride_pix1, pix2 + 8*stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3709 +    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3710 +
3711 +#if !(HIGH_BIT_DEPTH)
3712 +     v30 = vpadalq_u16(v30,v0);
3713 +     v31 = vpadalq_u16(v31,v1);
3714 +#else
3715 +     v30 = vaddq_s32(v30,v0);
3716 +     v31 = vaddq_s32(v31,v1);
3717 +#endif
3718 +
3719 +    v30 = vaddq_u32(v30,v31);
3720 +    
3721 +    return (vaddvq_u32(v30) + 1) >> 1;
3722 +}
3723 +
3724 +
3725 +
3726 +
3727 +
3728 +
3729 +
3730 +
3731 +template<int size>
3732 +void blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
3733 +{
3734 +  for (int y = 0; y < size; y++) {
3735 +    int x = 0;
3736 +    int16x8_t v = vdupq_n_s16(val);
3737 +    for (; (x + 8) <= size; x+=8) {
3738 +        *(int16x8_t*)&dst[y * dstride + x] = v;
3739 +    }
3740 +    for (; x < size; x++) {
3741 +        dst[y * dstride + x] = val;
3742 +    }
3743 +  }
3744 +}
3745 +
3746 +template<int lx, int ly>
3747 +int sad_pp_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3748 +{
3749 +  int sum = 0;
3750 +
3751 +
3752 +  for (int y = 0; y < ly; y++)
3753 +  {
3754 +#if HIGH_BIT_DEPTH
3755 +      int x=0;
3756 +      uint16x8_t vsum16_1 = vdupq_n_u16(0);
3757 +      for (; (x + 8) <= lx; x+=8) {
3758 +        uint16x8_t p1 = *(uint16x8_t*)&pix1[x];
3759 +        uint16x8_t p2 = *(uint16x8_t*)&pix2[x];
3760 +        vsum16_1 = vabaq_s16(vsum16_1,p1,p2);
3761 +              
3762 +      }
3763 +      if (lx & 4) {
3764 +        uint16x4_t p1 = *(uint16x4_t*)&pix1[x];
3765 +        uint16x4_t p2 = *(uint16x4_t*)&pix2[x];
3766 +        sum += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p2));
3767 +        x += 4;
3768 +      }
3769 +      if (lx >= 4) {
3770 +        sum += vaddlvq_s16(vsum16_1);
3771 +      }
3772 +
3773 +#else
3774 +
3775 +    int x=0;
3776 +    uint16x8_t vsum16_1 = vdupq_n_u16(0);
3777 +    uint16x8_t vsum16_2 = vdupq_n_u16(0);
3778 +
3779 +    for (; (x + 16) <= lx; x+=16) {
3780 +      uint8x16_t p1 = *(uint8x16_t*)&pix1[x];
3781 +      uint8x16_t p2 = *(uint8x16_t*)&pix2[x];
3782 +      vsum16_1 = vabal_u8(vsum16_1,vget_low_u8(p1),vget_low_u8(p2));
3783 +      vsum16_2 = vabal_high_u8(vsum16_2,p1,p2);
3784 +    }
3785 +    if (lx & 8) {
3786 +      uint8x8_t p1 = *(uint8x8_t*)&pix1[x];
3787 +      uint8x8_t p2 = *(uint8x8_t*)&pix2[x];
3788 +      vsum16_1 = vabal_u8(vsum16_1,p1,p2);
3789 +      x += 8;
3790 +    }
3791 +    if (lx & 4) {
3792 +      uint32x2_t p1 = vdup_n_u32(0);
3793 +      p1[0] = *(uint32_t*)&pix1[x];
3794 +      uint32x2_t p2 = vdup_n_u32(0);
3795 +      p2[0] = *(uint32_t*)&pix2[x];
3796 +      vsum16_1 = vabal_u8(vsum16_1,p1,p2);
3797 +      x += 4;
3798 +    }
3799 +    if (lx >= 16) {
3800 +      vsum16_1 = vaddq_u16(vsum16_1,vsum16_2);
3801 +    }
3802 +    if (lx >= 4) {
3803 +      sum += vaddvq_u16(vsum16_1);
3804 +    }
3805 +
3806 +#endif
3807 +    if (lx & 3) for (; x < lx; x++) {
3808 +        sum += abs(pix1[x] - pix2[x]);
3809 +    }
3810 +    
3811 +    pix1 += stride_pix1;
3812 +    pix2 += stride_pix2;
3813 +  }
3814 +
3815 +  return sum;
3816 +}
3817 +
3818 +template<int lx, int ly>
3819 +void sad_x3_neon(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
3820 +{
3821 +  res[0] = 0;
3822 +  res[1] = 0;
3823 +  res[2] = 0;
3824 +  for (int y = 0; y < ly; y++)
3825 +  {
3826 +    int x = 0;
3827 +    uint16x8_t vsum16_0 = vdupq_n_u16(0);
3828 +    uint16x8_t vsum16_1 = vdupq_n_u16(0);
3829 +    uint16x8_t vsum16_2 = vdupq_n_u16(0);
3830 +#if HIGH_BIT_DEPTH
3831 +      for (; (x + 8) <= lx; x+=8) {
3832 +        uint16x8_t p1 = *(uint16x8_t*)&pix1[x];
3833 +        uint16x8_t p2 = *(uint16x8_t*)&pix2[x];
3834 +        uint16x8_t p3 = *(uint16x8_t*)&pix3[x];
3835 +        uint16x8_t p4 = *(uint16x8_t*)&pix4[x];
3836 +        vsum16_0 = vabaq_s16(vsum16_0,p1,p2);
3837 +        vsum16_1 = vabaq_s16(vsum16_1,p1,p3);
3838 +        vsum16_2 = vabaq_s16(vsum16_2,p1,p4);
3839 +
3840 +      }
3841 +      if (lx & 4) {
3842 +        uint16x4_t p1 = *(uint16x4_t*)&pix1[x];
3843 +        uint16x4_t p2 = *(uint16x4_t*)&pix2[x];
3844 +        uint16x4_t p3 = *(uint16x4_t*)&pix3[x];
3845 +        uint16x4_t p4 = *(uint16x4_t*)&pix4[x];
3846 +        res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p2));
3847 +        res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p3));
3848 +        res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p4));
3849 +        x += 4;
3850 +      }
3851 +      if (lx >= 4) {
3852 +        res[0] += vaddlvq_s16(vsum16_0);
3853 +        res[1] += vaddlvq_s16(vsum16_1);
3854 +        res[2] += vaddlvq_s16(vsum16_2);
3855 +      }
3856 +#else
3857 +    
3858 +    for (; (x + 16) <= lx; x+=16) {
3859 +      uint8x16_t p1 = *(uint8x16_t*)&pix1[x];
3860 +      uint8x16_t p2 = *(uint8x16_t*)&pix2[x];
3861 +      uint8x16_t p3 = *(uint8x16_t*)&pix3[x];
3862 +      uint8x16_t p4 = *(uint8x16_t*)&pix4[x];
3863 +      vsum16_0 = vabal_u8(vsum16_0,vget_low_u8(p1),vget_low_u8(p2));
3864 +      vsum16_0 = vabal_high_u8(vsum16_0,p1,p2);
3865 +      vsum16_1 = vabal_u8(vsum16_1,vget_low_u8(p1),vget_low_u8(p3));
3866 +      vsum16_1 = vabal_high_u8(vsum16_1,p1,p3);
3867 +      vsum16_2 = vabal_u8(vsum16_2,vget_low_u8(p1),vget_low_u8(p4));
3868 +      vsum16_2 = vabal_high_u8(vsum16_2,p1,p4);
3869 +    }
3870 +    if (lx & 8) {
3871 +      uint8x8_t p1 = *(uint8x8_t*)&pix1[x];
3872 +      uint8x8_t p2 = *(uint8x8_t*)&pix2[x];
3873 +      uint8x8_t p3 = *(uint8x8_t*)&pix3[x];
3874 +      uint8x8_t p4 = *(uint8x8_t*)&pix4[x];
3875 +      vsum16_0 = vabal_u8(vsum16_0,p1,p2);
3876 +      vsum16_1 = vabal_u8(vsum16_1,p1,p3);
3877 +      vsum16_2 = vabal_u8(vsum16_2,p1,p4);
3878 +      x += 8;
3879 +    }
3880 +    if (lx & 4) {
3881 +      uint32x2_t p1 = vdup_n_u32(0);
3882 +      p1[0] = *(uint32_t*)&pix1[x];
3883 +      uint32x2_t p2 = vdup_n_u32(0);
3884 +      p2[0] = *(uint32_t*)&pix2[x];
3885 +      uint32x2_t p3 = vdup_n_u32(0);
3886 +      p3[0] = *(uint32_t*)&pix3[x];
3887 +      uint32x2_t p4 = vdup_n_u32(0);
3888 +      p4[0] = *(uint32_t*)&pix4[x];
3889 +      vsum16_0 = vabal_u8(vsum16_0,p1,p2);
3890 +      vsum16_1 = vabal_u8(vsum16_1,p1,p3);
3891 +      vsum16_2 = vabal_u8(vsum16_2,p1,p4);
3892 +      x += 4;
3893 +    }
3894 +    if (lx >= 4) {
3895 +      res[0] += vaddvq_u16(vsum16_0);
3896 +      res[1] += vaddvq_u16(vsum16_1);
3897 +      res[2] += vaddvq_u16(vsum16_2);
3898 +    }
3899 +
3900 +#endif
3901 +    if (lx & 3) for (; x < lx; x++)
3902 +    {
3903 +      res[0] += abs(pix1[x] - pix2[x]);
3904 +      res[1] += abs(pix1[x] - pix3[x]);
3905 +      res[2] += abs(pix1[x] - pix4[x]);
3906 +    }
3907 +
3908 +    pix1 += FENC_STRIDE;
3909 +    pix2 += frefstride;
3910 +    pix3 += frefstride;
3911 +    pix4 += frefstride;
3912 +  }
3913 +}
3914 +
3915 +template<int lx, int ly>
3916 +void sad_x4_neon(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
3917 +{
3918 +  res[0] = 0;
3919 +  res[1] = 0;
3920 +  res[2] = 0;
3921 +  res[3] = 0;
3922 +  for (int y = 0; y < ly; y++)
3923 +  {
3924 +    int x=0;
3925 +    uint16x8_t vsum16_0 = vdupq_n_u16(0);
3926 +    uint16x8_t vsum16_1 = vdupq_n_u16(0);
3927 +    uint16x8_t vsum16_2 = vdupq_n_u16(0);
3928 +    uint16x8_t vsum16_3 = vdupq_n_u16(0);
3929 +#if HIGH_BIT_DEPTH
3930 +      for (; (x + 8) <= lx; x+=8) {
3931 +        uint16x8_t p1 = *(uint16x8_t*)&pix1[x];
3932 +        uint16x8_t p2 = *(uint16x8_t*)&pix2[x];
3933 +        uint16x8_t p3 = *(uint16x8_t*)&pix3[x];
3934 +        uint16x8_t p4 = *(uint16x8_t*)&pix4[x];
3935 +        uint16x8_t p5 = *(uint16x8_t*)&pix5[x];
3936 +        vsum16_0 = vabaq_s16(vsum16_0,p1,p2);
3937 +        vsum16_1 = vabaq_s16(vsum16_1,p1,p3);
3938 +        vsum16_2 = vabaq_s16(vsum16_2,p1,p4);
3939 +        vsum16_3 = vabaq_s16(vsum16_3,p1,p5);
3940 +
3941 +      }
3942 +      if (lx & 4) {
3943 +        uint16x4_t p1 = *(uint16x4_t*)&pix1[x];
3944 +        uint16x4_t p2 = *(uint16x4_t*)&pix2[x];
3945 +        uint16x4_t p3 = *(uint16x4_t*)&pix3[x];
3946 +        uint16x4_t p4 = *(uint16x4_t*)&pix4[x];
3947 +        uint16x4_t p5 = *(uint16x4_t*)&pix5[x];
3948 +        res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p2));
3949 +        res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p3));
3950 +        res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p4));
3951 +        res[3] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p5));
3952 +        x += 4;
3953 +      }
3954 +      if (lx >= 4) {
3955 +        res[0] += vaddlvq_s16(vsum16_0);
3956 +        res[1] += vaddlvq_s16(vsum16_1);
3957 +        res[2] += vaddlvq_s16(vsum16_2);
3958 +        res[3] += vaddlvq_s16(vsum16_3);
3959 +      }
3960 +
3961 +#else
3962 +    
3963 +    for (; (x + 16) <= lx; x+=16) {
3964 +      uint8x16_t p1 = *(uint8x16_t*)&pix1[x];
3965 +      uint8x16_t p2 = *(uint8x16_t*)&pix2[x];
3966 +      uint8x16_t p3 = *(uint8x16_t*)&pix3[x];
3967 +      uint8x16_t p4 = *(uint8x16_t*)&pix4[x];
3968 +      uint8x16_t p5 = *(uint8x16_t*)&pix5[x];
3969 +      vsum16_0 = vabal_u8(vsum16_0,vget_low_u8(p1),vget_low_u8(p2));
3970 +      vsum16_0 = vabal_high_u8(vsum16_0,p1,p2);
3971 +      vsum16_1 = vabal_u8(vsum16_1,vget_low_u8(p1),vget_low_u8(p3));
3972 +      vsum16_1 = vabal_high_u8(vsum16_1,p1,p3);
3973 +      vsum16_2 = vabal_u8(vsum16_2,vget_low_u8(p1),vget_low_u8(p4));
3974 +      vsum16_2 = vabal_high_u8(vsum16_2,p1,p4);
3975 +      vsum16_3 = vabal_u8(vsum16_3,vget_low_u8(p1),vget_low_u8(p5));
3976 +      vsum16_3 = vabal_high_u8(vsum16_3,p1,p5);
3977 +    }
3978 +    if (lx & 8) {
3979 +      uint8x8_t p1 = *(uint8x8_t*)&pix1[x];
3980 +      uint8x8_t p2 = *(uint8x8_t*)&pix2[x];
3981 +      uint8x8_t p3 = *(uint8x8_t*)&pix3[x];
3982 +      uint8x8_t p4 = *(uint8x8_t*)&pix4[x];
3983 +      uint8x8_t p5 = *(uint8x8_t*)&pix5[x];
3984 +      vsum16_0 = vabal_u8(vsum16_0,p1,p2);
3985 +      vsum16_1 = vabal_u8(vsum16_1,p1,p3);
3986 +      vsum16_2 = vabal_u8(vsum16_2,p1,p4);
3987 +      vsum16_3 = vabal_u8(vsum16_3,p1,p5);
3988 +      x += 8;
3989 +    }
3990 +    if (lx & 4) {
3991 +      uint32x2_t p1 = vdup_n_u32(0);
3992 +      p1[0] = *(uint32_t*)&pix1[x];
3993 +      uint32x2_t p2 = vdup_n_u32(0);
3994 +      p2[0] = *(uint32_t*)&pix2[x];
3995 +      uint32x2_t p3 = vdup_n_u32(0);
3996 +      p3[0] = *(uint32_t*)&pix3[x];
3997 +      uint32x2_t p4 = vdup_n_u32(0);
3998 +      p4[0] = *(uint32_t*)&pix4[x];
3999 +      uint32x2_t p5 = vdup_n_u32(0);
4000 +      p5[0] = *(uint32_t*)&pix5[x];
4001 +      vsum16_0 = vabal_u8(vsum16_0,p1,p2);
4002 +      vsum16_1 = vabal_u8(vsum16_1,p1,p3);
4003 +      vsum16_2 = vabal_u8(vsum16_2,p1,p4);
4004 +      vsum16_3 = vabal_u8(vsum16_3,p1,p5);
4005 +      x += 4;
4006 +    }
4007 +    if (lx >= 4) {
4008 +      res[0] += vaddvq_u16(vsum16_0);
4009 +      res[1] += vaddvq_u16(vsum16_1);
4010 +      res[2] += vaddvq_u16(vsum16_2);
4011 +      res[3] += vaddvq_u16(vsum16_3);
4012 +    }
4013 +
4014 +#endif
4015 +    if (lx & 3) for (; x < lx; x++)
4016 +    {
4017 +      res[0] += abs(pix1[x] - pix2[x]);
4018 +      res[1] += abs(pix1[x] - pix3[x]);
4019 +      res[2] += abs(pix1[x] - pix4[x]);
4020 +      res[3] += abs(pix1[x] - pix5[x]);
4021 +    }
4022 +
4023 +    pix1 += FENC_STRIDE;
4024 +    pix2 += frefstride;
4025 +    pix3 += frefstride;
4026 +    pix4 += frefstride;
4027 +    pix5 += frefstride;
4028 +  }
4029 +}
4030 +
4031 +
4032 +template<int lx, int ly, class T1, class T2>
4033 +sse_t sse_neon(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
4034 +{
4035 +    sse_t sum = 0;
4036 +
4037 +    int32x4_t vsum1 = vdupq_n_s32(0);
4038 +    int32x4_t vsum2 = vdupq_n_s32(0);
4039 +    for (int y = 0; y < ly; y++)
4040 +    {
4041 +      int x = 0;
4042 +        for (; (x+8) <= lx; x+=8)
4043 +        {
4044 +          int16x8_t tmp;
4045 +          if (sizeof(T1) == 2 && sizeof(T2) == 2) {
4046 +            tmp = vsubq_s16(*(int16x8_t *)&pix1[x],*(int16x8_t *)&pix2[x]);
4047 +          } else if (sizeof(T1) == 1 && sizeof(T2) == 1){
4048 +            tmp = vsubl_u8(*(uint8x8_t *)&pix1[x],*(uint8x8_t *)&pix2[x]);
4049 +          }
4050 +          else {
4051 +            X265_CHECK(false,"unsupported sse");
4052 +          }
4053 +          vsum1 = vmlal_s16(vsum1,vget_low_s16(tmp),vget_low_s16(tmp));
4054 +          vsum2 = vmlal_high_s16(vsum2,tmp,tmp);
4055 +        }
4056 +        for (; x < lx; x++)
4057 +        {
4058 +            int tmp = pix1[x] - pix2[x];
4059 +            sum += (tmp * tmp);
4060 +        }
4061 +        
4062 +        if (sizeof(T1) == 2 && sizeof(T2) == 2)
4063 +        {
4064 +            int32x4_t vsum = vaddq_u32(vsum1,vsum2);;
4065 +            sum += vaddvq_u32(vsum);
4066 +            vsum1 = vsum2 = vdupq_n_u16(0);
4067 +        }
4068 +
4069 +        pix1 += stride_pix1;
4070 +        pix2 += stride_pix2;
4071 +    }
4072 +    int32x4_t vsum = vaddq_u32(vsum1,vsum2);
4073 +
4074 +    return sum + vaddvq_u32(vsum);
4075 +}
4076 +
4077 +
4078 +template<int bx, int by>
4079 +void blockcopy_ps_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
4080 +{
4081 +    for (int y = 0; y < by; y++)
4082 +    {
4083 +      int x= 0;
4084 +      for (; (x + 8) <= bx; x+=8)
4085 +      {
4086 +#if HIGH_BIT_DEPTH
4087 +        *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
4088 +#else
4089 +        *(int16x8_t *)&a[x] = vmovl_u8(*(int8x8_t *)&b[x]);
4090 +#endif
4091 +      }
4092 +      for (; x < bx; x++) {
4093 +          a[x] = (int16_t)b[x];
4094 +      }
4095 +      
4096 +      a += stridea;
4097 +      b += strideb;
4098 +    }
4099 +}
4100 +
4101 +
4102 +template<int bx, int by>
4103 +void blockcopy_pp_neon(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
4104 +{
4105 +    for (int y = 0; y < by; y++)
4106 +    {
4107 +      int x = 0;
4108 +#if HIGH_BIT_DEPTH
4109 +      for (; (x + 8) <= bx; x+=8)
4110 +      {
4111 +        *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
4112 +      }
4113 +      if (bx & 4)
4114 +      {
4115 +        *(uint64_t *)&a[x] = *(uint64_t *)&b[x];
4116 +        x += 4;
4117 +      }
4118 +#else
4119 +      for (; (x + 16) <= bx; x+=16)
4120 +      {
4121 +        *(uint8x16_t *)&a[x] = *(uint8x16_t *)&b[x];
4122 +      }
4123 +      if (bx & 8)
4124 +      {
4125 +          *(uint8x8_t *)&a[x] = *(uint8x8_t *)&b[x];
4126 +          x += 8;
4127 +      }
4128 +      if (bx & 4)
4129 +      {
4130 +          *(uint32_t *)&a[x] = *(uint32_t *)&b[x];
4131 +          x += 4;
4132 +      }
4133 +#endif
4134 +      for (; x < bx; x++) {
4135 +          a[x] = b[x];
4136 +      }
4137 +
4138 +      a += stridea;
4139 +      b += strideb;
4140 +    }
4141 +}
4142 +
4143 +
4144 +template<int bx, int by>
4145 +void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
4146 +{
4147 +    for (int y = 0; y < by; y++)
4148 +    {
4149 +      int x = 0;
4150 +      for (; (x + 8) <= bx; x+=8) {
4151 +#if HIGH_BIT_DEPTH
4152 +        *(int16x8_t *)&a[x] = vsubq_s16(*(int16x8_t *)&b0[x], *(int16x8_t *)&b1[x]);
4153 +#else
4154 +        *(int16x8_t *)&a[x] = vsubl_u8(*(uint8x8_t *)&b0[x], *(uint8x8_t *)&b1[x]);
4155 +#endif
4156 +      }
4157 +      for (; x < bx; x++)
4158 +          a[x] = (int16_t)(b0[x] - b1[x]);
4159 +
4160 +        b0 += sstride0;
4161 +        b1 += sstride1;
4162 +        a += dstride;
4163 +    }
4164 +}
4165 +
4166 +template<int bx, int by>
4167 +void pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
4168 +{
4169 +        for (int y = 0; y < by; y++)
4170 +        {
4171 +          int x = 0;
4172 +          for (; (x + 8) <= bx; x+=8) {
4173 +            int16x8_t t;
4174 +            int16x8_t b1e = *(int16x8_t *)&b1[x];
4175 +            int16x8_t b0e;
4176 +#if HIGH_BIT_DEPTH
4177 +            b0e = *(int16x8_t *)&b0[x];
4178 +            t = vaddq_s16(b0e,b1e);
4179 +            t = vminq_s16(t,vdupq_n_s16((1 << X265_DEPTH) - 1));
4180 +            t = vmaxq_s16(t,vdupq_n_s16(0));
4181 +            *(int16x8_t *)&a[x] = t;
4182 +#else
4183 +            b0e = vmovl_u8(*(uint8x8_t *)&b0[x]);
4184 +            t = vaddq_s16(b0e,b1e);
4185 +            *(uint8x8_t *)&a[x] = vqmovun_s16(t);
4186 +#endif
4187 +          }
4188 +          for (; x < bx; x++)
4189 +              a[x] = (int16_t)x265_clip(b0[x] + b1[x]);
4190 +
4191 +          b0 += sstride0;
4192 +          b1 += sstride1;
4193 +          a += dstride;
4194 +        }
4195 +}
4196 +
4197 +template<int bx, int by>
4198 +void addAvg_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
4199 +{
4200 +
4201 +    const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
4202 +    const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
4203 +
4204 +    const int32x4_t addon = vdupq_n_s32(offset);
4205 +    for (int y = 0; y < by; y++)
4206 +    {
4207 +      int x = 0;
4208 +      
4209 +        for (; (x + 8) <= bx; x += 8)
4210 +        {
4211 +          int16x8_t in0 = *(int16x8_t*)&src0[x];
4212 +          int16x8_t in1 = *(int16x8_t*)&src1[x];
4213 +          int32x4_t t1 = vaddl_s16(vget_low_s16(in0),vget_low_s16(in1));
4214 +          int32x4_t t2 = vaddl_high_s16(in0,in1);
4215 +          t1 = vaddq_s32(t1,addon);
4216 +          t2 = vaddq_s32(t2,addon);
4217 +          t1 = vshrq_n_s32(t1,shiftNum);
4218 +          t2 = vshrq_n_s32(t2,shiftNum);
4219 +          int16x8_t t = vuzp1q_s16(t1,t2);
4220 +#if HIGH_BIT_DEPTH
4221 +          t = vminq_s16(t,vdupq_n_s16((1 << X265_DEPTH) - 1));
4222 +          t = vmaxq_s16(t,vdupq_n_s16(0));
4223 +          *(int16x8_t *)&dst[x] = t;
4224 +#else
4225 +          *(uint8x8_t *)&dst[x] = vqmovun_s16(t);
4226 +#endif
4227 +        }
4228 +      for (; x < bx; x += 2)
4229 +      {
4230 +          dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
4231 +          dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
4232 +      }
4233 +
4234 +        src0 += src0Stride;
4235 +        src1 += src1Stride;
4236 +        dst  += dstStride;
4237 +    }
4238 +}
4239 +
4240 +template<int lx, int ly>
4241 +void pixelavg_pp_neon(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
4242 +{
4243 +    for (int y = 0; y < ly; y++)
4244 +    {
4245 +      int x = 0;
4246 +      for (; (x+8) <= lx; x+=8) {
4247 +#if HIGH_BIT_DEPTH
4248 +        int16x8_t in0 = *(int16x8_t *)&src0[x];
4249 +        int16x8_t in1 = *(int16x8_t *)&src1[x];
4250 +        int16x8_t t = vaddq_s16(in0,in1);
4251 +        t = vaddq_s16(t,vdupq_n_s16(1));
4252 +        t = vshrq_n_s16(t,1);
4253 +        *(int16x8_t *)&dst[x] = t;
4254 +#else
4255 +        int16x8_t in0 = vmovl_u8(*(uint8x8_t *)&src0[x]);
4256 +        int16x8_t in1 = vmovl_u8(*(uint8x8_t *)&src1[x]);
4257 +        int16x8_t t = vaddq_s16(in0,in1);
4258 +        t = vaddq_s16(t,vdupq_n_s16(1));
4259 +        t = vshrq_n_s16(t,1);
4260 +        *(uint8x8_t *)&dst[x] = vmovn_u16(t);
4261 +#endif
4262 +      }
4263 +      for (; x < lx; x++)
4264 +          dst[x] = (src0[x] + src1[x] + 1) >> 1;
4265 +
4266 +      src0 += sstride0;
4267 +      src1 += sstride1;
4268 +      dst += dstride;
4269 +    }
4270 +}
4271 +
4272 +
4273 +template<int size>
4274 +void cpy1Dto2D_shl_neon(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4275 +{
4276 +    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
4277 +    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
4278 +    X265_CHECK(shift >= 0, "invalid shift\n");
4279 +
4280 +    for (int i = 0; i < size; i++)
4281 +    {
4282 +        int j = 0;
4283 +        for (; (j+8) <= size; j+=8)
4284 +        {
4285 +          *(int16x8_t *)&dst[j] = vshlq_s16(*(int16x8_t*)&src[j],vdupq_n_s16(shift));
4286 +        }
4287 +        for (; j < size; j++)
4288 +        {
4289 +            dst[j] = src[j] << shift;
4290 +        }
4291 +        src += size;
4292 +        dst += dstStride;
4293 +    }
4294 +}
4295 +
4296 +
4297 +template<int size>
4298 +uint64_t pixel_var_neon(const uint8_t* pix, intptr_t i_stride)
4299 +{
4300 +    uint32_t sum = 0, sqr = 0;
4301 +
4302 +    int32x4_t vsqr = vdupq_n_s32(0);
4303 +    for (int y = 0; y < size; y++)
4304 +    {
4305 +      int x = 0;
4306 +      int16x8_t vsum = vdupq_n_s16(0);
4307 +      for (; (x + 8) <= size; x+=8)
4308 +      {
4309 +        int16x8_t in;
4310 +        in = vmovl_u8(*(uint8x8_t*)&pix[x]);
4311 +        vsum = vaddq_u16(vsum,in);
4312 +        vsqr = vmlal_s16(vsqr,vget_low_s16(in),vget_low_s16(in));
4313 +        vsqr = vmlal_high_s16(vsqr,in,in);
4314 +      }
4315 +      for (; x < size; x++)
4316 +      {
4317 +          sum += pix[x];
4318 +          sqr += pix[x] * pix[x];
4319 +      }
4320 +      sum += vaddvq_s16(vsum);
4321 +
4322 +      pix += i_stride;
4323 +    }
4324 +    sqr += vaddvq_u32(vsqr);
4325 +    return sum + ((uint64_t)sqr << 32);
4326 +}
4327 +
4328 +template<int blockSize>
4329 +void getResidual_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
4330 +{
4331 +    for (int y = 0; y < blockSize; y++)
4332 +    {
4333 +      int x = 0;
4334 +      for (; (x + 8) < blockSize; x+=8) {
4335 +        int16x8_t vfenc,vpred;
4336 +#if HIGH_BIT_DEPTH
4337 +        vfenc = *(int16x8_t *)&fenc[x];
4338 +        vpred = *(int16x8_t *)&pred[x];
4339 +#else
4340 +        vfenc = vmovl_u8(*(uint8x8_t *)&fenc[x]);
4341 +        vpred = vmovl_u8(*(uint8x8_t *)&pred[x]);
4342 +#endif
4343 +        *(int16x8_t*)&residual[x] = vsubq_s16(vfenc,vpred);
4344 +      }
4345 +      for (; x < blockSize; x++) {
4346 +            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
4347 +      }
4348 +      fenc += stride;
4349 +      residual += stride;
4350 +      pred += stride;
4351 +  }
4352 +}
4353 +
4354 +#if 1//!(HIGH_BIT_DEPTH)
4355 +template<int size>
4356 +int psyCost_pp_neon(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
4357 +{
4358 +    static pixel zeroBuf[8] /* = { 0 } */;
4359 +
4360 +    if (size)
4361 +    {
4362 +        int dim = 1 << (size + 2);
4363 +        uint32_t totEnergy = 0;
4364 +        for (int i = 0; i < dim; i += 8)
4365 +        {
4366 +            for (int j = 0; j < dim; j+= 8)
4367 +            {
4368 +                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
4369 +                int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
4370 +                                   (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
4371 +                int reconEnergy =  pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
4372 +                                   (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
4373 +
4374 +                totEnergy += abs(sourceEnergy - reconEnergy);
4375 +            }
4376 +        }
4377 +        return totEnergy;
4378 +    }
4379 +    else
4380 +    {
4381 +        /* 4x4 is too small for sa8d */
4382 +        int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf, 0) >> 2);
4383 +        int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
4384 +        return abs(sourceEnergy - reconEnergy);
4385 +    }
4386 +}
4387 +
4388 +
4389 +template<int w, int h>
4390 +// Calculate sa8d in blocks of 8x8
4391 +int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
4392 +{
4393 +    int cost = 0;
4394 +
4395 +    for (int y = 0; y < h; y += 8)
4396 +        for (int x = 0; x < w; x += 8)
4397 +            cost += pixel_sa8d_8x8_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
4398 +
4399 +    return cost;
4400 +}
4401 +
4402 +template<int w, int h>
4403 +// Calculate sa8d in blocks of 16x16
4404 +int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
4405 +{
4406 +    int cost = 0;
4407 +
4408 +    for (int y = 0; y < h; y += 16)
4409 +        for (int x = 0; x < w; x += 16)
4410 +            cost += pixel_sa8d_16x16_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
4411 +
4412 +    return cost;
4413 +}
4414 +#endif
4415 +
4416 +template<int size>
4417 +void cpy2Dto1D_shl_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
4418 +{
4419 +    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
4420 +    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
4421 +    X265_CHECK(shift >= 0, "invalid shift\n");
4422 +
4423 +    for (int i = 0; i < size; i++)
4424 +    {
4425 +        for (int j = 0; j < size; j++)
4426 +            dst[j] = src[j] << shift;
4427 +
4428 +        src += srcStride;
4429 +        dst += size;
4430 +    }
4431 +}
4432 +
4433 +
4434 +#if 1//!(HIGH_BIT_DEPTH)
4435 +template<int w, int h>
4436 +// calculate satd in blocks of 4x4
4437 +int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
4438 +{
4439 +    int satd = 0;
4440 +
4441 +    for (int row = 0; row < h; row += 4)
4442 +        for (int col = 0; col < w; col += 4)
4443 +            satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4444 +                             pix2 + row * stride_pix2 + col, stride_pix2);
4445 +
4446 +    return satd;
4447 +}
4448 +
4449 +template<int w, int h>
4450 +// calculate satd in blocks of 8x4
4451 +int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
4452 +{
4453 +    int satd = 0;
4454 +
4455 +    if (((w | h) & 15) == 0)
4456 +    {
4457 +        for (int row = 0; row < h; row += 16)
4458 +            for (int col = 0; col < w; col += 16)
4459 +                satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4460 +                                            pix2 + row * stride_pix2 + col, stride_pix2);
4461 +
4462 +    }
4463 +    else
4464 +    if (((w | h) & 7) == 0)
4465 +    {
4466 +        for (int row = 0; row < h; row += 8)
4467 +            for (int col = 0; col < w; col += 8)
4468 +                satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4469 +                                            pix2 + row * stride_pix2 + col, stride_pix2);
4470 +
4471 +    }
4472 +    else
4473 +    {
4474 +        for (int row = 0; row < h; row += 4)
4475 +            for (int col = 0; col < w; col += 8)
4476 +                satd += pixel_satd_8x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4477 +                                            pix2 + row * stride_pix2 + col, stride_pix2);
4478 +    }
4479 +
4480 +    return satd;
4481 +}
4482 +#endif
4483 +
4484 +
4485 +template<int blockSize>
4486 +void transpose_neon(pixel* dst, const pixel* src, intptr_t stride)
4487 +{
4488 +    for (int k = 0; k < blockSize; k++)
4489 +        for (int l = 0; l < blockSize; l++)
4490 +            dst[k * blockSize + l] = src[l * stride + k];
4491 +}
4492 +
4493 +
4494 +template<>
4495 +void transpose_neon<8>(pixel* dst, const pixel* src, intptr_t stride)
4496 +{
4497 +    transpose8x8(dst,src,8,stride);
4498 +}
4499 +
4500 +template<>
4501 +void transpose_neon<16>(pixel* dst, const pixel* src, intptr_t stride)
4502 +{
4503 +    transpose16x16(dst,src,16,stride);
4504 +}
4505 +
4506 +template<>
4507 +void transpose_neon<32>(pixel* dst, const pixel* src, intptr_t stride)
4508 +{
4509 +    transpose32x32(dst,src,32,stride);
4510 +}
4511 +
4512 +
4513 +template<>
4514 +void transpose_neon<64>(pixel* dst, const pixel* src, intptr_t stride)
4515 +{
4516 +    transpose32x32(dst,src,64,stride);
4517 +    transpose32x32(dst+32*64+32,src+32*stride+32,64,stride);
4518 +    transpose32x32(dst+32*64,src+32,64,stride);
4519 +    transpose32x32(dst+32,src+32*stride,64,stride);
4520 +}
4521 +
4522 +
4523 +template<int size>
4524 +sse_t pixel_ssd_s_neon(const int16_t* a, intptr_t dstride)
4525 +{
4526 +    sse_t sum = 0;
4527 +    
4528 +    
4529 +  int32x4_t vsum = vdupq_n_s32(0);
4530 +
4531 +    for (int y = 0; y < size; y++)
4532 +    {
4533 +      int x = 0;
4534 +      
4535 +      for (; (x + 8) <= size; x+=8) {
4536 +        int16x8_t in = *(int16x8_t*)&a[x];
4537 +        vsum = vmlal_s16(vsum,vget_low_s16(in),vget_low_s16(in));
4538 +        vsum = vmlal_high_s16(vsum,(in),(in));
4539 +      }
4540 +      for (; x < size; x++) {
4541 +            sum += a[x] * a[x];
4542 +      }
4543 +
4544 +        a += dstride;
4545 +    }
4546 +    return sum + vaddvq_s32(vsum);
4547 +}
4548 +
4549 +
4550 +};
4551 +
4552 +
4553 +
4554 +
4555 +namespace X265_NS {
4556 +   
4557 +  
4558 +void setupPixelPrimitives_neon(EncoderPrimitives &p)
4559 +{
4560 +  #define LUMA_PU(W, H) \
4561 +      p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4562 +      p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
4563 +      p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
4564 +      p.pu[LUMA_ ## W ## x ## H].sad = sad_pp_neon<W, H>; \
4565 +      p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_neon<W, H>; \
4566 +      p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
4567 +      p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
4568 +      p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
4569 +  
4570 +#if !(HIGH_BIT_DEPTH)
4571 +
4572 +#define LUMA_CU(W, H) \
4573 +      p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_neon<W, H>; \
4574 +      p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED]    = pixel_add_ps_neon<W, H>; \
4575 +      p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
4576 +      p.cu[BLOCK_ ## W ## x ## H].copy_ps       = blockcopy_ps_neon<W, H>; \
4577 +      p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
4578 +      p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>;  \
4579 +      p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED]    = blockfill_s_neon<W>;  \
4580 +      p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
4581 +      p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
4582 +      p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
4583 +      p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
4584 +      p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose_neon<W>; \
4585 +      p.cu[BLOCK_ ## W ## x ## H].var           = pixel_var_neon<W>; \
4586 +      p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED]  = getResidual_neon<W>; \
4587 +      p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED]     = getResidual_neon<W>; \
4588 +
4589 +#else
4590 +    
4591 +    #define LUMA_CU(W, H) \
4592 +    p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_neon<W, H>; \
4593 +    p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED]    = pixel_add_ps_neon<W, H>; \
4594 +    p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
4595 +    p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
4596 +    p.cu[BLOCK_ ## W ## x ## H].copy_ps       = blockcopy_ps_neon<W, H>; \
4597 +    p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
4598 +    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>;  \
4599 +    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED]    = blockfill_s_neon<W>;  \
4600 +    p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
4601 +    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
4602 +    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
4603 +    p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
4604 +    p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose_neon<W>; \
4605 +    /*p.cu[BLOCK_ ## W ## x ## H].var           = pixel_var_neon<W>;*/ \
4606 +    p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED]  = getResidual_neon<W>; \
4607 +    p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED]     = getResidual_neon<W>; \
4608 +
4609 +    
4610 +    
4611 +#endif
4612 +    
4613 +    
4614 +      LUMA_PU(4, 4);
4615 +      LUMA_PU(8, 8);
4616 +      LUMA_PU(16, 16);
4617 +      LUMA_PU(32, 32);
4618 +      LUMA_PU(64, 64);
4619 +      LUMA_PU(4, 8);
4620 +      LUMA_PU(8, 4);
4621 +      LUMA_PU(16,  8);
4622 +      LUMA_PU(8, 16);
4623 +      LUMA_PU(16, 12);
4624 +      LUMA_PU(12, 16);
4625 +      LUMA_PU(16,  4);
4626 +      LUMA_PU(4, 16);
4627 +      LUMA_PU(32, 16);
4628 +      LUMA_PU(16, 32);
4629 +      LUMA_PU(32, 24);
4630 +      LUMA_PU(24, 32);
4631 +      LUMA_PU(32,  8);
4632 +      LUMA_PU(8, 32);
4633 +      LUMA_PU(64, 32);
4634 +      LUMA_PU(32, 64);
4635 +      LUMA_PU(64, 48);
4636 +      LUMA_PU(48, 64);
4637 +      LUMA_PU(64, 16);
4638 +      LUMA_PU(16, 64);
4639 +
4640 +      p.pu[LUMA_4x4].satd   = pixel_satd_4x4_neon;
4641 +      p.pu[LUMA_8x8].satd   = satd8<8, 8>;
4642 +      p.pu[LUMA_8x4].satd   = pixel_satd_8x4_neon;
4643 +      p.pu[LUMA_4x8].satd   = satd4<4, 8>;
4644 +      p.pu[LUMA_16x16].satd = satd8<16, 16>;
4645 +      p.pu[LUMA_16x8].satd  = satd8<16, 8>;
4646 +      p.pu[LUMA_8x16].satd  = satd8<8, 16>;
4647 +      p.pu[LUMA_16x12].satd = satd8<16, 12>;
4648 +      p.pu[LUMA_12x16].satd = satd4<12, 16>;
4649 +      p.pu[LUMA_16x4].satd  = satd8<16, 4>;
4650 +      p.pu[LUMA_4x16].satd  = satd4<4, 16>;
4651 +      p.pu[LUMA_32x32].satd = satd8<32, 32>;
4652 +      p.pu[LUMA_32x16].satd = satd8<32, 16>;
4653 +      p.pu[LUMA_16x32].satd = satd8<16, 32>;
4654 +      p.pu[LUMA_32x24].satd = satd8<32, 24>;
4655 +      p.pu[LUMA_24x32].satd = satd8<24, 32>;
4656 +      p.pu[LUMA_32x8].satd  = satd8<32, 8>;
4657 +      p.pu[LUMA_8x32].satd  = satd8<8, 32>;
4658 +      p.pu[LUMA_64x64].satd = satd8<64, 64>;
4659 +      p.pu[LUMA_64x32].satd = satd8<64, 32>;
4660 +      p.pu[LUMA_32x64].satd = satd8<32, 64>;
4661 +      p.pu[LUMA_64x48].satd = satd8<64, 48>;
4662 +      p.pu[LUMA_48x64].satd = satd8<48, 64>;
4663 +      p.pu[LUMA_64x16].satd = satd8<64, 16>;
4664 +      p.pu[LUMA_16x64].satd = satd8<16, 64>;
4665 +
4666 +    
4667 +      LUMA_CU(4, 4);
4668 +      LUMA_CU(8, 8);
4669 +      LUMA_CU(16, 16);
4670 +      LUMA_CU(32, 32);
4671 +      LUMA_CU(64, 64);
4672 +
4673 +
4674 +      p.cu[BLOCK_4x4].sa8d   = pixel_satd_4x4_neon;
4675 +      p.cu[BLOCK_8x8].sa8d   = pixel_sa8d_8x8_neon;
4676 +      p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
4677 +      p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
4678 +      p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
4679 +
4680 +    
4681 +  #define CHROMA_PU_420(W, H) \
4682 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[NONALIGNED]  = addAvg_neon<W, H>;         \
4683 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[ALIGNED]  = addAvg_neon<W, H>;         \
4684 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4685 +
4686 +
4687 +      CHROMA_PU_420(4, 4);
4688 +      CHROMA_PU_420(8, 8);
4689 +      CHROMA_PU_420(16, 16);
4690 +      CHROMA_PU_420(32, 32);
4691 +      CHROMA_PU_420(4, 2);
4692 +      CHROMA_PU_420(8, 4);
4693 +      CHROMA_PU_420(4, 8);
4694 +      CHROMA_PU_420(8, 6);
4695 +      CHROMA_PU_420(6, 8);
4696 +      CHROMA_PU_420(8, 2);
4697 +      CHROMA_PU_420(2, 8);
4698 +      CHROMA_PU_420(16, 8);
4699 +      CHROMA_PU_420(8,  16);
4700 +      CHROMA_PU_420(16, 12);
4701 +      CHROMA_PU_420(12, 16);
4702 +      CHROMA_PU_420(16, 4);
4703 +      CHROMA_PU_420(4,  16);
4704 +      CHROMA_PU_420(32, 16);
4705 +      CHROMA_PU_420(16, 32);
4706 +      CHROMA_PU_420(32, 24);
4707 +      CHROMA_PU_420(24, 32);
4708 +      CHROMA_PU_420(32, 8);
4709 +      CHROMA_PU_420(8,  32);
4710 +
4711 +    
4712 +
4713 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd   = NULL;
4714 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = pixel_satd_4x4_neon;
4715 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd8<8, 8>;
4716 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8<16, 16>;
4717 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8<32, 32>;
4718 +
4719 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd   = NULL;
4720 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd   = NULL;
4721 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = pixel_satd_8x4_neon;
4722 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = satd4<4, 8>;
4723 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd8<16, 8>;
4724 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd8<8, 16>;
4725 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8<32, 16>;
4726 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8<16, 32>;
4727 +
4728 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd   = NULL;
4729 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd   = NULL;
4730 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd   = NULL;
4731 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd   = NULL;
4732 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4<16, 12>;
4733 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4<12, 16>;
4734 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd4<16, 4>;
4735 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = satd4<4, 16>;
4736 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8<32, 24>;
4737 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8<24, 32>;
4738 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd8<32, 8>;
4739 +      p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd8<8, 32>;
4740 +
4741 +    
4742 +  #define CHROMA_CU_420(W, H) \
4743 +      p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp  = sse_neon<W, H, pixel, pixel>; \
4744 +      p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4745 +      p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
4746 +      p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>;  \
4747 +      p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
4748 +      p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
4749 +
4750 +
4751 +      CHROMA_CU_420(4, 4)
4752 +      CHROMA_CU_420(8, 8)
4753 +      CHROMA_CU_420(16, 16)
4754 +      CHROMA_CU_420(32, 32)
4755 +
4756 +
4757 +      p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
4758 +      p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
4759 +      p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
4760 +      p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
4761 +
4762 +    
4763 +  #define CHROMA_PU_422(W, H) \
4764 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[NONALIGNED]  = addAvg_neon<W, H>;         \
4765 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[ALIGNED]  = addAvg_neon<W, H>;         \
4766 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4767 +
4768 +
4769 +      CHROMA_PU_422(4, 8);
4770 +      CHROMA_PU_422(8, 16);
4771 +      CHROMA_PU_422(16, 32);
4772 +      CHROMA_PU_422(32, 64);
4773 +      CHROMA_PU_422(4, 4);
4774 +      CHROMA_PU_422(2, 8);
4775 +      CHROMA_PU_422(8, 8);
4776 +      CHROMA_PU_422(4, 16);
4777 +      CHROMA_PU_422(8, 12);
4778 +      CHROMA_PU_422(6, 16);
4779 +      CHROMA_PU_422(8, 4);
4780 +      CHROMA_PU_422(2, 16);
4781 +      CHROMA_PU_422(16, 16);
4782 +      CHROMA_PU_422(8, 32);
4783 +      CHROMA_PU_422(16, 24);
4784 +      CHROMA_PU_422(12, 32);
4785 +      CHROMA_PU_422(16, 8);
4786 +      CHROMA_PU_422(4,  32);
4787 +      CHROMA_PU_422(32, 32);
4788 +      CHROMA_PU_422(16, 64);
4789 +      CHROMA_PU_422(32, 48);
4790 +      CHROMA_PU_422(24, 64);
4791 +      CHROMA_PU_422(32, 16);
4792 +      CHROMA_PU_422(8,  64);
4793 +
4794 +
4795 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd   = NULL;
4796 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd4<4, 8>;
4797 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd8<8, 16>;
4798 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8<16, 32>;
4799 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8<32, 64>;
4800 +
4801 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = pixel_satd_4x4_neon;
4802 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd   = NULL;
4803 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = satd8<8, 8>;
4804 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd4<4, 16>;
4805 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8<16, 16>;
4806 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = satd8<8, 32>;
4807 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8<32, 32>;
4808 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8<16, 64>;
4809 +
4810 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd4<8, 12>;
4811 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd  = NULL;
4812 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd4<8, 4>;
4813 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd  = NULL;
4814 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8<16, 24>;
4815 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4<12, 32>;
4816 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd8<16, 8>;
4817 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd4<4, 32>;
4818 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8<32, 48>;
4819 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8<24, 64>;
4820 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8<32, 16>;
4821 +      p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = satd8<8, 64>;
4822 +
4823 +    
4824 +  #define CHROMA_CU_422(W, H) \
4825 +      p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp  = sse_neon<W, H, pixel, pixel>;  \
4826 +      p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4827 +      p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
4828 +      p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
4829 +      p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
4830 +      p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
4831 +
4832 +
4833 +      CHROMA_CU_422(4, 8)
4834 +      CHROMA_CU_422(8, 16)
4835 +      CHROMA_CU_422(16, 32)
4836 +      CHROMA_CU_422(32, 64)
4837 +
4838 +      p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
4839 +      p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
4840 +      p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
4841 +      p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
4842 +
4843 +    
4844 +}
4845 +  
4846 +  
4847 +}
4848 +
4849 +
4850 +#endif
4851 +
4852 diff -Naur ./source/common/arm64/pixel-prim.h ../x265_apple_patch/source/common/arm64/pixel-prim.h
4853 --- ./source/common/arm64/pixel-prim.h  1970-01-01 01:00:00.000000000 +0100
4854 +++ ../x265_apple_patch/source/common/arm64/pixel-prim.h        2021-05-08 13:08:01.000000000 +0100
4855 @@ -0,0 +1,22 @@
4856 +#ifndef PIXEL_PRIM_NEON_H__
4857 +#define PIXEL_PRIM_NEON_H__
4858 +
4859 +#include "common.h"
4860 +#include "slicetype.h"      // LOWRES_COST_MASK
4861 +#include "primitives.h"
4862 +#include "x265.h"
4863 +
4864 +
4865 +
4866 +namespace X265_NS {
4867 +  
4868 +  
4869 +  
4870 +void setupPixelPrimitives_neon(EncoderPrimitives &p);
4871 +  
4872 +  
4873 +}
4874 +
4875 +
4876 +#endif
4877 +
4878 diff -Naur ./source/common/arm64/pixel.h ../x265_apple_patch/source/common/arm64/pixel.h
4879 --- ./source/common/arm64/pixel.h       1970-01-01 01:00:00.000000000 +0100
4880 +++ ../x265_apple_patch/source/common/arm64/pixel.h     2021-05-08 13:08:01.000000000 +0100
4881 @@ -0,0 +1,134 @@
4882 +/*****************************************************************************
4883 + * pixel.h: aarch64 pixel metrics
4884 + *****************************************************************************
4885 + * Copyright (C) 2009-2019 x265 project
4886 + *
4887 + * Authors: David Conrad <lessen42@gmail.com>
4888 + *          Janne Grunau <janne-x265@jannau.net>
4889 + *
4890 + * This program is free software; you can redistribute it and/or modify
4891 + * it under the terms of the GNU General Public License as published by
4892 + * the Free Software Foundation; either version 2 of the License, or
4893 + * (at your option) any later version.
4894 + *
4895 + * This program is distributed in the hope that it will be useful,
4896 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
4897 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
4898 + * GNU General Public License for more details.
4899 + *
4900 + * You should have received a copy of the GNU General Public License
4901 + * along with this program; if not, write to the Free Software
4902 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
4903 + *
4904 + * This program is also available under a commercial proprietary license.
4905 + * For more information, contact us at licensing@x265.com.
4906 + *****************************************************************************/
4907 +
4908 +#ifndef x265_AARCH64_PIXEL_H
4909 +#define x265_AARCH64_PIXEL_H
4910 +
4911 +#define x265_pixel_sad_16x16_neon x265_template(pixel_sad_16x16_neon)
4912 +#define x265_pixel_sad_16x8_neon x265_template(pixel_sad_16x8_neon)
4913 +#define x265_pixel_sad_4x16_neon x265_template(pixel_sad_4x16_neon)
4914 +#define x265_pixel_sad_4x4_neon x265_template(pixel_sad_4x4_neon)
4915 +#define x265_pixel_sad_4x8_neon x265_template(pixel_sad_4x8_neon)
4916 +#define x265_pixel_sad_8x16_neon x265_template(pixel_sad_8x16_neon)
4917 +#define x265_pixel_sad_8x4_neon x265_template(pixel_sad_8x4_neon)
4918 +#define x265_pixel_sad_8x8_neon x265_template(pixel_sad_8x8_neon)
4919 +#define x265_pixel_sad_x3_16x16_neon x265_template(pixel_sad_x3_16x16_neon)
4920 +#define x265_pixel_sad_x3_16x8_neon x265_template(pixel_sad_x3_16x8_neon)
4921 +#define x265_pixel_sad_x3_4x4_neon x265_template(pixel_sad_x3_4x4_neon)
4922 +#define x265_pixel_sad_x3_4x8_neon x265_template(pixel_sad_x3_4x8_neon)
4923 +#define x265_pixel_sad_x3_8x16_neon x265_template(pixel_sad_x3_8x16_neon)
4924 +#define x265_pixel_sad_x3_8x4_neon x265_template(pixel_sad_x3_8x4_neon)
4925 +#define x265_pixel_sad_x3_8x8_neon x265_template(pixel_sad_x3_8x8_neon)
4926 +#define x265_pixel_sad_x4_16x16_neon x265_template(pixel_sad_x4_16x16_neon)
4927 +#define x265_pixel_sad_x4_16x8_neon x265_template(pixel_sad_x4_16x8_neon)
4928 +#define x265_pixel_sad_x4_4x4_neon x265_template(pixel_sad_x4_4x4_neon)
4929 +#define x265_pixel_sad_x4_4x8_neon x265_template(pixel_sad_x4_4x8_neon)
4930 +#define x265_pixel_sad_x4_8x16_neon x265_template(pixel_sad_x4_8x16_neon)
4931 +#define x265_pixel_sad_x4_8x4_neon x265_template(pixel_sad_x4_8x4_neon)
4932 +#define x265_pixel_sad_x4_8x8_neon x265_template(pixel_sad_x4_8x8_neon)
4933 +#define x265_pixel_satd_16x16_neon x265_template(pixel_satd_16x16_neon)
4934 +#define x265_pixel_satd_16x8_neon x265_template(pixel_satd_16x8_neon)
4935 +#define x265_pixel_satd_4x16_neon x265_template(pixel_satd_4x16_neon)
4936 +#define x265_pixel_satd_4x4_neon x265_template(pixel_satd_4x4_neon)
4937 +#define x265_pixel_satd_4x8_neon x265_template(pixel_satd_4x8_neon)
4938 +#define x265_pixel_satd_8x16_neon x265_template(pixel_satd_8x16_neon)
4939 +#define x265_pixel_satd_8x4_neon x265_template(pixel_satd_8x4_neon)
4940 +#define x265_pixel_satd_8x8_neon x265_template(pixel_satd_8x8_neon)
4941 +#define x265_pixel_ssd_16x16_neon x265_template(pixel_ssd_16x16_neon)
4942 +#define x265_pixel_ssd_16x8_neon x265_template(pixel_ssd_16x8_neon)
4943 +#define x265_pixel_ssd_4x16_neon x265_template(pixel_ssd_4x16_neon)
4944 +#define x265_pixel_ssd_4x4_neon x265_template(pixel_ssd_4x4_neon)
4945 +#define x265_pixel_ssd_4x8_neon x265_template(pixel_ssd_4x8_neon)
4946 +#define x265_pixel_ssd_8x16_neon x265_template(pixel_ssd_8x16_neon)
4947 +#define x265_pixel_ssd_8x4_neon x265_template(pixel_ssd_8x4_neon)
4948 +#define x265_pixel_ssd_8x8_neon x265_template(pixel_ssd_8x8_neon)
4949 +#define DECL_PIXELS( ret, name, suffix, args ) \
4950 +    ret x265_pixel_##name##_16x16_##suffix args;\
4951 +    ret x265_pixel_##name##_16x8_##suffix args;\
4952 +    ret x265_pixel_##name##_8x16_##suffix args;\
4953 +    ret x265_pixel_##name##_8x8_##suffix args;\
4954 +    ret x265_pixel_##name##_8x4_##suffix args;\
4955 +    ret x265_pixel_##name##_4x16_##suffix args;\
4956 +    ret x265_pixel_##name##_4x8_##suffix args;\
4957 +    ret x265_pixel_##name##_4x4_##suffix args;\
4958 +
4959 +#define DECL_X1( name, suffix ) \
4960 +    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
4961 +
4962 +#define DECL_X4( name, suffix ) \
4963 +    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
4964 +    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
4965 +
4966 +DECL_X1( sad, neon )
4967 +DECL_X4( sad, neon )
4968 +DECL_X1( satd, neon )
4969 +DECL_X1( ssd, neon )
4970 +
4971 +
4972 +#define x265_pixel_ssd_nv12_core_neon x265_template(pixel_ssd_nv12_core_neon)
4973 +void x265_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
4974 +
4975 +#define x265_pixel_vsad_neon x265_template(pixel_vsad_neon)
4976 +int x265_pixel_vsad_neon( uint8_t *, intptr_t, int );
4977 +
4978 +#define x265_pixel_sa8d_8x8_neon x265_template(pixel_sa8d_8x8_neon)
4979 +int x265_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
4980 +#define x265_pixel_sa8d_16x16_neon x265_template(pixel_sa8d_16x16_neon)
4981 +int x265_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
4982 +#define x265_pixel_sa8d_satd_16x16_neon x265_template(pixel_sa8d_satd_16x16_neon)
4983 +uint64_t x265_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
4984 +
4985 +#define x265_pixel_var_8x8_neon x265_template(pixel_var_8x8_neon)
4986 +uint64_t x265_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
4987 +#define x265_pixel_var_8x16_neon x265_template(pixel_var_8x16_neon)
4988 +uint64_t x265_pixel_var_8x16_neon ( uint8_t *, intptr_t );
4989 +#define x265_pixel_var_16x16_neon x265_template(pixel_var_16x16_neon)
4990 +uint64_t x265_pixel_var_16x16_neon( uint8_t *, intptr_t );
4991 +#define x265_pixel_var2_8x8_neon x265_template(pixel_var2_8x8_neon)
4992 +int x265_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
4993 +#define x265_pixel_var2_8x16_neon x265_template(pixel_var2_8x16_neon)
4994 +int x265_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
4995 +
4996 +#define x265_pixel_hadamard_ac_8x8_neon x265_template(pixel_hadamard_ac_8x8_neon)
4997 +uint64_t x265_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
4998 +#define x265_pixel_hadamard_ac_8x16_neon x265_template(pixel_hadamard_ac_8x16_neon)
4999 +uint64_t x265_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
5000 +#define x265_pixel_hadamard_ac_16x8_neon x265_template(pixel_hadamard_ac_16x8_neon)
5001 +uint64_t x265_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
5002 +#define x265_pixel_hadamard_ac_16x16_neon x265_template(pixel_hadamard_ac_16x16_neon)
5003 +uint64_t x265_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
5004 +
5005 +#define x265_pixel_ssim_4x4x2_core_neon x265_template(pixel_ssim_4x4x2_core_neon)
5006 +void x265_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
5007 +                                      const uint8_t *, intptr_t,
5008 +                                      int sums[2][4] );
5009 +#define x265_pixel_ssim_end4_neon x265_template(pixel_ssim_end4_neon)
5010 +float x265_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
5011 +
5012 +#define x265_pixel_asd8_neon x265_template(pixel_asd8_neon)
5013 +int x265_pixel_asd8_neon( uint8_t *, intptr_t,  uint8_t *, intptr_t, int );
5014 +
5015 +#endif
5016 diff -Naur ./source/common/cpu.cpp ../x265_apple_patch/source/common/cpu.cpp
5017 --- ./source/common/cpu.cpp     2021-05-08 13:06:22.000000000 +0100
5018 +++ ../x265_apple_patch/source/common/cpu.cpp   2021-05-08 13:08:01.000000000 +0100
5019 @@ -104,7 +104,8 @@
5020      { "ARMv6",           X265_CPU_ARMV6 },
5021      { "NEON",            X265_CPU_NEON },
5022      { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
5023 -
5024 +#elif X265_ARCH_ARM64
5025 +    { "NEON",            X265_CPU_NEON },
5026  #elif X265_ARCH_POWER8
5027      { "Altivec",         X265_CPU_ALTIVEC },
5028  
5029 @@ -374,6 +375,18 @@
5030  #endif // if HAVE_ARMV6
5031      return flags;
5032  }
5033 +#elif X265_ARCH_ARM64
5034 +
5035 +uint32_t cpu_detect(bool benableavx512)
5036 +{
5037 +    int flags = 0;
5038 +
5039 +#if HAVE_NEON
5040 +    flags |= X265_CPU_NEON;
5041 +#endif
5042 +
5043 +    return flags;
5044 +}
5045  
5046  #elif X265_ARCH_POWER8
5047  
5048 diff -Naur ./source/common/pixel.cpp ../x265_apple_patch/source/common/pixel.cpp
5049 --- ./source/common/pixel.cpp   2021-05-08 13:06:22.000000000 +0100
5050 +++ ../x265_apple_patch/source/common/pixel.cpp 2021-05-08 13:08:01.000000000 +0100
5051 @@ -266,7 +266,7 @@
5052  {
5053      int satd = 0;
5054  
5055 -#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
5056 +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && 0
5057      pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
5058  #endif
5059  
5060 @@ -284,7 +284,7 @@
5061  {
5062      int satd = 0;
5063  
5064 -#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
5065 +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && 0
5066      pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
5067  #endif
5068  
5069 diff -Naur ./source/common/version.cpp ../x265_apple_patch/source/common/version.cpp
5070 --- ./source/common/version.cpp 2021-05-08 13:06:22.000000000 +0100
5071 +++ ../x265_apple_patch/source/common/version.cpp       2021-05-08 13:47:38.000000000 +0100
5072 @@ -31,7 +31,7 @@
5073  
5074  #if defined(__clang__)
5075  #define COMPILEDBY  "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]"
5076 -#ifdef __IA64__
5077 +#if defined(__IA64__) || defined(__arm64__) || defined(__aarch64__)
5078  #define ONARCH    "[on 64-bit] "
5079  #else
5080  #define ONARCH    "[on 32-bit] "
5081 @@ -71,7 +71,7 @@
5082  #define ONOS    "[Unk-OS]"
5083  #endif
5084  
5085 -#if X86_64
5086 +#if X86_64 || __arm64__ || __aarch64__
5087  #define BITS    "[64 bit]"
5088  #else
5089  #define BITS    "[32 bit]"
5090 diff -Naur ./source/test/testharness.h ../x265_apple_patch/source/test/testharness.h
5091 --- ./source/test/testharness.h 2021-05-08 13:06:22.000000000 +0100
5092 +++ ../x265_apple_patch/source/test/testharness.h       2021-05-08 13:08:01.000000000 +0100
5093 @@ -64,7 +64,6 @@
5094  
5095      uint64_t m_rand;
5096  };
5097 -
5098  #ifdef _MSC_VER
5099  #include <intrin.h>
5100  #elif HAVE_RDTSC
5101 @@ -73,7 +72,7 @@
5102  #include <x86intrin.h>
5103  #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
5104  #include <arm_neon.h>
5105 -#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
5106 +#else
5107  /* fallback for older GCC/MinGW */
5108  static inline uint32_t __rdtsc(void)
5109  {
5110 @@ -90,6 +89,12 @@
5111  
5112      // TO-DO: replace clock() function with appropriate ARM cpu instructions
5113      a = clock();
5114 +#elif X265_ARCH_ARM64
5115 +    // TO-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
5116 +    // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
5117 +
5118 +    // TO-DO: replace clock() function with appropriate ARM cpu instructions
5119 +    a = clock();
5120  #endif
5121  #endif
5122      return a;
5123 @@ -140,7 +145,7 @@
5124   * needs an explicit asm check because it only sometimes crashes in normal use. */
5125  intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
5126  float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
5127 -#elif X265_ARCH_ARM == 0
5128 +#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
5129  #define PFX(stack_pagealign)(func, align) func()
5130  #endif
5131  
5132 diff -Naur ./source/test/testharness.h.orig ../x265_apple_patch/source/test/testharness.h.orig
5133 --- ./source/test/testharness.h.orig    1970-01-01 01:00:00.000000000 +0100
5134 +++ ../x265_apple_patch/source/test/testharness.h.orig  2021-05-08 13:08:01.000000000 +0100
5135 @@ -0,0 +1,184 @@
5136 +/*****************************************************************************
5137 + * Copyright (C) 2013-2020 MulticoreWare, Inc
5138 + *
5139 + * Authors: Steve Borho <steve@borho.org>
5140 + *          Min Chen <chenm003@163.com>
5141 + *          Yimeng Su <yimeng.su@huawei.com>
5142 + *
5143 + * This program is free software; you can redistribute it and/or modify
5144 + * it under the terms of the GNU General Public License as published by
5145 + * the Free Software Foundation; either version 2 of the License, or
5146 + * (at your option) any later version.
5147 + *
5148 + * This program is distributed in the hope that it will be useful,
5149 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
5150 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
5151 + * GNU General Public License for more details.
5152 + *
5153 + * You should have received a copy of the GNU General Public License
5154 + * along with this program; if not, write to the Free Software
5155 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
5156 + *
5157 + * This program is also available under a commercial proprietary license.
5158 + * For more information, contact us at license @ x265.com.
5159 + *****************************************************************************/
5160 +
5161 +#ifndef _TESTHARNESS_H_
5162 +#define _TESTHARNESS_H_ 1
5163 +
5164 +#include "common.h"
5165 +#include "primitives.h"
5166 +
5167 +#if _MSC_VER
5168 +#pragma warning(disable: 4324) // structure was padded due to __declspec(align())
5169 +#endif
5170 +
5171 +#define PIXEL_MIN 0
5172 +#define SHORT_MAX  32767
5173 +#define SHORT_MIN -32767
5174 +#define UNSIGNED_SHORT_MAX 65535
5175 +
5176 +using namespace X265_NS;
5177 +
5178 +extern const char* lumaPartStr[NUM_PU_SIZES];
5179 +extern const char* const* chromaPartStr[X265_CSP_COUNT];
5180 +
5181 +class TestHarness
5182 +{
5183 +public:
5184 +
5185 +    TestHarness() {}
5186 +
5187 +    virtual ~TestHarness() {}
5188 +
5189 +    virtual bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) = 0;
5190 +
5191 +    virtual void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) = 0;
5192 +
5193 +    virtual const char *getName() const = 0;
5194 +
5195 +protected:
5196 +
5197 +    /* Temporary variables for stack checks */
5198 +    int      m_ok;
5199 +
5200 +    uint64_t m_rand;
5201 +};
5202 +
5203 +#ifdef _MSC_VER
5204 +#include <intrin.h>
5205 +#elif HAVE_RDTSC
5206 +#include <intrin.h>
5207 +#elif (!defined(__APPLE__) && (defined (__GNUC__) && (defined(__x86_64__) || defined(__i386__))))
5208 +#include <x86intrin.h>
5209 +#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
5210 +#include <arm_neon.h>
5211 +#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
5212 +/* fallback for older GCC/MinGW */
5213 +static inline uint32_t __rdtsc(void)
5214 +{
5215 +    uint32_t a = 0;
5216 +
5217 +#if X265_ARCH_X86
5218 +    asm volatile("rdtsc" : "=a" (a) ::"edx");
5219 +#elif X265_ARCH_ARM
5220 +#if X265_ARCH_ARM64
5221 +    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
5222 +#else
5223 +    // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
5224 +    // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
5225 +
5226 +    // TO-DO: replace clock() function with appropriate ARM cpu instructions
5227 +    a = clock();
5228 +#endif
5229 +#endif
5230 +    return a;
5231 +}
5232 +#endif // ifdef _MSC_VER
5233 +
5234 +#define BENCH_RUNS 2000
5235 +
5236 +/* Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
5237 + * and discards invalid times. Repeats BENCH_RUNS times to get a good average.
5238 + * Then measures the C reference with BENCH_RUNS / 4 runs and reports X factor and average cycles.*/
5239 +#define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) \
5240 +    { \
5241 +        uint32_t cycles = 0; int runs = 0; \
5242 +        RUNOPT(__VA_ARGS__); \
5243 +        for (int ti = 0; ti < BENCH_RUNS; ti++) { \
5244 +            uint32_t t0 = (uint32_t)__rdtsc(); \
5245 +            RUNOPT(__VA_ARGS__); \
5246 +            RUNOPT(__VA_ARGS__); \
5247 +            RUNOPT(__VA_ARGS__); \
5248 +            RUNOPT(__VA_ARGS__); \
5249 +            uint32_t t1 = (uint32_t)__rdtsc() - t0; \
5250 +            if (t1 * runs <= cycles * 4 && ti > 0) { cycles += t1; runs++; } \
5251 +        } \
5252 +        uint32_t refcycles = 0; int refruns = 0; \
5253 +        RUNREF(__VA_ARGS__); \
5254 +        for (int ti = 0; ti < BENCH_RUNS / 4; ti++) { \
5255 +            uint32_t t0 = (uint32_t)__rdtsc(); \
5256 +            RUNREF(__VA_ARGS__); \
5257 +            RUNREF(__VA_ARGS__); \
5258 +            RUNREF(__VA_ARGS__); \
5259 +            RUNREF(__VA_ARGS__); \
5260 +            uint32_t t1 = (uint32_t)__rdtsc() - t0; \
5261 +            if (t1 * refruns <= refcycles * 4 && ti > 0) { refcycles += t1; refruns++; } \
5262 +        } \
5263 +        x265_emms(); \
5264 +        float optperf = (10.0f * cycles / runs) / 4; \
5265 +        float refperf = (10.0f * refcycles / refruns) / 4; \
5266 +        printf("\t%3.2fx ", refperf / optperf); \
5267 +        printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
5268 +    }
5269 +
5270 +extern "C" {
5271 +#if X265_ARCH_X86
5272 +int PFX(stack_pagealign)(int (*func)(), int align);
5273 +
5274 +/* detect when callee-saved regs aren't saved
5275 + * needs an explicit asm check because it only sometimes crashes in normal use. */
5276 +intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
5277 +float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
5278 +#elif X265_ARCH_ARM == 0
5279 +#define PFX(stack_pagealign)(func, align) func()
5280 +#endif
5281 +
5282 +#if X86_64
5283 +
5284 +/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
5285 + * This is done by clobbering the stack with junk around the stack pointer and calling the
5286 + * assembly function through x265_checkasm_call with added dummy arguments which forces all
5287 + * real arguments to be passed on the stack and not in registers. For 32-bit argument the
5288 + * upper half of the 64-bit register location on the stack will now contain junk. Note that
5289 + * this is dependent on compiler behavior and that interrupts etc. at the wrong time may
5290 + * overwrite the junk written to the stack so there's no guarantee that it will always
5291 + * detect all functions that assumes zero-extension.
5292 + */
5293 +void PFX(checkasm_stack_clobber)(uint64_t clobber, ...);
5294 +#define checked(func, ...) ( \
5295 +        m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
5296 +        PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5297 +                                    m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5298 +                                    m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
5299 +        PFX(checkasm_call)((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
5300 +
5301 +#define checked_float(func, ...) ( \
5302 +        m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
5303 +        PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5304 +                                    m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5305 +                                    m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
5306 +        PFX(checkasm_call_float)((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
5307 +#define reportfail() if (!m_ok) { fflush(stdout); fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); }
5308 +#elif ARCH_X86
5309 +#define checked(func, ...) PFX(checkasm_call)((intptr_t(*)())func, &m_ok, __VA_ARGS__);
5310 +#define checked_float(func, ...) PFX(checkasm_call_float)((float(*)())func, &m_ok, __VA_ARGS__);
5311 +
5312 +#else // if X86_64
5313 +#define checked(func, ...) func(__VA_ARGS__)
5314 +#define checked_float(func, ...) func(__VA_ARGS__)
5315 +#define reportfail()
5316 +#endif // if X86_64
5317 +}
5318 +
5319 +#endif // ifndef _TESTHARNESS_H_