1 diff -Naur ./source/CMakeLists.txt ../x265_apple_patch/source/CMakeLists.txt
2 --- ./source/CMakeLists.txt 2021-05-08 13:06:22.000000000 +0100
3 +++ ../x265_apple_patch/source/CMakeLists.txt 2021-05-08 13:08:01.000000000 +0100
5 # System architecture detection
6 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
7 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
8 -set(ARM_ALIASES armv6l armv7l aarch64)
9 +set(ARM_ALIASES armv6l armv7l)
10 +set(ARM64_ALIASES arm64 arm64e aarch64)
11 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
12 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
13 +list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
14 set(POWER_ALIASES ppc64 ppc64le)
15 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
16 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
18 message(STATUS "Detected ARM target processor")
19 add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
21 +elseif(ARM64MATCH GREATER "-1")
22 + if(CROSS_COMPILE_ARM64)
23 + message(STATUS "Cross compiling for ARM64 arch")
25 + set(CROSS_COMPILE_ARM64 0)
27 + message(STATUS "Detected ARM64 target processor")
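+        # ARMv8-A (arm64/arm64e/aarch64) always provides NEON, so HAVE_NEON is defined unconditionally for this branch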
29 + add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
31 message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
32 message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
37 + if(ARM64 OR CROSS_COMPILE_ARM64)
38 + add_definitions(-DHAVE_NEON)
40 add_definitions(${ARM_ARGS})
47 -if(ARM OR CROSS_COMPILE_ARM)
48 +if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
49 option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
50 elseif(NASM_FOUND AND X86)
51 if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
53 ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
56 + elseif(ARM64 OR CROSS_COMPILE_ARM64)
57 +        # compile ARM64 arch asm files here
58 + enable_language(ASM)
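+        # any assembly files listed in ARM_ASMS are assembled from common/arm64 via the C++ compiler driver, so the same ARM_ARGS flags apply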
59 + foreach(ASM ${ARM_ASMS})
60 + set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm64/${ASM})
61 + list(APPEND ASM_SRCS ${ASM_SRC})
62 + list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
64 + OUTPUT ${ASM}.${SUFFIX}
65 + COMMAND ${CMAKE_CXX_COMPILER}
66 + ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
69 + elseif(ARM64 OR CROSS_COMPILE_ARM64)
70 +        # compile ARM64 arch asm files here
71 + enable_language(ASM)
72 + foreach(ASM ${ARM_ASMS})
73 + set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm64/${ASM})
74 + list(APPEND ASM_SRCS ${ASM_SRC})
75 + list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
77 + OUTPUT ${ASM}.${SUFFIX}
78 + COMMAND ${CMAKE_CXX_COMPILER}
79 + ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
83 # compile X86 arch asm files here
84 foreach(ASM ${MSVC_ASMS})
85 diff -Naur ./source/common/CMakeLists.txt ../x265_apple_patch/source/common/CMakeLists.txt
86 --- ./source/common/CMakeLists.txt 2021-05-08 13:06:22.000000000 +0100
87 +++ ../x265_apple_patch/source/common/CMakeLists.txt 2021-05-08 13:08:01.000000000 +0100
89 source_group(Assembly FILES ${ASM_PRIMITIVES})
90 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
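+# The ARM64 port implements its primitives as C++ NEON intrinsics rather than hand-written assembly, so the sources below are appended to ASM_PRIMITIVES and built as ordinary C++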
93 +if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
94 + set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h)
95 + enable_language(ASM)
96 +    # add ARM64 assembly/intrinsic files here
98 + #set(VEC_PRIMITIVES)
100 + #set(ARM64_ASMS "${A_SRCS}" CACHE INTERNAL "ARM64 Assembly Sources")
101 + foreach(SRC ${C_SRCS})
102 + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm64/${SRC})
104 + source_group(Assembly FILES ${ASM_PRIMITIVES})
105 +endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
109 set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
111 diff -Naur ./source/common/arm64/arm64-utils.cpp ../x265_apple_patch/source/common/arm64/arm64-utils.cpp
112 --- ./source/common/arm64/arm64-utils.cpp 1970-01-01 01:00:00.000000000 +0100
113 +++ ../x265_apple_patch/source/common/arm64/arm64-utils.cpp 2021-05-08 13:08:01.000000000 +0100
117 +#include "arm64-utils.h"
118 +#include <arm_neon.h>
120 +#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
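+// transpose8x8 (bytes): three vtrn rounds (32-bit, then 16-bit, then 8-bit lanes) reorder the eight rows into columns entirely within NEON registers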
125 +void transpose8x8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride)
127 + uint8x8_t a0,a1,a2,a3,a4,a5,a6,a7;
128 + uint8x8_t b0,b1,b2,b3,b4,b5,b6,b7;
130 + a0 = *(uint8x8_t *)(src + 0*sstride);
131 + a1 = *(uint8x8_t *)(src + 1*sstride);
132 + a2 = *(uint8x8_t *)(src + 2*sstride);
133 + a3 = *(uint8x8_t *)(src + 3*sstride);
134 + a4 = *(uint8x8_t *)(src + 4*sstride);
135 + a5 = *(uint8x8_t *)(src + 5*sstride);
136 + a6 = *(uint8x8_t *)(src + 6*sstride);
137 + a7 = *(uint8x8_t *)(src + 7*sstride);
139 + b0 = vtrn1_u32(a0,a4);
140 + b1 = vtrn1_u32(a1,a5);
141 + b2 = vtrn1_u32(a2,a6);
142 + b3 = vtrn1_u32(a3,a7);
143 + b4 = vtrn2_u32(a0,a4);
144 + b5 = vtrn2_u32(a1,a5);
145 + b6 = vtrn2_u32(a2,a6);
146 + b7 = vtrn2_u32(a3,a7);
148 + a0 = vtrn1_u16(b0,b2);
149 + a1 = vtrn1_u16(b1,b3);
150 + a2 = vtrn2_u16(b0,b2);
151 + a3 = vtrn2_u16(b1,b3);
152 + a4 = vtrn1_u16(b4,b6);
153 + a5 = vtrn1_u16(b5,b7);
154 + a6 = vtrn2_u16(b4,b6);
155 + a7 = vtrn2_u16(b5,b7);
157 + b0 = vtrn1_u8(a0,a1);
158 + b1 = vtrn2_u8(a0,a1);
159 + b2 = vtrn1_u8(a2,a3);
160 + b3 = vtrn2_u8(a2,a3);
161 + b4 = vtrn1_u8(a4,a5);
162 + b5 = vtrn2_u8(a4,a5);
163 + b6 = vtrn1_u8(a6,a7);
164 + b7 = vtrn2_u8(a6,a7);
166 + *(uint8x8_t *)(dst + 0*dstride) = b0;
167 + *(uint8x8_t *)(dst + 1*dstride) = b1;
168 + *(uint8x8_t *)(dst + 2*dstride) = b2;
169 + *(uint8x8_t *)(dst + 3*dstride) = b3;
170 + *(uint8x8_t *)(dst + 4*dstride) = b4;
171 + *(uint8x8_t *)(dst + 5*dstride) = b5;
172 + *(uint8x8_t *)(dst + 6*dstride) = b6;
173 + *(uint8x8_t *)(dst + 7*dstride) = b7;
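+// transpose16x16 (bytes): same approach with an extra 64-bit vtrn stage, since each row now fills a full 128-bit register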
181 +void transpose16x16(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride)
183 + uint16x8_t a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,aA,aB,aC,aD,aE,aF;
184 + uint16x8_t b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,bA,bB,bC,bD,bE,bF;
185 + uint16x8_t c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,cA,cB,cC,cD,cE,cF;
186 + uint16x8_t d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,dA,dB,dC,dD,dE,dF;
188 + a0 = *(uint16x8_t *)(src + 0*sstride);
189 + a1 = *(uint16x8_t *)(src + 1*sstride);
190 + a2 = *(uint16x8_t *)(src + 2*sstride);
191 + a3 = *(uint16x8_t *)(src + 3*sstride);
192 + a4 = *(uint16x8_t *)(src + 4*sstride);
193 + a5 = *(uint16x8_t *)(src + 5*sstride);
194 + a6 = *(uint16x8_t *)(src + 6*sstride);
195 + a7 = *(uint16x8_t *)(src + 7*sstride);
196 + a8 = *(uint16x8_t *)(src + 8*sstride);
197 + a9 = *(uint16x8_t *)(src + 9*sstride);
198 + aA = *(uint16x8_t *)(src + 10*sstride);
199 + aB = *(uint16x8_t *)(src + 11*sstride);
200 + aC = *(uint16x8_t *)(src + 12*sstride);
201 + aD = *(uint16x8_t *)(src + 13*sstride);
202 + aE = *(uint16x8_t *)(src + 14*sstride);
203 + aF = *(uint16x8_t *)(src + 15*sstride);
205 + b0 = vtrn1q_u64(a0, a8);
206 + b1 = vtrn1q_u64(a1, a9);
207 + b2 = vtrn1q_u64(a2, aA);
208 + b3 = vtrn1q_u64(a3, aB);
209 + b4 = vtrn1q_u64(a4, aC);
210 + b5 = vtrn1q_u64(a5, aD);
211 + b6 = vtrn1q_u64(a6, aE);
212 + b7 = vtrn1q_u64(a7, aF);
213 + b8 = vtrn2q_u64(a0, a8);
214 + b9 = vtrn2q_u64(a1, a9);
215 + bA = vtrn2q_u64(a2, aA);
216 + bB = vtrn2q_u64(a3, aB);
217 + bC = vtrn2q_u64(a4, aC);
218 + bD = vtrn2q_u64(a5, aD);
219 + bE = vtrn2q_u64(a6, aE);
220 + bF = vtrn2q_u64(a7, aF);
222 + c0 = vtrn1q_u32(b0, b4);
223 + c1 = vtrn1q_u32(b1, b5);
224 + c2 = vtrn1q_u32(b2, b6);
225 + c3 = vtrn1q_u32(b3, b7);
226 + c4 = vtrn2q_u32(b0, b4);
227 + c5 = vtrn2q_u32(b1, b5);
228 + c6 = vtrn2q_u32(b2, b6);
229 + c7 = vtrn2q_u32(b3, b7);
230 + c8 = vtrn1q_u32(b8, bC);
231 + c9 = vtrn1q_u32(b9, bD);
232 + cA = vtrn1q_u32(bA, bE);
233 + cB = vtrn1q_u32(bB, bF);
234 + cC = vtrn2q_u32(b8, bC);
235 + cD = vtrn2q_u32(b9, bD);
236 + cE = vtrn2q_u32(bA, bE);
237 + cF = vtrn2q_u32(bB, bF);
239 + d0 = vtrn1q_u16(c0, c2);
240 + d1 = vtrn1q_u16(c1, c3);
241 + d2 = vtrn2q_u16(c0, c2);
242 + d3 = vtrn2q_u16(c1, c3);
243 + d4 = vtrn1q_u16(c4, c6);
244 + d5 = vtrn1q_u16(c5, c7);
245 + d6 = vtrn2q_u16(c4, c6);
246 + d7 = vtrn2q_u16(c5, c7);
247 + d8 = vtrn1q_u16(c8, cA);
248 + d9 = vtrn1q_u16(c9, cB);
249 + dA = vtrn2q_u16(c8, cA);
250 + dB = vtrn2q_u16(c9, cB);
251 + dC = vtrn1q_u16(cC, cE);
252 + dD = vtrn1q_u16(cD, cF);
253 + dE = vtrn2q_u16(cC, cE);
254 + dF = vtrn2q_u16(cD, cF);
256 + *(uint16x8_t *)(dst + 0*dstride) = vtrn1q_u8(d0, d1);
257 + *(uint16x8_t *)(dst + 1*dstride) = vtrn2q_u8(d0, d1);
258 + *(uint16x8_t *)(dst + 2*dstride) = vtrn1q_u8(d2, d3);
259 + *(uint16x8_t *)(dst + 3*dstride) = vtrn2q_u8(d2, d3);
260 + *(uint16x8_t *)(dst + 4*dstride) = vtrn1q_u8(d4, d5);
261 + *(uint16x8_t *)(dst + 5*dstride) = vtrn2q_u8(d4, d5);
262 + *(uint16x8_t *)(dst + 6*dstride) = vtrn1q_u8(d6, d7);
263 + *(uint16x8_t *)(dst + 7*dstride) = vtrn2q_u8(d6, d7);
264 + *(uint16x8_t *)(dst + 8*dstride) = vtrn1q_u8(d8, d9);
265 + *(uint16x8_t *)(dst + 9*dstride) = vtrn2q_u8(d8, d9);
266 + *(uint16x8_t *)(dst + 10*dstride) = vtrn1q_u8(dA, dB);
267 + *(uint16x8_t *)(dst + 11*dstride) = vtrn2q_u8(dA, dB);
268 + *(uint16x8_t *)(dst + 12*dstride) = vtrn1q_u8(dC, dD);
269 + *(uint16x8_t *)(dst + 13*dstride) = vtrn2q_u8(dC, dD);
270 + *(uint16x8_t *)(dst + 14*dstride) = vtrn1q_u8(dE, dF);
271 + *(uint16x8_t *)(dst + 15*dstride) = vtrn2q_u8(dE, dF);
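+// transpose32x32 (bytes): composed of four 16x16 sub-transposes; the diagonal blocks are transposed in place and the off-diagonal blocks are transposed and swapped, with an aligned temporary in one branch, presumably so that dst may fully alias src (only partial overlap is excluded)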
277 +void transpose32x32(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride)
279 + //assumption: there is no partial overlap
280 + transpose16x16(dst,src,dstride,sstride);
281 + transpose16x16(dst+16*dstride+16,src+16*sstride+16,dstride,sstride);
284 + uint8_t tmp[16*16] __attribute__((aligned(64)));
285 + transpose16x16(tmp,src + 16,16,sstride);
286 + transpose16x16(dst + 16, src + 16*sstride,dstride,sstride);
287 + for (int i=0;i<16;i++) COPY_16(dst+(16 + i)*dstride,tmp + 16*i);
291 + transpose16x16(dst+16*dstride,src + 16,dstride,sstride);
292 + transpose16x16(dst + 16, src + 16*sstride,dstride,sstride);
299 +void transpose8x8(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride)
301 + uint16x8_t a0,a1,a2,a3,a4,a5,a6,a7;
302 + uint16x8_t b0,b1,b2,b3,b4,b5,b6,b7;
304 + a0 = *(uint16x8_t *)(src + 0*sstride);
305 + a1 = *(uint16x8_t *)(src + 1*sstride);
306 + a2 = *(uint16x8_t *)(src + 2*sstride);
307 + a3 = *(uint16x8_t *)(src + 3*sstride);
308 + a4 = *(uint16x8_t *)(src + 4*sstride);
309 + a5 = *(uint16x8_t *)(src + 5*sstride);
310 + a6 = *(uint16x8_t *)(src + 6*sstride);
311 + a7 = *(uint16x8_t *)(src + 7*sstride);
313 + b0 = vtrn1q_u64(a0,a4);
314 + b1 = vtrn1q_u64(a1,a5);
315 + b2 = vtrn1q_u64(a2,a6);
316 + b3 = vtrn1q_u64(a3,a7);
317 + b4 = vtrn2q_u64(a0,a4);
318 + b5 = vtrn2q_u64(a1,a5);
319 + b6 = vtrn2q_u64(a2,a6);
320 + b7 = vtrn2q_u64(a3,a7);
322 + a0 = vtrn1q_u32(b0,b2);
323 + a1 = vtrn1q_u32(b1,b3);
324 + a2 = vtrn2q_u32(b0,b2);
325 + a3 = vtrn2q_u32(b1,b3);
326 + a4 = vtrn1q_u32(b4,b6);
327 + a5 = vtrn1q_u32(b5,b7);
328 + a6 = vtrn2q_u32(b4,b6);
329 + a7 = vtrn2q_u32(b5,b7);
331 + b0 = vtrn1q_u16(a0,a1);
332 + b1 = vtrn2q_u16(a0,a1);
333 + b2 = vtrn1q_u16(a2,a3);
334 + b3 = vtrn2q_u16(a2,a3);
335 + b4 = vtrn1q_u16(a4,a5);
336 + b5 = vtrn2q_u16(a4,a5);
337 + b6 = vtrn1q_u16(a6,a7);
338 + b7 = vtrn2q_u16(a6,a7);
340 + *(uint16x8_t *)(dst + 0*dstride) = b0;
341 + *(uint16x8_t *)(dst + 1*dstride) = b1;
342 + *(uint16x8_t *)(dst + 2*dstride) = b2;
343 + *(uint16x8_t *)(dst + 3*dstride) = b3;
344 + *(uint16x8_t *)(dst + 4*dstride) = b4;
345 + *(uint16x8_t *)(dst + 5*dstride) = b5;
346 + *(uint16x8_t *)(dst + 6*dstride) = b6;
347 + *(uint16x8_t *)(dst + 7*dstride) = b7;
350 +void transpose16x16(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride)
352 + //assumption: there is no partial overlap
353 + transpose8x8(dst,src,dstride,sstride);
354 + transpose8x8(dst+8*dstride+8,src+8*sstride+8,dstride,sstride);
359 + transpose8x8(tmp,src + 8,8,sstride);
360 + transpose8x8(dst + 8, src + 8*sstride,dstride,sstride);
361 + for (int i=0;i<8;i++) COPY_16(dst+(8 + i)*dstride,tmp + 8*i);
365 + transpose8x8(dst+8*dstride,src + 8,dstride,sstride);
366 + transpose8x8(dst + 8, src + 8*sstride,dstride,sstride);
373 +void transpose32x32(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride)
375 + //assumption: there is no partial overlap
376 + for (int i=0;i<4;i++)
378 + transpose8x8(dst+i*8*(1+dstride),src+i*8*(1+sstride),dstride,sstride);
379 + for (int j=i+1;j<4;j++)
383 + uint16_t tmp[8*8] __attribute__((aligned(64)));
384 + transpose8x8(tmp,src + 8*i + 8*j*sstride,8,sstride);
385 + transpose8x8(dst + 8*i + 8*j*dstride, src + 8*j + 8*i*sstride,dstride,sstride);
386 + for (int k=0;k<8;k++) COPY_16(dst+ 8*j + (8*i+k)*dstride,tmp + 8*k);
390 + transpose8x8(dst + 8*(j + i*dstride),src + 8*(i + j*sstride),dstride,sstride);
391 + transpose8x8(dst + 8*(i + j*dstride),src + 8*(j + i*sstride),dstride,sstride);
405 diff -Naur ./source/common/arm64/arm64-utils.h ../x265_apple_patch/source/common/arm64/arm64-utils.h
406 --- ./source/common/arm64/arm64-utils.h 1970-01-01 01:00:00.000000000 +0100
407 +++ ../x265_apple_patch/source/common/arm64/arm64-utils.h 2021-05-08 13:08:01.000000000 +0100
409 +#ifndef __ARM64_UTILS_H__
410 +#define __ARM64_UTILS_H__
414 +void transpose8x8(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride);
415 +void transpose16x16(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride);
416 +void transpose32x32(uint8_t* dst, const uint8_t* src, intptr_t dstride, intptr_t sstride);
417 +void transpose8x8(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride);
418 +void transpose16x16(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride);
419 +void transpose32x32(uint16_t* dst, const uint16_t* src, intptr_t dstride, intptr_t sstride);
423 diff -Naur ./source/common/arm64/asm-primitives.cpp ../x265_apple_patch/source/common/arm64/asm-primitives.cpp
424 --- ./source/common/arm64/asm-primitives.cpp 1970-01-01 01:00:00.000000000 +0100
425 +++ ../x265_apple_patch/source/common/arm64/asm-primitives.cpp 2021-05-08 13:08:01.000000000 +0100
427 +/*****************************************************************************
428 + * Copyright (C) 2013-2017 MulticoreWare, Inc
430 + * Authors: Steve Borho <steve@borho.org>
431 + * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
432 + * Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
433 + * Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
435 + * This program is free software; you can redistribute it and/or modify
436 + * it under the terms of the GNU General Public License as published by
437 + * the Free Software Foundation; either version 2 of the License, or
438 + * (at your option) any later version.
440 + * This program is distributed in the hope that it will be useful,
441 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
442 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
443 + * GNU General Public License for more details.
445 + * You should have received a copy of the GNU General Public License
446 + * along with this program; if not, write to the Free Software
447 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
449 + * This program is also available under a commercial proprietary license.
450 + * For more information, contact us at license @ x265.com.
451 + *****************************************************************************/
454 +#include "primitives.h"
458 +#include "pixel-prim.h"
459 +#include "filter-prim.h"
460 +#include "dct-prim.h"
461 +#include "loopfilter-prim.h"
462 +#include "intrapred-prim.h"
465 +// private x265 namespace
467 +void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
469 + if (cpuMask & X265_CPU_NEON)
471 + setupPixelPrimitives_neon(p);
472 + setupFilterPrimitives_neon(p);
473 + setupDCTPrimitives_neon(p);
474 + setupLoopFilterPrimitives_neon(p);
475 + setupIntraPrimitives_neon(p);
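+        // every primitive installed above is a NEON-intrinsic implementation, so the single X265_CPU_NEON check covers the whole set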
479 +} // namespace X265_NS
480 diff -Naur ./source/common/arm64/dct-prim.cpp ../x265_apple_patch/source/common/arm64/dct-prim.cpp
481 --- ./source/common/arm64/dct-prim.cpp 1970-01-01 01:00:00.000000000 +0100
482 +++ ../x265_apple_patch/source/common/arm64/dct-prim.cpp 2021-05-08 13:08:01.000000000 +0100
484 +#include "dct-prim.h"
489 +#include <arm_neon.h>
493 +using namespace X265_NS;
496 +static int16x8_t rev16(const int16x8_t a)
498 + static const int8x16_t tbl = {14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1};
499 + return vqtbx1q_u8(a,a,tbl);
502 +static int32x4_t rev32(const int32x4_t a)
504 + static const int8x16_t tbl = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
505 + return vqtbx1q_u8(a,a,tbl);
508 +static void transpose_4x4x16(int16x4_t& x0,int16x4_t& x1,int16x4_t& x2,int16x4_t& x3)
510 + int16x4_t s0,s1,s2,s3;
511 + s0 = vtrn1_s32(x0,x2);
512 + s1 = vtrn1_s32(x1,x3);
513 + s2 = vtrn2_s32(x0,x2);
514 + s3 = vtrn2_s32(x1,x3);
516 + x0 = vtrn1_s16(s0,s1);
517 + x1 = vtrn2_s16(s0,s1);
518 + x2 = vtrn1_s16(s2,s3);
519 + x3 = vtrn2_s16(s2,s3);
524 +static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
527 +    // This is an optimized version of scanPosLast that removes the read-modify-write (rmw) dependency; once integrated into mainline x265, it should replace the reference implementation.
528 +    // For clarity, the original reference code is left in the comments below.
529 + int scanPosLast = 0;
531 + uint16_t cSign = 0;
532 + uint16_t cFlag = 0;
535 + uint32_t prevcgIdx = 0;
538 + const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
540 + const uint32_t posLast = scan[scanPosLast];
542 + const int curCoeff = coeff[posLast];
543 + const uint32_t isNZCoeff = (curCoeff != 0);
545 + NOTE: the new algorithm is complicated, so the reference code is kept here
546 + uint32_t posy = posLast >> log2TrSize;
547 + uint32_t posx = posLast - (posy << log2TrSize);
548 + uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
549 + const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
550 + sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
554 + numSig -= isNZCoeff;
556 + if (scanPosLast % (1<<MLS_CG_SIZE) == 0)
558 + coeffSign[prevcgIdx] = cSign;
559 + coeffFlag[prevcgIdx] = cFlag;
560 + coeffNum[prevcgIdx] = cNum;
565 + // TODO: optimize by instruction BTS
566 + cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
567 + cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
568 + cNum += (uint8_t)isNZCoeff;
572 + while (numSig > 0);
574 + coeffSign[prevcgIdx] = cSign;
575 + coeffFlag[prevcgIdx] = cFlag;
576 + coeffNum[prevcgIdx] = cNum;
577 + return scanPosLast - 1;
581 +#if (MLS_CG_SIZE == 4)
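+// nonPsyRdoQuant: each 4x4 coefficient group is squared with a widening multiply, scaled by scaleBits, and accumulated in two 64-bit lanes that are reduced once per call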
582 +template<int log2TrSize>
583 +static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
585 + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
586 + const int scaleBits = SCALE_BITS - 2 * transformShift;
587 + const uint32_t trSize = 1 << log2TrSize;
589 + int64x2_t vcost_sum_0 = vdupq_n_s64(0);
590 + int64x2_t vcost_sum_1 = vdupq_n_s64(0);
591 + for (int y = 0; y < MLS_CG_SIZE; y++)
593 + int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
594 + int32x4_t mul = vmull_s16(in,in);
595 + int64x2_t cost0, cost1;
596 + cost0 = vshll_n_s32(vget_low_s32(mul),scaleBits);
597 + cost1 = vshll_high_n_s32(mul,scaleBits);
598 + *(int64x2_t *)&costUncoded[blkPos+0] = cost0;
599 + *(int64x2_t *)&costUncoded[blkPos+2] = cost1;
600 + vcost_sum_0 = vaddq_s64(vcost_sum_0,cost0);
601 + vcost_sum_1 = vaddq_s64(vcost_sum_1,cost1);
604 + int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0,vcost_sum_1));
605 + *totalUncodedCost += sum;
606 + *totalRdCost += sum;
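+// psyRdoQuant: same uncoded cost as above minus a psy term; predictedCoef = m_fencDctCoeff - m_resiDctCoeff is weighted by *psyScale and shifted right via vshlq_s64 with a negative shift count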
609 +template<int log2TrSize>
610 +static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
612 + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613 + const int scaleBits = SCALE_BITS - 2 * transformShift;
614 + const uint32_t trSize = 1 << log2TrSize;
615 +    // the X265_MAX preprocessor macro is used here to work around a clang bug
616 + const int max = X265_MAX(0, (2 * transformShift + 1));
618 + int64x2_t vcost_sum_0 = vdupq_n_s64(0);
619 + int64x2_t vcost_sum_1 = vdupq_n_s64(0);
620 + int32x4_t vpsy = vdupq_n_s32(*psyScale);
621 + for (int y = 0; y < MLS_CG_SIZE; y++)
623 + int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
624 + int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]),signCoef);
625 + int64x2_t cost0, cost1;
626 + cost0 = vmull_s32(vget_low_s32(signCoef),vget_low_s32(signCoef));
627 + cost1 = vmull_high_s32(signCoef,signCoef);
628 + cost0 = vshlq_n_s64(cost0,scaleBits);
629 + cost1 = vshlq_n_s64(cost1,scaleBits);
630 + int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef),vget_low_s32(vpsy));
631 + int64x2_t neg1 = vmull_high_s32(predictedCoef,vpsy);
633 + int64x2_t shift = vdupq_n_s64(-max);
634 + neg0 = vshlq_s64(neg0,shift);
635 + neg1 = vshlq_s64(neg1,shift);
637 + cost0 = vsubq_s64(cost0,neg0);
638 + cost1 = vsubq_s64(cost1,neg1);
639 + *(int64x2_t *)&costUncoded[blkPos+0] = cost0;
640 + *(int64x2_t *)&costUncoded[blkPos+2] = cost1;
641 + vcost_sum_0 = vaddq_s64(vcost_sum_0,cost0);
642 + vcost_sum_1 = vaddq_s64(vcost_sum_1,cost1);
646 + int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0,vcost_sum_1));
647 + *totalUncodedCost += sum;
648 + *totalRdCost += sum;
652 + #error "MLS_CG_SIZE must be 4 for neon version"
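+// count_nonzero: vtstq_s16 sets every nonzero lane to all-ones (-1), so subtracting the horizontal sum of the accumulator adds the vector-lane count; the scalar loop handles any remainder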
657 +template<int trSize>
658 +int count_nonzero_neon(const int16_t* quantCoeff)
660 + X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
662 + int16x8_t vcount = vdupq_n_s16(0);
663 + const int numCoeff = trSize * trSize;
665 + for (; (i + 8) <= numCoeff; i+=8)
667 + int16x8_t in = *(int16x8_t*)&quantCoeff[i];
668 + vcount = vaddq_s16(vcount,vtstq_s16(in,in));
670 + for (; i < numCoeff; i++)
672 + count += quantCoeff[i] != 0;
675 + return count - vaddvq_s16(vcount);
678 +template<int trSize>
679 +uint32_t copy_count_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
681 + uint32_t numSig = 0;
682 + int16x8_t vcount = vdupq_n_s16(0);
683 + for (int k = 0; k < trSize; k++)
686 + for (; (j + 8) <= trSize; j+=8)
688 + int16x8_t in = *(int16x8_t*)&residual[j];
689 + *(int16x8_t*)&coeff[j] = in;
690 + vcount = vaddq_s16(vcount,vtstq_s16(in,in));
692 + for (; j < trSize; j++)
694 + coeff[j] = residual[j];
695 + numSig += (residual[j] != 0);
697 + residual += resiStride;
701 + return numSig - vaddvq_s16(vcount);
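+// partialButterfly16: forward 16-point DCT stage. Odd outputs use vector multiply-accumulates against rows of g_t16; the even half is folded into EE/EO and the last few outputs stay scalar, as in the C reference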
705 +static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
708 + int32x4_t E[2], O[2];
710 + int32x2_t EEE, EEO;
711 + const int add = 1 << (shift - 1);
712 + const int32x4_t _vadd = {add,0};
714 + for (j = 0; j < line; j++)
716 + int16x8_t in0 = *(int16x8_t *)src;
717 + int16x8_t in1 = rev16(*(int16x8_t *)&src[8]);
719 + E[0] = vaddl_s16(vget_low_s16(in0),vget_low_s16(in1));
720 + O[0] = vsubl_s16(vget_low_s16(in0),vget_low_s16(in1));
721 + E[1] = vaddl_high_s16(in0,in1);
722 + O[1] = vsubl_high_s16(in0,in1);
724 + for (k = 1; k < 16; k += 2)
726 + int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16[k][0]);
727 + int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16[k][4]);
729 + int32x4_t res = _vadd;
730 + res = vmlaq_s32(res,c0,O[0]);
731 + res = vmlaq_s32(res,c1,O[1]);
732 + dst[k * line] = (int16_t)(vaddvq_s32(res) >> shift);
736 + EE = vaddq_s32(E[0],rev32(E[1]));
737 + EO = vsubq_s32(E[0],rev32(E[1]));
739 + for (k = 2; k < 16; k += 4)
741 + int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16[k][0]);
742 + int32x4_t res = _vadd;
743 + res = vmlaq_s32(res,c0,EO);
744 + dst[k * line] = (int16_t)(vaddvq_s32(res) >> shift);
748 + EEE[0] = EE[0] + EE[3];
749 + EEO[0] = EE[0] - EE[3];
750 + EEE[1] = EE[1] + EE[2];
751 + EEO[1] = EE[1] - EE[2];
753 + dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
754 + dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
755 + dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
756 + dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
765 +static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
768 + const int add = 1 << (shift - 1);
771 + for (j = 0; j < line; j++)
773 + int32x4_t VE[4], VO0,VO1,VO2,VO3;
774 + int32x4_t VEE[2], VEO[2];
775 + int32x4_t VEEE, VEEO;
776 + int EEEE[2], EEEO[2];
778 + int16x8x4_t inputs;
779 + inputs = *(int16x8x4_t *)&src[0];
780 + int16x8x4_t in_rev;
782 + in_rev.val[1] = rev16(inputs.val[2]);
783 + in_rev.val[0] = rev16(inputs.val[3]);
785 + VE[0] = vaddl_s16(vget_low_s16(inputs.val[0]),vget_low_s16(in_rev.val[0]));
786 + VE[1] = vaddl_high_s16(inputs.val[0],in_rev.val[0]);
787 + VO0 = vsubl_s16(vget_low_s16(inputs.val[0]),vget_low_s16(in_rev.val[0]));
788 + VO1 = vsubl_high_s16(inputs.val[0],in_rev.val[0]);
789 + VE[2] = vaddl_s16(vget_low_s16(inputs.val[1]),vget_low_s16(in_rev.val[1]));
790 + VE[3] = vaddl_high_s16(inputs.val[1],in_rev.val[1]);
791 + VO2 = vsubl_s16(vget_low_s16(inputs.val[1]),vget_low_s16(in_rev.val[1]));
792 + VO3 = vsubl_high_s16(inputs.val[1],in_rev.val[1]);
794 + for (k = 1; k < 32; k += 2)
796 + int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
797 + int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32[k][4]);
798 + int32x4_t c2 = vmovl_s16(*(int16x4_t *)&g_t32[k][8]);
799 + int32x4_t c3 = vmovl_s16(*(int16x4_t *)&g_t32[k][12]);
800 + int32x4_t s = vmulq_s32(c0,VO0);
801 + s = vmlaq_s32(s,c1,VO1);
802 + s = vmlaq_s32(s,c2,VO2);
803 + s = vmlaq_s32(s,c3,VO3);
805 + dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
809 + int32x4_t rev_VE[2];
812 + rev_VE[0] = rev32(VE[3]);
813 + rev_VE[1] = rev32(VE[2]);
816 + for (k = 0; k < 2; k++)
818 + VEE[k] = vaddq_s32(VE[k],rev_VE[k]);
819 + VEO[k] = vsubq_s32(VE[k],rev_VE[k]);
821 + for (k = 2; k < 32; k += 4)
823 + int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
824 + int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32[k][4]);
825 + int32x4_t s = vmulq_s32(c0,VEO[0]);
826 + s = vmlaq_s32(s,c1,VEO[1]);
828 + dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
832 + int32x4_t tmp = rev32(VEE[1]);
833 + VEEE = vaddq_s32(VEE[0],tmp);
834 + VEEO = vsubq_s32(VEE[0],tmp);
835 + for (k = 4; k < 32; k += 8)
837 + int32x4_t c = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
838 + int32x4_t s = vmulq_s32(c,VEEO);
840 + dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
843 + /* EEEE and EEEO */
844 + EEEE[0] = VEEE[0] + VEEE[3];
845 + EEEO[0] = VEEE[0] - VEEE[3];
846 + EEEE[1] = VEEE[1] + VEEE[2];
847 + EEEO[1] = VEEE[1] - VEEE[2];
849 + dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
850 + dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
851 + dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
852 + dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
861 +static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
866 + int add = 1 << (shift - 1);
868 + for (j = 0; j < line; j++)
871 + for (k = 0; k < 4; k++)
873 + E[k] = src[k] + src[7 - k];
874 + O[k] = src[k] - src[7 - k];
878 + EE[0] = E[0] + E[3];
879 + EO[0] = E[0] - E[3];
880 + EE[1] = E[1] + E[2];
881 + EO[1] = E[1] - E[2];
883 + dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
884 + dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
885 + dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
886 + dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
888 + dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
889 + dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
890 + dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
891 + dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);
898 +static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
902 + int add = 1 << (shift - 1);
904 + for (j = 0; j < line; j++)
906 + /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
907 + O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
908 + O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
909 + E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
910 + E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
912 + /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
913 + dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
914 + dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
915 + dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
916 + dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
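+// partialButterflyInverse16_neon works on four output lines per iteration; the MULK/FMAK lane-multiply macros below build the odd-term sums, and results are clamped to [-32768, 32767], narrowed, and stored through transpose_4x4x16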
925 +static void partialButterflyInverse16_neon(const int16_t* src, int16_t* orig_dst, int shift, int line)
927 +#define FMAK(x,l) s[l] = vmlal_lane_s16(s[l],*(int16x4_t*)&src[(x)*line],*(int16x4_t *)&g_t16[x][k],l)
928 +#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&src[x*line],*(int16x4_t *)&g_t16[x][k],l);
929 +#define ODD3_15(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);
930 +#define EVEN6_14_STEP4(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);
934 + int32x4_t E[8], O[8];
935 + int32x4_t EE[4], EO[4];
936 + int32x4_t EEE[2], EEO[2];
937 + const int add = 1 << (shift - 1);
941 + for (j = 0; j < line; j+=4)
943 + /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
946 + for (k=0;k<2;k++) {
948 + s = vmull_s16(vdup_n_s16(g_t16[4][k]),*(int16x4_t*)&src[4*line]);
949 + EEO[k] = vmlal_s16(s,vdup_n_s16(g_t16[12][k]),*(int16x4_t*)&src[(12)*line]);
950 + s = vmull_s16(vdup_n_s16(g_t16[0][k]),*(int16x4_t*)&src[0*line]);
951 + EEE[k] = vmlal_s16(s,vdup_n_s16(g_t16[8][k]),*(int16x4_t*)&src[(8)*line]);
954 + /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
955 + EE[0] = vaddq_s32(EEE[0] , EEO[0]);
956 + EE[2] = vsubq_s32(EEE[1] , EEO[1]);
957 + EE[1] = vaddq_s32(EEE[1] , EEO[1]);
958 + EE[3] = vsubq_s32(EEE[0] , EEO[0]);
962 + for (k = 0; k < 4; k+=4)
983 + static const int32x4_t min = vdupq_n_s32(-32768);
984 + static const int32x4_t max = vdupq_n_s32(32767);
985 + const int32x4_t minus_shift = vdupq_n_s32(-shift);
988 + for (k = 0; k < 4; k++)
990 + E[k] = vaddq_s32(EE[k] , EO[k]);
991 + E[k + 4] = vsubq_s32(EE[3 - k] , EO[3 - k]);
995 + for (k = 0; k < 8; k+=4)
1011 + int16x4_t x0,x1,x2,x3;
1013 + E[k] = vaddq_s32(vdupq_n_s32(add),E[k]);
1014 + t = vaddq_s32(E[k],O[k]);
1015 + t = vshlq_s32(t,minus_shift);
1016 + t = vmaxq_s32(t,min);
1017 + t = vminq_s32(t,max);
1018 + x0 = vmovn_s32(t);
1020 + E[k+1] = vaddq_s32(vdupq_n_s32(add),E[k+1]);
1021 + t = vaddq_s32(E[k+1],O[k+1]);
1022 + t = vshlq_s32(t,minus_shift);
1023 + t = vmaxq_s32(t,min);
1024 + t = vminq_s32(t,max);
1025 + x1 = vmovn_s32(t);
1027 + E[k+2] = vaddq_s32(vdupq_n_s32(add),E[k+2]);
1028 + t = vaddq_s32(E[k+2],O[k+2]);
1029 + t = vshlq_s32(t,minus_shift);
1030 + t = vmaxq_s32(t,min);
1031 + t = vminq_s32(t,max);
1032 + x2 = vmovn_s32(t);
1034 + E[k+3] = vaddq_s32(vdupq_n_s32(add),E[k+3]);
1035 + t = vaddq_s32(E[k+3],O[k+3]);
1036 + t = vshlq_s32(t,minus_shift);
1037 + t = vmaxq_s32(t,min);
1038 + t = vminq_s32(t,max);
1039 + x3 = vmovn_s32(t);
1041 + transpose_4x4x16(x0,x1,x2,x3);
1042 + *(int16x4_t*)&orig_dst[0*16+k] = x0;
1043 + *(int16x4_t*)&orig_dst[1*16+k] = x1;
1044 + *(int16x4_t*)&orig_dst[2*16+k] = x2;
1045 + *(int16x4_t*)&orig_dst[3*16+k] = x3;
1050 + for (k = 0; k < 8; k+=4)
1053 + int16x4_t x0,x1,x2,x3;
1055 + t = vsubq_s32(E[7-k],O[7-k]);
1056 + t = vshlq_s32(t,minus_shift);
1057 + t = vmaxq_s32(t,min);
1058 + t = vminq_s32(t,max);
1059 + x0 = vmovn_s32(t);
1061 + t = vsubq_s32(E[6-k],O[6-k]);
1062 + t = vshlq_s32(t,minus_shift);
1063 + t = vmaxq_s32(t,min);
1064 + t = vminq_s32(t,max);
1065 + x1 = vmovn_s32(t);
1067 + t = vsubq_s32(E[5-k],O[5-k]);
1069 + t = vshlq_s32(t,minus_shift);
1070 + t = vmaxq_s32(t,min);
1071 + t = vminq_s32(t,max);
1072 + x2 = vmovn_s32(t);
1074 + t = vsubq_s32(E[4-k],O[4-k]);
1075 + t = vshlq_s32(t,minus_shift);
1076 + t = vmaxq_s32(t,min);
1077 + t = vminq_s32(t,max);
1078 + x3 = vmovn_s32(t);
1080 + transpose_4x4x16(x0,x1,x2,x3);
1081 + *(int16x4_t*)&orig_dst[0*16+k+8] = x0;
1082 + *(int16x4_t*)&orig_dst[1*16+k+8] = x1;
1083 + *(int16x4_t*)&orig_dst[2*16+k+8] = x2;
1084 + *(int16x4_t*)&orig_dst[3*16+k+8] = x3;
1095 +#undef EVEN6_14_STEP4
1102 +static void partialButterflyInverse32_neon(const int16_t* src, int16_t* orig_dst, int shift, int line)
1104 +#define MUL(x) vmull_s16(vdup_n_s16(g_t32[x][k]),*(int16x4_t*)&src[x*line]);
1105 +#define FMA(x) s = vmlal_s16(s,vdup_n_s16(g_t32[x][k]),*(int16x4_t*)&src[(x)*line])
1106 +#define FMAK(x,l) s[l] = vmlal_lane_s16(s[l],*(int16x4_t*)&src[(x)*line],*(int16x4_t *)&g_t32[x][k],l)
1107 +#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&src[x*line],*(int16x4_t *)&g_t32[x][k],l);
1108 +#define ODD31(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);FMAK(17,k);FMAK(19,k);FMAK(21,k);FMAK(23,k);FMAK(25,k);FMAK(27,k);FMAK(29,k);FMAK(31,k);
1110 +#define ODD15(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);FMAK(18,k);FMAK(22,k);FMAK(26,k);FMAK(30,k);
1111 +#define ODD7(k) FMAK(12,k);FMAK(20,k);FMAK(28,k);
1115 + int32x4_t E[16], O[16];
1116 + int32x4_t EE[8], EO[8];
1117 + int32x4_t EEE[4], EEO[4];
1118 + int32x4_t EEEE[2], EEEO[2];
1119 + int16x4_t dst[32];
1120 + int add = 1 << (shift - 1);
1123 + for (j = 0; j < line; j+=4)
1126 + for (k = 0; k < 16; k+=4)
1147 + for (k = 0; k < 8; k+=4)
1167 + for (k = 0; k < 4; k+=4)
1187 + for (k=0;k<2;k++) {
1190 + EEEO[k] = FMA(24);
1192 + EEEE[k] = FMA(16);
1194 + /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
1195 + EEE[0] = vaddq_s32(EEEE[0],EEEO[0]);
1196 + EEE[3] = vsubq_s32(EEEE[0],EEEO[0]);
1197 + EEE[1] = vaddq_s32(EEEE[1],EEEO[1]);
1198 + EEE[2] = vsubq_s32(EEEE[1],EEEO[1]);
1201 + for (k = 0; k < 4; k++)
1203 + EE[k] = vaddq_s32(EEE[k],EEO[k]);
1204 + EE[k + 4] = vsubq_s32((EEE[3 - k]), (EEO[3 - k]));
1208 + for (k = 0; k < 8; k++)
1210 + E[k] = vaddq_s32(EE[k],EO[k]);
1211 + E[k + 8] = vsubq_s32((EE[7 - k]),(EO[7 - k]));
1214 + static const int32x4_t min = vdupq_n_s32(-32768);
1215 + static const int32x4_t max = vdupq_n_s32(32767);
1219 +#pragma unroll (16)
1220 + for (k = 0; k < 16; k++)
1222 + int32x4_t adde = vaddq_s32(vdupq_n_s32(add),E[k]);
1223 + int32x4_t s = vaddq_s32(adde,O[k]);
1224 + s = vshlq_s32(s,vdupq_n_s32(-shift));
1225 + s = vmaxq_s32(s,min);
1226 + s = vminq_s32(s,max);
1230 + dst[k] = vmovn_s32(s);
1231 + adde = vaddq_s32(vdupq_n_s32(add),(E[15-k]));
1232 + s =vsubq_s32(adde,(O[15-k]));
1233 + s = vshlq_s32(s,vdupq_n_s32(-shift));
1234 + s = vmaxq_s32(s,min);
1235 + s = vminq_s32(s,max);
1237 + dst[k+16] = vmovn_s32(s);
1242 + for (k = 0; k < 32; k+=4)
1244 + int16x4_t x0 = dst[k+0];
1245 + int16x4_t x1 = dst[k+1];
1246 + int16x4_t x2 = dst[k+2];
1247 + int16x4_t x3 = dst[k+3];
1248 + transpose_4x4x16(x0,x1,x2,x3);
1249 + *(int16x4_t*)&orig_dst[0*32+k] = x0;
1250 + *(int16x4_t*)&orig_dst[1*32+k] = x1;
1251 + *(int16x4_t*)&orig_dst[2*32+k] = x2;
1252 + *(int16x4_t*)&orig_dst[3*32+k] = x3;
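+// The dct/idct wrappers below mirror the reference C flow: the data is staged through aligned local buffers and the two butterfly passes (rows, then columns) do all the vector work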
1268 +static void dct8_neon(const int16_t* src, int16_t* dst, intptr_t srcStride)
1270 + const int shift_1st = 2 + X265_DEPTH - 8;
1271 + const int shift_2nd = 9;
1273 + ALIGN_VAR_32(int16_t, coef[8 * 8]);
1274 + ALIGN_VAR_32(int16_t, block[8 * 8]);
1276 + for (int i = 0; i < 8; i++)
1278 + memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
1281 + partialButterfly8(block, coef, shift_1st, 8);
1282 + partialButterfly8(coef, dst, shift_2nd, 8);
1285 +static void dct16_neon(const int16_t* src, int16_t* dst, intptr_t srcStride)
1287 + const int shift_1st = 3 + X265_DEPTH - 8;
1288 + const int shift_2nd = 10;
1290 + ALIGN_VAR_32(int16_t, coef[16 * 16]);
1291 + ALIGN_VAR_32(int16_t, block[16 * 16]);
1293 + for (int i = 0; i < 16; i++)
1295 + memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
1298 + partialButterfly16(block, coef, shift_1st, 16);
1299 + partialButterfly16(coef, dst, shift_2nd, 16);
1302 +static void dct32_neon(const int16_t* src, int16_t* dst, intptr_t srcStride)
1304 + const int shift_1st = 4 + X265_DEPTH - 8;
1305 + const int shift_2nd = 11;
1307 + ALIGN_VAR_32(int16_t, coef[32 * 32]);
1308 + ALIGN_VAR_32(int16_t, block[32 * 32]);
1310 + for (int i = 0; i < 32; i++)
1312 + memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
1315 + partialButterfly32(block, coef, shift_1st, 32);
1316 + partialButterfly32(coef, dst, shift_2nd, 32);
1319 +static void idct4_neon(const int16_t* src, int16_t* dst, intptr_t dstStride)
1321 + const int shift_1st = 7;
1322 + const int shift_2nd = 12 - (X265_DEPTH - 8);
1324 + ALIGN_VAR_32(int16_t, coef[4 * 4]);
1325 + ALIGN_VAR_32(int16_t, block[4 * 4]);
1327 + partialButterflyInverse4(src, coef, shift_1st, 4); // inverse transform by fast butterfly algorithm, src input, coef output
1328 + partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse transform by fast butterfly algorithm, coef input, block output
1330 + for (int i = 0; i < 4; i++)
1332 + memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
1336 +static void idct16_neon(const int16_t* src, int16_t* dst, intptr_t dstStride)
1338 + const int shift_1st = 7;
1339 + const int shift_2nd = 12 - (X265_DEPTH - 8);
1341 + ALIGN_VAR_32(int16_t, coef[16 * 16]);
1342 + ALIGN_VAR_32(int16_t, block[16 * 16]);
1344 + partialButterflyInverse16_neon(src, coef, shift_1st, 16);
1345 + partialButterflyInverse16_neon(coef, block, shift_2nd, 16);
1347 + for (int i = 0; i < 16; i++)
1349 + memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
1353 +static void idct32_neon(const int16_t* src, int16_t* dst, intptr_t dstStride)
1355 + const int shift_1st = 7;
1356 + const int shift_2nd = 12 - (X265_DEPTH - 8);
1358 + ALIGN_VAR_32(int16_t, coef[32 * 32]);
1359 + ALIGN_VAR_32(int16_t, block[32 * 32]);
1361 + partialButterflyInverse32_neon(src, coef, shift_1st, 32);
1362 + partialButterflyInverse32_neon(coef, block, shift_2nd, 32);
1364 + for (int i = 0; i < 32; i++)
1366 + memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
1374 +namespace X265_NS {
1375 +// x265 private namespace
1376 +void setupDCTPrimitives_neon(EncoderPrimitives& p) {
1377 + p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_neon<2>;
1378 + p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_neon<3>;
1379 + p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_neon<4>;
1380 + p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_neon<5>;
1381 + p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_neon<2>;
1382 + p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_neon<3>;
1383 + p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;
1384 + p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;
1385 + p.cu[BLOCK_8x8].dct = dct8_neon;
1386 + p.cu[BLOCK_16x16].dct = dct16_neon;
1387 + p.cu[BLOCK_32x32].dct = dct32_neon;
1388 + p.cu[BLOCK_4x4].idct = idct4_neon;
1389 + p.cu[BLOCK_16x16].idct = idct16_neon;
1390 + p.cu[BLOCK_32x32].idct = idct32_neon;
1391 + p.cu[BLOCK_4x4].count_nonzero = count_nonzero_neon<4>;
1392 + p.cu[BLOCK_8x8].count_nonzero = count_nonzero_neon<8>;
1393 + p.cu[BLOCK_16x16].count_nonzero = count_nonzero_neon<16>;
1394 + p.cu[BLOCK_32x32].count_nonzero = count_nonzero_neon<32>;
1396 + p.cu[BLOCK_4x4].copy_cnt = copy_count_neon<4>;
1397 + p.cu[BLOCK_8x8].copy_cnt = copy_count_neon<8>;
1398 + p.cu[BLOCK_16x16].copy_cnt = copy_count_neon<16>;
1399 + p.cu[BLOCK_32x32].copy_cnt = copy_count_neon<32>;
1400 + p.cu[BLOCK_4x4].psyRdoQuant_1p = nonPsyRdoQuant_neon<2>;
1401 + p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_neon<2>;
1402 + p.cu[BLOCK_8x8].psyRdoQuant_1p = nonPsyRdoQuant_neon<3>;
1403 + p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_neon<3>;
1404 + p.cu[BLOCK_16x16].psyRdoQuant_1p = nonPsyRdoQuant_neon<4>;
1405 + p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_neon<4>;
1406 + p.cu[BLOCK_32x32].psyRdoQuant_1p = nonPsyRdoQuant_neon<5>;
1407 + p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
1409 + p.scanPosLast = scanPosLast_opt;
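+    // block sizes not assigned above keep their existing primitives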
1417 diff -Naur ./source/common/arm64/dct-prim.h ../x265_apple_patch/source/common/arm64/dct-prim.h
1418 --- ./source/common/arm64/dct-prim.h 1970-01-01 01:00:00.000000000 +0100
1419 +++ ../x265_apple_patch/source/common/arm64/dct-prim.h 2021-05-08 13:08:01.000000000 +0100
1421 +#ifndef __DCT_PRIM_NEON_H__
1422 +#define __DCT_PRIM_NEON_H__
1425 +#include "common.h"
1426 +#include "primitives.h"
1427 +#include "contexts.h" // costCoeffNxN_c
1428 +#include "threading.h" // CLZ
1430 +namespace X265_NS {
1431 +// x265 private namespace
1432 +void setupDCTPrimitives_neon(EncoderPrimitives& p);
1439 diff -Naur ./source/common/arm64/filter-prim.cpp ../x265_apple_patch/source/common/arm64/filter-prim.cpp
1440 --- ./source/common/arm64/filter-prim.cpp 1970-01-01 01:00:00.000000000 +0100
1441 +++ ../x265_apple_patch/source/common/arm64/filter-prim.cpp 2021-05-08 13:08:01.000000000 +0100
1446 +#include "filter-prim.h"
1447 +#include <arm_neon.h>
1451 +using namespace X265_NS;
1454 +template<int width, int height>
1455 +void filterPixelToShort_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
1457 + const int shift = IF_INTERNAL_PREC - X265_DEPTH;
1459 + const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
1460 + for (row = 0; row < height; row++)
1463 + for (col = 0; col < width; col+=8)
1468 + in = *(int16x8_t *)&src[col];
1470 + in = vmovl_u8(*(uint8x8_t *)&src[col]);
1473 + int16x8_t tmp = vshlq_n_s16(in,shift);
1474 + tmp = vsubq_s16(tmp,off);
1475 + *(int16x8_t *)&dst[col] = tmp;
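+// The interpolation kernels below share one pattern: gather the N filter taps, widen as needed, multiply-accumulate with per-lane coefficients (vmlal_lane / vmlaq_laneq), then shift by the negated precision, clamp, and narrow to the destination type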
1485 +template<int N, int width, int height>
1486 +void interp_horiz_pp_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
1488 + const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1489 + int headRoom = IF_FILTER_PREC;
1490 + int offset = (1 << (headRoom - 1));
1491 + uint16_t maxVal = (1 << X265_DEPTH) - 1;
1494 + src -= (N / 2 - 1) * cStride;
1496 + vc = *(int16x8_t *)coeff;
1497 + int16x4_t low_vc = vget_low_s16(vc);
1498 + int16x4_t high_vc = vget_high_s16(vc);
1500 + const int32x4_t voffset = vdupq_n_s32(offset);
1501 + const int32x4_t vhr = vdupq_n_s32(-headRoom);
1504 + for (row = 0; row < height; row++)
1506 + for (col = 0; col < width; col+=8)
1508 + int32x4_t vsum1,vsum2;
1510 + int16x8_t input[N];
1512 + for (int i=0;i<N;i++)
1515 + input[i] = *(int16x8_t *)&src[col+i];
1517 + input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i]);
1523 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[0]),low_vc,0);
1524 + vsum2 = vmlal_high_lane_s16(vsum2,input[0],low_vc,0);
1526 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[1]),low_vc,1);
1527 + vsum2 = vmlal_high_lane_s16(vsum2,input[1],low_vc,1);
1529 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[2]),low_vc,2);
1530 + vsum2 = vmlal_high_lane_s16(vsum2,input[2],low_vc,2);
1532 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[3]),low_vc,3);
1533 + vsum2 = vmlal_high_lane_s16(vsum2,input[3],low_vc,3);
1537 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[4]),high_vc,0);
1538 + vsum2 = vmlal_high_lane_s16(vsum2,input[4],high_vc,0);
1539 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[5]),high_vc,1);
1540 + vsum2 = vmlal_high_lane_s16(vsum2,input[5],high_vc,1);
1541 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[6]),high_vc,2);
1542 + vsum2 = vmlal_high_lane_s16(vsum2,input[6],high_vc,2);
1543 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[7]),high_vc,3);
1544 + vsum2 = vmlal_high_lane_s16(vsum2,input[7],high_vc,3);
1548 + vsum1 = vshlq_s32(vsum1, vhr);
1549 + vsum2 = vshlq_s32(vsum2, vhr);
1551 + int16x8_t vsum = vuzp1q_s16(vsum1,vsum2);
1552 + vsum = vminq_s16(vsum,vdupq_n_s16(maxVal));
1553 + vsum = vmaxq_s16(vsum,vdupq_n_s16(0));
1555 + *(int16x8_t *)&dst[col] = vsum;
1557 + uint8x16_t usum = vuzp1q_u8(vsum,vsum);
1558 + *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
1570 +template<int N, int width, int height>
1571 +void interp_horiz_ps_neon(const uint16_t * src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1573 + const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1574 + const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1575 + const int shift = IF_FILTER_PREC - headRoom;
1576 + const int offset = (unsigned)-IF_INTERNAL_OFFS << shift;
1578 + int blkheight = height;
1583 + src -= (N / 2 - 1) * srcStride;
1584 + blkheight += N - 1;
1586 + int32x4_t vc0 = vmovl_s16(*(int16x4_t *)coeff);
1590 + vc1 = vmovl_s16(*(int16x4_t *)(coeff + 4));
1593 + const int32x4_t voffset = vdupq_n_s32(offset);
1594 + const int32x4_t vhr = vdupq_n_s32(-shift);
1597 + for (row = 0; row < blkheight; row++)
1599 + for (col = 0; col < width; col+=4)
1603 + int32x4_t input[N];
1605 + for (int i=0;i<N;i++)
1607 + input[i] = vmovl_s16(*(int16x4_t *)&src[col+i]);
1610 + vsum = vmlaq_laneq_s32(vsum,(input[0]),vc0,0);
1611 + vsum = vmlaq_laneq_s32(vsum,(input[1]),vc0,1);
1612 + vsum = vmlaq_laneq_s32(vsum,(input[2]),vc0,2);
1613 + vsum = vmlaq_laneq_s32(vsum,(input[3]),vc0,3);
1618 + vsum = vmlaq_laneq_s32(vsum,(input[4]),vc1,0);
1619 + vsum = vmlaq_laneq_s32(vsum,(input[5]),vc1,1);
1620 + vsum = vmlaq_laneq_s32(vsum,(input[6]),vc1,2);
1621 + vsum = vmlaq_laneq_s32(vsum,(input[7]),vc1,3);
1625 + vsum = vshlq_s32(vsum, vhr);
1626 + *(int16x4_t *)&dst[col] = vmovn_u32(vsum);
1637 +template<int N, int width, int height>
1638 +void interp_horiz_ps_neon(const uint8_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1640 + const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1641 + const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1642 + const int shift = IF_FILTER_PREC - headRoom;
1643 + const int offset = (unsigned)-IF_INTERNAL_OFFS << shift;
1645 + int blkheight = height;
1650 + src -= (N / 2 - 1) * srcStride;
1651 + blkheight += N - 1;
1654 + vc = *(int16x8_t *)coeff;
1656 + const int16x8_t voffset = vdupq_n_s16(offset);
1657 + const int16x8_t vhr = vdupq_n_s16(-shift);
1660 + for (row = 0; row < blkheight; row++)
1662 + for (col = 0; col < width; col+=8)
1666 + int16x8_t input[N];
1668 + for (int i=0;i<N;i++)
1670 + input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i]);
1673 + vsum = vmlaq_laneq_s16(vsum,(input[0]),vc,0);
1674 + vsum = vmlaq_laneq_s16(vsum,(input[1]),vc,1);
1675 + vsum = vmlaq_laneq_s16(vsum,(input[2]),vc,2);
1676 + vsum = vmlaq_laneq_s16(vsum,(input[3]),vc,3);
1681 + vsum = vmlaq_laneq_s16(vsum,(input[4]),vc,4);
1682 + vsum = vmlaq_laneq_s16(vsum,(input[5]),vc,5);
1683 + vsum = vmlaq_laneq_s16(vsum,(input[6]),vc,6);
1684 + vsum = vmlaq_laneq_s16(vsum,(input[7]),vc,7);
1688 + vsum = vshlq_s16(vsum, vhr);
1689 + *(int16x8_t *)&dst[col] = vsum;
1700 +template<int N, int width, int height>
1701 +void interp_vert_ss_neon(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1703 + const int16_t* c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
1704 + int shift = IF_FILTER_PREC;
1705 + src -= (N / 2 - 1) * srcStride;
1707 + vc = *(int16x8_t *)c;
1708 + int16x4_t low_vc = vget_low_s16(vc);
1709 + int16x4_t high_vc = vget_high_s16(vc);
1711 + const int32x4_t vhr = vdupq_n_s32(-shift);
1714 + for (row = 0; row < height; row++)
1716 + for (col = 0; col < width; col+=8)
1718 + int32x4_t vsum1,vsum2;
1720 + int16x8_t input[N];
1722 + for (int i=0;i<N;i++)
1724 + input[i] = *(int16x8_t *)&src[col+i*srcStride];
1727 + vsum1 = vmull_lane_s16(vget_low_s16(input[0]),low_vc,0);
1728 + vsum2 = vmull_high_lane_s16(input[0],low_vc,0);
1730 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[1]),low_vc,1);
1731 + vsum2 = vmlal_high_lane_s16(vsum2,input[1],low_vc,1);
1733 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[2]),low_vc,2);
1734 + vsum2 = vmlal_high_lane_s16(vsum2,input[2],low_vc,2);
1736 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[3]),low_vc,3);
1737 + vsum2 = vmlal_high_lane_s16(vsum2,input[3],low_vc,3);
1741 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[4]),high_vc,0);
1742 + vsum2 = vmlal_high_lane_s16(vsum2,input[4],high_vc,0);
1743 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[5]),high_vc,1);
1744 + vsum2 = vmlal_high_lane_s16(vsum2,input[5],high_vc,1);
1745 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[6]),high_vc,2);
1746 + vsum2 = vmlal_high_lane_s16(vsum2,input[6],high_vc,2);
1747 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[7]),high_vc,3);
1748 + vsum2 = vmlal_high_lane_s16(vsum2,input[7],high_vc,3);
1752 + vsum1 = vshlq_s32(vsum1, vhr);
1753 + vsum2 = vshlq_s32(vsum2, vhr);
1755 + int16x8_t vsum = vuzp1q_s16(vsum1,vsum2);
1756 + *(int16x8_t *)&dst[col] = vsum;
1768 +template<int N, int width, int height>
1769 +void interp_vert_pp_neon(const uint16_t* src, intptr_t srcStride, uint16_t* dst, intptr_t dstStride, int coeffIdx)
1772 + const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1773 + int shift = IF_FILTER_PREC;
1774 + int offset = 1 << (shift - 1);
1775 + const uint16_t maxVal = (1 << X265_DEPTH) - 1;
1777 + src -= (N / 2 - 1) * srcStride;
1779 + vc = *(int16x8_t *)c;
1780 + int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
1781 + int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
1783 + const int32x4_t voffset = vdupq_n_s32(offset);
1784 + const int32x4_t vhr = vdupq_n_s32(-shift);
1787 + for (row = 0; row < height; row++)
1789 + for (col = 0; col < width; col+=4)
1793 + int32x4_t input[N];
1795 + for (int i=0;i<N;i++)
1797 + input[i] = vmovl_u16(*(uint16x4_t *)&src[col+i*srcStride]);
1801 + vsum = vmlaq_laneq_s32(vsum,(input[0]),low_vc,0);
1802 + vsum = vmlaq_laneq_s32(vsum,(input[1]),low_vc,1);
1803 + vsum = vmlaq_laneq_s32(vsum,(input[2]),low_vc,2);
1804 + vsum = vmlaq_laneq_s32(vsum,(input[3]),low_vc,3);
1808 + vsum = vmlaq_laneq_s32(vsum,(input[4]),high_vc,0);
1809 + vsum = vmlaq_laneq_s32(vsum,(input[5]),high_vc,1);
1810 + vsum = vmlaq_laneq_s32(vsum,(input[6]),high_vc,2);
1811 + vsum = vmlaq_laneq_s32(vsum,(input[7]),high_vc,3);
1814 + vsum = vshlq_s32(vsum, vhr);
1815 + vsum = vminq_s32(vsum,vdupq_n_s32(maxVal));
1816 + vsum = vmaxq_s32(vsum,vdupq_n_s32(0));
1817 + *(uint16x4_t *)&dst[col] = vmovn_u32(vsum);
1829 +template<int N, int width, int height>
1830 +void interp_vert_pp_neon(const uint8_t* src, intptr_t srcStride, uint8_t* dst, intptr_t dstStride, int coeffIdx)
1833 + const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1834 + int shift = IF_FILTER_PREC;
1835 + int offset = 1 << (shift - 1);
1836 + const uint16_t maxVal = (1 << X265_DEPTH) - 1;
1838 + src -= (N / 2 - 1) * srcStride;
1840 + vc = *(int16x8_t *)c;
1842 + const int16x8_t voffset = vdupq_n_s16(offset);
1843 + const int16x8_t vhr = vdupq_n_s16(-shift);
1846 + for (row = 0; row < height; row++)
1848 + for (col = 0; col < width; col+=8)
1852 + int16x8_t input[N];
1854 + for (int i=0;i<N;i++)
1856 + input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i*srcStride]);
1860 + vsum = vmlaq_laneq_s16(vsum,(input[0]),vc,0);
1861 + vsum = vmlaq_laneq_s16(vsum,(input[1]),vc,1);
1862 + vsum = vmlaq_laneq_s16(vsum,(input[2]),vc,2);
1863 + vsum = vmlaq_laneq_s16(vsum,(input[3]),vc,3);
1867 + vsum = vmlaq_laneq_s16(vsum,(input[4]),vc,4);
1868 + vsum = vmlaq_laneq_s16(vsum,(input[5]),vc,5);
1869 + vsum = vmlaq_laneq_s16(vsum,(input[6]),vc,6);
1870 + vsum = vmlaq_laneq_s16(vsum,(input[7]),vc,7);
1874 + vsum = vshlq_s16(vsum, vhr);
1876 + vsum = vminq_s16(vsum,vdupq_n_s16(maxVal));
1877 + vsum = vmaxq_s16(vsum,vdupq_n_s16(0));
1878 + uint8x16_t usum = vuzp1q_u8(vsum,vsum);
1879 + *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
1894 +template<int N, int width, int height>
1895 +void interp_vert_ps_neon(const uint16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1897 + const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1898 + int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1899 + int shift = IF_FILTER_PREC - headRoom;
1900 + int offset = (unsigned)-IF_INTERNAL_OFFS << shift;
1901 + src -= (N / 2 - 1) * srcStride;
1904 + vc = *(int16x8_t *)c;
1905 + int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
1906 + int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
1908 + const int32x4_t voffset = vdupq_n_s32(offset);
1909 + const int32x4_t vhr = vdupq_n_s32(-shift);
1912 + for (row = 0; row < height; row++)
1914 + for (col = 0; col < width; col+=4)
1918 + int32x4_t input[N];
1920 + for (int i=0;i<N;i++)
1922 + input[i] = vmovl_u16(*(uint16x4_t *)&src[col+i*srcStride]);
1926 + vsum = vmlaq_laneq_s32(vsum,(input[0]),low_vc,0);
1927 + vsum = vmlaq_laneq_s32(vsum,(input[1]),low_vc,1);
1928 + vsum = vmlaq_laneq_s32(vsum,(input[2]),low_vc,2);
1929 + vsum = vmlaq_laneq_s32(vsum,(input[3]),low_vc,3);
1933 + int32x4_t vsum1 = vmulq_laneq_s32((input[4]),high_vc,0);
1934 + vsum1 = vmlaq_laneq_s32(vsum1,(input[5]),high_vc,1);
1935 + vsum1 = vmlaq_laneq_s32(vsum1,(input[6]),high_vc,2);
1936 + vsum1 = vmlaq_laneq_s32(vsum1,(input[7]),high_vc,3);
1937 + vsum = vaddq_s32(vsum,vsum1);
1940 + vsum = vshlq_s32(vsum, vhr);
1942 + *(uint16x4_t *)&dst[col] = vmovn_s32(vsum);
1952 +template<int N, int width, int height>
1953 +void interp_vert_ps_neon(const uint8_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1955 + const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1956 + int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
1957 + int shift = IF_FILTER_PREC - headRoom;
1958 + int offset = (unsigned)-IF_INTERNAL_OFFS << shift;
1959 + src -= (N / 2 - 1) * srcStride;
1962 + vc = *(int16x8_t *)c;
1964 + const int16x8_t voffset = vdupq_n_s16(offset);
1965 + const int16x8_t vhr = vdupq_n_s16(-shift);
1968 + for (row = 0; row < height; row++)
1970 + for (col = 0; col < width; col+=8)
1974 + int16x8_t input[N];
1976 + for (int i=0;i<N;i++)
1978 + input[i] = vmovl_u8(*(uint8x8_t *)&src[col+i*srcStride]);
1982 + vsum = vmlaq_laneq_s16(vsum,(input[0]),vc,0);
1983 + vsum = vmlaq_laneq_s16(vsum,(input[1]),vc,1);
1984 + vsum = vmlaq_laneq_s16(vsum,(input[2]),vc,2);
1985 + vsum = vmlaq_laneq_s16(vsum,(input[3]),vc,3);
1989 + int16x8_t vsum1 = vmulq_laneq_s16((input[4]),vc,4);
1990 + vsum1 = vmlaq_laneq_s16(vsum1,(input[5]),vc,5);
1991 + vsum1 = vmlaq_laneq_s16(vsum1,(input[6]),vc,6);
1992 + vsum1 = vmlaq_laneq_s16(vsum1,(input[7]),vc,7);
1993 + vsum = vaddq_s16(vsum,vsum1);
1996 + vsum = vshlq_s16(vsum, vhr);
1997 + *(int16x8_t *)&dst[col] = vsum;
2009 +template<int N, int width, int height>
2010 +void interp_vert_sp_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
2012 + int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
2013 + int shift = IF_FILTER_PREC + headRoom;
2014 + int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
2015 + uint16_t maxVal = (1 << X265_DEPTH) - 1;
2016 + const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
2018 + src -= (N / 2 - 1) * srcStride;
2021 + vc = *(int16x8_t *)coeff;
2022 + int16x4_t low_vc = vget_low_s16(vc);
2023 + int16x4_t high_vc = vget_high_s16(vc);
2025 + const int32x4_t voffset = vdupq_n_s32(offset);
2026 + const int32x4_t vhr = vdupq_n_s32(-shift);
2029 + for (row = 0; row < height; row++)
2031 + for (col = 0; col < width; col+=8)
2033 + int32x4_t vsum1,vsum2;
2035 + int16x8_t input[N];
2037 + for (int i=0;i<N;i++)
2039 + input[i] = *(int16x8_t *)&src[col+i*srcStride];
2044 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[0]),low_vc,0);
2045 + vsum2 = vmlal_high_lane_s16(vsum2,input[0],low_vc,0);
2047 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[1]),low_vc,1);
2048 + vsum2 = vmlal_high_lane_s16(vsum2,input[1],low_vc,1);
2050 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[2]),low_vc,2);
2051 + vsum2 = vmlal_high_lane_s16(vsum2,input[2],low_vc,2);
2053 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[3]),low_vc,3);
2054 + vsum2 = vmlal_high_lane_s16(vsum2,input[3],low_vc,3);
2058 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[4]),high_vc,0);
2059 + vsum2 = vmlal_high_lane_s16(vsum2,input[4],high_vc,0);
2060 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[5]),high_vc,1);
2061 + vsum2 = vmlal_high_lane_s16(vsum2,input[5],high_vc,1);
2062 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[6]),high_vc,2);
2063 + vsum2 = vmlal_high_lane_s16(vsum2,input[6],high_vc,2);
2064 + vsum1 = vmlal_lane_s16(vsum1,vget_low_s16(input[7]),high_vc,3);
2065 + vsum2 = vmlal_high_lane_s16(vsum2,input[7],high_vc,3);
2068 + vsum1 = vshlq_s32(vsum1, vhr);
2069 + vsum2 = vshlq_s32(vsum2, vhr);
2071 + int16x8_t vsum = vuzp1q_s16(vsum1,vsum2);
2072 + vsum = vminq_s16(vsum,vdupq_n_s16(maxVal));
2073 + vsum = vmaxq_s16(vsum,vdupq_n_s16(0));
2075 + *(int16x8_t *)&dst[col] = vsum;
2077 + uint8x16_t usum = vuzp1q_u8(vsum,vsum);
2078 + *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
2093 +template<int N, int width, int height>
2094 +void interp_hv_pp_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
2096 + ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
2098 + interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
2099 + interp_vert_sp_neon<N,width,height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
2109 +namespace X265_NS {
2110 + #define CHROMA_420(W, H) \
2111 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
2112 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
2113 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>; \
2114 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>; \
2115 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>; \
2116 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
2117 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2118 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2120 + #define CHROMA_422(W, H) \
2121 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
2122 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
2123 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>; \
2124 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>; \
2125 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>; \
2126 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
2127 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2128 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2130 + #define CHROMA_444(W, H) \
2131 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
2132 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
2133 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>; \
2134 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>; \
2135 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>; \
2136 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
2137 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2138 + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
2140 + #define LUMA(W, H) \
2141 + p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_neon<8, W, H>; \
2142 + p.pu[LUMA_ ## W ## x ## H].luma_hps = interp_horiz_ps_neon<8, W, H>; \
2143 + p.pu[LUMA_ ## W ## x ## H].luma_vpp = interp_vert_pp_neon<8, W, H>; \
2144 + p.pu[LUMA_ ## W ## x ## H].luma_vps = interp_vert_ps_neon<8, W, H>; \
2145 + p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_neon<8, W, H>; \
2146 + p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_neon<8, W, H>; \
2147 + p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_neon<8, W, H>; \
2148 + p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
2149 + p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
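Each macro instantiation below wires one partition size to the templated kernels; as an expansion sketch (not an additional definition in the patch), LUMA(16, 16) would produce:

    p.pu[LUMA_16x16].luma_hpp = interp_horiz_pp_neon<8, 16, 16>;
    p.pu[LUMA_16x16].luma_vpp = interp_vert_pp_neon<8, 16, 16>;
    // ... likewise for hps/vps/vsp/vss/hvpp and convert_p2s,
    // with N = 8 (luma taps); the CHROMA_* macros instantiate N = 4.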
2152 +void setupFilterPrimitives_neon(EncoderPrimitives &p)
2155 + // All NEON functions assume the width is a multiple of 8 (the 2, 4 and 12 variants are not optimized)
2169 + CHROMA_420(16, 16);
2171 + CHROMA_420(16, 8);
2173 + CHROMA_420(8, 16);
2175 + CHROMA_420(16, 12);
2178 + CHROMA_420(16, 4);
2181 + CHROMA_420(32, 32);
2183 + CHROMA_420(32, 16);
2185 + CHROMA_420(16, 32);
2187 + CHROMA_420(32, 24);
2189 + CHROMA_420(24, 32);
2191 + CHROMA_420(32, 8);
2193 + CHROMA_420(8, 32);
2194 + CHROMA_422(8, 16);
2196 + CHROMA_422(8, 12);
2198 + CHROMA_422(16, 32);
2199 + CHROMA_422(16, 16);
2200 + CHROMA_422(8, 32);
2201 + CHROMA_422(16, 24);
2202 + CHROMA_422(16, 8);
2203 + CHROMA_422(32, 64);
2204 + CHROMA_422(32, 32);
2205 + CHROMA_422(16, 64);
2206 + CHROMA_422(32, 48);
2207 + CHROMA_422(24, 64);
2208 + CHROMA_422(32, 16);
2209 + CHROMA_422(8, 64);
2212 + CHROMA_444(16, 16);
2213 + CHROMA_444(16, 8);
2214 + CHROMA_444(8, 16);
2215 + CHROMA_444(16, 12);
2216 + CHROMA_444(16, 4);
2217 + CHROMA_444(32, 32);
2218 + CHROMA_444(32, 16);
2219 + CHROMA_444(16, 32);
2220 + CHROMA_444(32, 24);
2221 + CHROMA_444(24, 32);
2222 + CHROMA_444(32, 8);
2223 + CHROMA_444(8, 32);
2224 + CHROMA_444(64, 64);
2225 + CHROMA_444(64, 32);
2226 + CHROMA_444(32, 64);
2227 + CHROMA_444(64, 48);
2228 + CHROMA_444(48, 64);
2229 + CHROMA_444(64, 16);
2230 + CHROMA_444(16, 64);
2240 diff -Naur ./source/common/arm64/filter-prim.h ../x265_apple_patch/source/common/arm64/filter-prim.h
2241 --- ./source/common/arm64/filter-prim.h 1970-01-01 01:00:00.000000000 +0100
2242 +++ ../x265_apple_patch/source/common/arm64/filter-prim.h 2021-05-08 13:08:01.000000000 +0100
2244 +#ifndef _FILTER_PRIM_ARM64_H__
2245 +#define _FILTER_PRIM_ARM64_H__
2248 +#include "common.h"
2249 +#include "slicetype.h" // LOWRES_COST_MASK
2250 +#include "primitives.h"
2254 +namespace X265_NS {
2257 +void setupFilterPrimitives_neon(EncoderPrimitives &p);
2264 diff -Naur ./source/common/arm64/intrapred-prim.cpp ../x265_apple_patch/source/common/arm64/intrapred-prim.cpp
2265 --- ./source/common/arm64/intrapred-prim.cpp 1970-01-01 01:00:00.000000000 +0100
2266 +++ ../x265_apple_patch/source/common/arm64/intrapred-prim.cpp 2021-05-08 13:08:01.000000000 +0100
2268 +/*****************************************************************************
2269 + * Copyright (C) 2013-2017 MulticoreWare, Inc
2271 + * Authors: Min Chen <chenm003@163.com>
2273 + * This program is free software; you can redistribute it and/or modify
2274 + * it under the terms of the GNU General Public License as published by
2275 + * the Free Software Foundation; either version 2 of the License, or
2276 + * (at your option) any later version.
2278 + * This program is distributed in the hope that it will be useful,
2279 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2280 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2281 + * GNU General Public License for more details.
2283 + * You should have received a copy of the GNU General Public License
2284 + * along with this program; if not, write to the Free Software
2285 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
2287 + * This program is also available under a commercial proprietary license.
2288 + * For more information, contact us at license @ x265.com.
2289 + *****************************************************************************/
2292 +#include "common.h"
2293 +#include "primitives.h"
2297 +#include "arm64-utils.h"
2298 +#include <arm_neon.h>
2300 +using namespace X265_NS;
2306 +template<int width>
2307 +void intra_pred_ang_neon(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
2309 + int width2 = width << 1;
2310 + // Flip the neighbours in the horizontal case.
2311 + int horMode = dirMode < 18;
2312 + pixel neighbourBuf[129];
2313 + const pixel *srcPix = srcPix0;
2317 + neighbourBuf[0] = srcPix[0];
2318 + //for (int i = 0; i < width << 1; i++)
2320 + // neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
2321 + // neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
2323 + memcpy(&neighbourBuf[1],&srcPix[width2+1],sizeof(pixel)*(width << 1));
2324 + memcpy(&neighbourBuf[width2 + 1],&srcPix[1],sizeof(pixel)*(width << 1));
2325 + srcPix = neighbourBuf;
2328 + // Intra prediction angle and inverse angle tables.
2329 + const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
2330 + const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
2332 + // Get the prediction angle.
2333 + int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
2334 + int angle = angleTable[8 + angleOffset];
2336 + // Vertical Prediction.
2339 + for (int y = 0; y < width; y++) {
2340 + memcpy(&dst[y * dstStride],srcPix + 1,sizeof(pixel)*width);
2344 + int topLeft = srcPix[0], top = srcPix[1];
2345 + for (int y = 0; y < width; y++)
2346 + dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
2349 + else // Angular prediction.
2351 + // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
2355 + // Use the projected left neighbours and the top neighbours.
2358 + // Number of neighbours projected.
2359 + int nbProjected = -((width * angle) >> 5) - 1;
2360 + pixel *ref_pix = refBuf + nbProjected + 1;
2362 + // Project the neighbours.
2363 + int invAngle = invAngleTable[- angleOffset - 1];
2364 + int invAngleSum = 128;
2365 + for (int i = 0; i < nbProjected; i++)
2367 + invAngleSum += invAngle;
2368 + ref_pix[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)];
2371 + // Copy the top-left and top pixels.
2372 + //for (int i = 0; i < width + 1; i++)
2373 + //ref_pix[-1 + i] = srcPix[i];
2375 + memcpy(&ref_pix[-1],srcPix,(width+1)*sizeof(pixel));
2378 + else // Use the top and top-right neighbours.
2381 + // Pass every row.
2383 + for (int y = 0; y < width; y++)
2385 + angleSum += angle;
2386 + int offset = angleSum >> 5;
2387 + int fraction = angleSum & 31;
2389 + if (fraction) // Interpolate
2391 + if (width >= 8 && sizeof(pixel) == 1)
2393 + const int16x8_t f0 = vdupq_n_s16(32-fraction);
2394 + const int16x8_t f1 = vdupq_n_s16(fraction);
2395 + for (int x = 0;x<width;x+=8) {
2396 + uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
2397 + uint8x8_t in1 = *(uint8x8_t *)&ref[offset+ x + 1];
2398 + int16x8_t lo = vmlaq_s16(vdupq_n_s16(16),vmovl_u8(in0),f0);
2399 + lo = vmlaq_s16(lo,vmovl_u8(in1),f1);
2400 + lo = vshrq_n_s16(lo,5);
2401 + *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
2404 + else if (width >= 4 && sizeof(pixel) == 2)
2406 + const int32x4_t f0 = vdupq_n_s32(32-fraction);
2407 + const int32x4_t f1 = vdupq_n_s32(fraction);
2408 + for (int x = 0;x<width;x+=4) {
2409 + uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
2410 + uint16x4_t in1 = *(uint16x4_t *)&ref[offset+ x + 1];
2411 + int32x4_t lo = vmlaq_s32(vdupq_n_s32(16),vmovl_u16(in0),f0);
2412 + lo = vmlaq_s32(lo,vmovl_u16(in1),f1);
2413 + lo = vshrq_n_s32(lo,5);
2414 + *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
2418 + for (int x = 0; x < width; x++)
2419 + dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
2424 + memcpy(&dst[y * dstStride],&ref[offset],sizeof(pixel)*width);
2429 + // Flip for horizontal.
2432 + if (width == 8) transpose8x8(dst,dst,dstStride,dstStride);
2433 + else if (width == 16) transpose16x16(dst,dst,dstStride,dstStride);
2434 + else if (width == 32) transpose32x32(dst,dst,dstStride,dstStride);
2436 + for (int y = 0; y < width - 1; y++)
2438 + for (int x = y + 1; x < width; x++)
2440 + pixel tmp = dst[y * dstStride + x];
2441 + dst[y * dstStride + x] = dst[x * dstStride + y];
2442 + dst[x * dstStride + y] = tmp;
2449 +template<int log2Size>
2450 +void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
2452 + const int size = 1 << log2Size;
2453 + for (int mode = 2; mode <= 34; mode++)
2455 + pixel *srcPix = (g_intraFilterFlags[mode] & size ? filtPix : refPix);
2456 + pixel *out = dest + ((mode - 2) << (log2Size * 2));
2458 + intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
2460 + // Optimized code: don't flip the buffer
2461 + bool modeHor = (mode < 18);
2463 + // transpose the block if this is a horizontal mode
2466 + if (size == 8) transpose8x8(out,out,size,size);
2467 + else if (size == 16) transpose16x16(out,out,size,size);
2468 + else if (size == 32) transpose32x32(out,out,size,size);
2470 + for (int k = 0; k < size - 1; k++)
2472 + for (int l = k + 1; l < size; l++)
2474 + pixel tmp = out[k * size + l];
2475 + out[k * size + l] = out[l * size + k];
2476 + out[l * size + k] = tmp;
2485 +namespace X265_NS {
2486 +// x265 private namespace
2488 +void setupIntraPrimitives_neon(EncoderPrimitives& p)
2490 +// p.cu[BLOCK_4x4].intra_filter = intraFilter<4>;
2491 +// p.cu[BLOCK_8x8].intra_filter = intraFilter<8>;
2492 +// p.cu[BLOCK_16x16].intra_filter = intraFilter<16>;
2493 +// p.cu[BLOCK_32x32].intra_filter = intraFilter<32>;
2495 +// p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_neon<2>;
2496 +// p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_neon<3>;
2497 +// p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_neon<4>;
2498 +// p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = planar_pred_neon<5>;
2500 +// p.cu[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_neon<4>;
2501 +// p.cu[BLOCK_8x8].intra_pred[DC_IDX] = intra_pred_dc_neon<8>;
2502 +// p.cu[BLOCK_16x16].intra_pred[DC_IDX] = intra_pred_dc_neon<16>;
2503 +// p.cu[BLOCK_32x32].intra_pred[DC_IDX] = intra_pred_dc_neon<32>;
2505 + for (int i = 2; i < NUM_INTRA_MODE; i++)
2507 + p.cu[BLOCK_4x4].intra_pred[i] = intra_pred_ang_neon<4>;
2508 + p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_neon<8>;
2509 + p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_neon<16>;
2510 + p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_neon<32>;
2513 + p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_neon<2>;
2514 + p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_neon<3>;
2515 + p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_neon<4>;
2516 + p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
2524 +namespace X265_NS {
2525 +// x265 private namespace
2526 +void setupIntraPrimitives_neon(EncoderPrimitives& p)
2534 diff -Naur ./source/common/arm64/intrapred-prim.h ../x265_apple_patch/source/common/arm64/intrapred-prim.h
2535 --- ./source/common/arm64/intrapred-prim.h 1970-01-01 01:00:00.000000000 +0100
2536 +++ ../x265_apple_patch/source/common/arm64/intrapred-prim.h 2021-05-08 13:08:01.000000000 +0100
2538 +#ifndef INTRAPRED_PRIM_H__
2540 +#if defined(__aarch64__)
2542 +namespace X265_NS {
2543 +// x265 private namespace
2545 +void setupIntraPrimitives_neon(EncoderPrimitives& p);
2552 diff -Naur ./source/common/arm64/loopfilter-prim.cpp ../x265_apple_patch/source/common/arm64/loopfilter-prim.cpp
2553 --- ./source/common/arm64/loopfilter-prim.cpp 1970-01-01 01:00:00.000000000 +0100
2554 +++ ../x265_apple_patch/source/common/arm64/loopfilter-prim.cpp 2021-05-08 13:08:01.000000000 +0100
2556 +/*****************************************************************************
2557 +* Copyright (C) 2013-2017 MulticoreWare, Inc
2559 +* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
2560 +* Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
2561 +* Min Chen <chenm003@163.com>
2563 +* This program is free software; you can redistribute it and/or modify
2564 +* it under the terms of the GNU General Public License as published by
2565 +* the Free Software Foundation; either version 2 of the License, or
2566 +* (at your option) any later version.
2568 +* This program is distributed in the hope that it will be useful,
2569 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
2570 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2571 +* GNU General Public License for more details.
2573 +* You should have received a copy of the GNU General Public License
2574 +* along with this program; if not, write to the Free Software
2575 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
2577 +* This program is also available under a commercial proprietary license.
2578 +* For more information, contact us at license @ x265.com.
2579 +*****************************************************************************/
2580 +#include "loopfilter-prim.h"
2582 +#define PIXEL_MIN 0
2586 +#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
2587 +#include <arm_neon.h>
2592 +/* get the sign of input variable (TODO: this is a dup, make common) */
2593 +static inline int8_t signOf(int x)
2595 + return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
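signOf() is branchless: (x >> 31) yields -1 for negative x, and ((uint32_t)-x) >> 31 contributes 1 exactly when x is strictly positive. A standalone check of the idiom (not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static int8_t sign_of(int x)
    {
        return (int8_t)((x >> 31) | (int)(((uint32_t)-x) >> 31));
    }

    int main(void)
    {
        assert(sign_of(-7) == -1 && sign_of(0) == 0 && sign_of(7) == 1);
        return 0;
    }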
2598 +static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
2600 + int16x8_t in = vsubl_u8(in0,in1);
2601 + return vmovn_s16(vmaxq_s16(vminq_s16(in,vdupq_n_s16(1)),vdupq_n_s16(-1)));
2604 +static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
2607 + for (; (x + 8) <= endX; x += 8) {
2608 + *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x],*(uint8x8_t *)&src2[x]);
2611 + for (; x < endX; x++)
2612 + dst[x] = signOf(src1[x] - src2[x]);
2615 +static void processSaoCUE0_neon(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
2620 + int8_t signRight, signLeft0;
2623 + for (y = 0; y < 2; y++)
2625 + signLeft0 = signLeft[y];
2629 + int8x8_t vsignRight;
2630 + int8x8x2_t shifter;
2631 + shifter.val[1][0] = signLeft0;
2632 + static const int8x8_t index = {8,0,1,2,3,4,5,6};
2633 + int8x8_t tbl = *(int8x8_t *)offsetEo;
2634 + for (; (x+8) <= width; x+=8)
2636 + uint8x8_t in = *(uint8x8_t *)&rec[x];
2637 + vsignRight = sign_diff_neon(in,*(uint8x8_t *)&rec[x+1]);
2638 + shifter.val[0] = vneg_s8(vsignRight);
2639 + int8x8_t tmp = shifter.val[0];
2640 + int8x8_t edge = vtbl2_s8(shifter,index);
2641 + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight,edge),vdup_n_s8(2));
2642 + shifter.val[1][0] = tmp[7];
2643 + int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2644 + t1 = vaddw_u8(t1,in);
2645 + t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2646 + t1 = vminq_s16(t1,vdupq_n_s16(255));
2647 + *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2649 + signLeft0 = shifter.val[1][0];
2651 + for (; x < width; x++)
2653 + signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
2654 + edgeType = signRight + signLeft0 + 2;
2655 + signLeft0 = -signRight;
2656 + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2662 +static void processSaoCUE1_neon(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
2669 + int8x8_t tbl = *(int8x8_t *)offsetEo;
2670 + for (; (x+8) <= width; x+=8)
2672 + uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2673 + uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride];
2674 + int8x8_t vsignDown = sign_diff_neon(in0,in1);
2675 + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&upBuff1[x]),vdup_n_s8(2));
2676 + *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
2677 + int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2678 + t1 = vaddw_u8(t1,in0);
2679 + *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
2682 + for (; x < width; x++)
2684 + signDown = signOf(rec[x] - rec[x + stride]);
2685 + edgeType = signDown + upBuff1[x] + 2;
2686 + upBuff1[x] = -signDown;
2687 + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2691 +static void processSaoCUE1_2Rows_neon(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
2697 + for (y = 0; y < 2; y++)
2701 + int8x8_t tbl = *(int8x8_t *)offsetEo;
2702 + for (; (x+8) <= width; x+=8)
2704 + uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2705 + uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride];
2706 + int8x8_t vsignDown = sign_diff_neon(in0,in1);
2707 + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&upBuff1[x]),vdup_n_s8(2));
2708 + *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
2709 + int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2710 + t1 = vaddw_u8(t1,in0);
2711 + t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2712 + t1 = vminq_s16(t1,vdupq_n_s16(255));
2713 + *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2717 + for (; x < width; x++)
2719 + signDown = signOf(rec[x] - rec[x + stride]);
2720 + edgeType = signDown + upBuff1[x] + 2;
2721 + upBuff1[x] = -signDown;
2722 + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2728 +static void processSaoCUE2_neon(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
2732 + if (abs(buff1-bufft) < 16)
2734 + for (x = 0; x < width; x++)
2736 + int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
2737 + int edgeType = signDown + buff1[x] + 2;
2738 + bufft[x + 1] = -signDown;
2739 + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2744 + int8x8_t tbl = *(int8x8_t *)offsetEo;
2746 + for (; (x + 8) <= width; x+=8)
2748 + uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2749 + uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride+1];
2750 + int8x8_t vsignDown = sign_diff_neon(in0,in1);
2751 + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&buff1[x]),vdup_n_s8(2));
2752 + *(int8x8_t *)&bufft[x+1] = vneg_s8(vsignDown);
2753 + int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2754 + t1 = vaddw_u8(t1,in0);
2755 + t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2756 + t1 = vminq_s16(t1,vdupq_n_s16(255));
2757 + *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2759 + for (; x < width; x++)
2761 + int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
2762 + int edgeType = signDown + buff1[x] + 2;
2763 + bufft[x + 1] = -signDown;
2764 + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2771 +static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
2775 + int8x8_t tbl = *(int8x8_t *)offsetEo;
2777 + int x = startX + 1;
2778 + for (; (x+8) <= endX; x+=8 )
2780 + uint8x8_t in0 = *(uint8x8_t *)&rec[x];
2781 + uint8x8_t in1 = *(uint8x8_t *)&rec[x+stride];
2782 + int8x8_t vsignDown = sign_diff_neon(in0,in1);
2783 + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown,*(int8x8_t *)&upBuff1[x]),vdup_n_s8(2));
2784 + *(int8x8_t *)&upBuff1[x-1] = vneg_s8(vsignDown);
2785 + int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl,vedgeType));
2786 + t1 = vaddw_u8(t1,in0);
2787 + t1 = vmaxq_s16(t1,vdupq_n_s16(0));
2788 + t1 = vminq_s16(t1,vdupq_n_s16(255));
2789 + *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
2792 + for (; x < endX; x++)
2794 + signDown = signOf(rec[x] - rec[x + stride]);
2795 + edgeType = signDown + upBuff1[x] + 2;
2796 + upBuff1[x - 1] = -signDown;
2797 + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
2801 +static void processSaoCUB0_neon(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
2803 + #define SAO_BO_BITS 5
2804 + const int boShift = X265_DEPTH - SAO_BO_BITS;
2807 + table = *(int8x8x4_t *)offset;
2809 + for (y = 0; y < ctuHeight; y++)
2812 + for (x = 0; (x+8) <= ctuWidth; x+=8)
2814 + int8x8_t in = *(int8x8_t*)&rec[x];
2815 + int8x8_t offsets = vtbl4_s8(table,vshr_n_u8(in,boShift));
2816 + int16x8_t tmp = vmovl_s8(offsets);
2817 + tmp = vaddw_u8(tmp,in);
2818 + tmp = vmaxq_s16(tmp,vdupq_n_s16(0));
2819 + tmp = vminq_s16(tmp,vdupq_n_s16(255));
2820 + *(uint8x8_t *)&rec[x] = vmovn_u16(tmp);
2822 + for (; x < ctuWidth; x++)
2824 + rec[x] = x265_clip(rec[x] + offset[rec[x] >> boShift]);
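The band-offset path classifies each sample by its top SAO_BO_BITS bits, and vtbl4_s8 (a 32-byte table lookup) applies the same offset[rec[x] >> boShift] selection to eight samples at once. A worked example, assuming 8-bit pixels:

    // boShift = X265_DEPTH - SAO_BO_BITS = 8 - 5 = 3
    // rec[x] = 200  ->  band = 200 >> 3 = 25  ->  offset[25] is added and the result clipped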
2834 +namespace X265_NS {
2835 +void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
2837 + p.saoCuOrgE0 = processSaoCUE0_neon;
2838 + p.saoCuOrgE1 = processSaoCUE1_neon;
2839 + p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows_neon;
2840 + p.saoCuOrgE2[0] = processSaoCUE2_neon;
2841 + p.saoCuOrgE2[1] = processSaoCUE2_neon;
2842 + p.saoCuOrgE3[0] = processSaoCUE3_neon;
2843 + p.saoCuOrgE3[1] = processSaoCUE3_neon;
2844 + p.saoCuOrgB0 = processSaoCUB0_neon;
2845 + p.sign = calSign_neon;
2849 +#else //HIGH_BIT_DEPTH
2852 +namespace X265_NS {
2853 +void setupLoopFilterPrimitives_neon(EncoderPrimitives &)
2861 diff -Naur ./source/common/arm64/loopfilter-prim.h ../x265_apple_patch/source/common/arm64/loopfilter-prim.h
2862 --- ./source/common/arm64/loopfilter-prim.h 1970-01-01 01:00:00.000000000 +0100
2863 +++ ../x265_apple_patch/source/common/arm64/loopfilter-prim.h 2021-05-08 13:08:01.000000000 +0100
2865 +#ifndef _LOOPFILTER_NEON_H__
2866 +#define _LOOPFILTER_NEON_H__
2869 +/*****************************************************************************
2870 +* Copyright (C) 2013-2017 MulticoreWare, Inc
2872 +* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
2873 +* Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
2874 +* Min Chen <chenm003@163.com>
2876 +* This program is free software; you can redistribute it and/or modify
2877 +* it under the terms of the GNU General Public License as published by
2878 +* the Free Software Foundation; either version 2 of the License, or
2879 +* (at your option) any later version.
2881 +* This program is distributed in the hope that it will be useful,
2882 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
2883 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2884 +* GNU General Public License for more details.
2886 +* You should have received a copy of the GNU General Public License
2887 +* along with this program; if not, write to the Free Software
2888 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
2890 +* This program is also available under a commercial proprietary license.
2891 +* For more information, contact us at license @ x265.com.
2892 +*****************************************************************************/
2896 +#include "common.h"
2897 +#include "primitives.h"
2899 +#define PIXEL_MIN 0
2901 +namespace X265_NS {
2902 +void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
2908 diff -Naur ./source/common/arm64/pixel-prim.cpp ../x265_apple_patch/source/common/arm64/pixel-prim.cpp
2909 --- ./source/common/arm64/pixel-prim.cpp 1970-01-01 01:00:00.000000000 +0100
2910 +++ ../x265_apple_patch/source/common/arm64/pixel-prim.cpp 2021-05-08 13:08:01.000000000 +0100
2912 +#include "common.h"
2913 +#include "slicetype.h" // LOWRES_COST_MASK
2914 +#include "primitives.h"
2917 +#include "pixel-prim.h"
2918 +#include "arm64-utils.h"
2921 +#include <arm_neon.h>
2923 +using namespace X265_NS;
2930 +/* SATD SA8D variants - based on x264 */
2931 +static inline void SUMSUB_AB(int16x8_t& sum, int16x8_t& sub, const int16x8_t a, const int16x8_t b)
2933 + sum = vaddq_s16(a,b);
2934 + sub = vsubq_s16(a,b);
2937 +static inline void transpose_8h(int16x8_t& t1, int16x8_t& t2, const int16x8_t s1, const int16x8_t s2)
2939 + t1 = vtrn1q_s16(s1, s2);
2940 + t2 = vtrn2q_s16(s1, s2);
2943 +static inline void transpose_4s(int16x8_t& t1, int16x8_t& t2, const int16x8_t s1, const int16x8_t s2)
2945 + t1 = vtrn1q_s32(s1, s2);
2946 + t2 = vtrn2q_s32(s1, s2);
2949 +#if (X265_DEPTH <= 10)
2950 +static inline void transpose_2d(int16x8_t& t1, int16x8_t& t2, const int16x8_t s1, const int16x8_t s2)
2952 + t1 = vtrn1q_s64(s1, s2);
2953 + t2 = vtrn2q_s64(s1, s2);
2958 +static inline void SUMSUB_ABCD(int16x8_t& s1, int16x8_t& d1, int16x8_t& s2, int16x8_t& d2,
2959 + int16x8_t a,int16x8_t b,int16x8_t c,int16x8_t d)
2961 + SUMSUB_AB(s1,d1,a,b);
2962 + SUMSUB_AB(s2,d2,c,d);
2965 +static inline void HADAMARD4_V(int16x8_t& r1,int16x8_t& r2,int16x8_t& r3,int16x8_t& r4,
2966 + int16x8_t& t1,int16x8_t& t2,int16x8_t& t3,int16x8_t& t4)
2968 + SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
2969 + SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
2973 +static int _satd_4x8_8x4_end_neon(int16x8_t v0,int16x8_t v1,int16x8_t v2, int16x8_t v3)
2977 + int16x8_t v4,v5,v6,v7,v16,v17,v18,v19;
2980 + SUMSUB_AB (v16, v17, v0, v1);
2981 + SUMSUB_AB (v18, v19, v2, v3);
2983 + SUMSUB_AB (v4 , v6 , v16, v18);
2984 + SUMSUB_AB (v5 , v7 , v17, v19);
2986 + v0 = vtrn1q_s16(v4, v5);
2987 + v1 = vtrn2q_s16(v4, v5);
2988 + v2 = vtrn1q_s16(v6, v7);
2989 + v3 = vtrn2q_s16(v6, v7);
2991 + SUMSUB_AB (v16, v17, v0, v1);
2992 + SUMSUB_AB (v18, v19, v2, v3);
2994 + v0 = vtrn1q_s32(v16, v18);
2995 + v1 = vtrn2q_s32(v16, v18);
2996 + v2 = vtrn1q_s32(v17, v19);
2997 + v3 = vtrn2q_s32(v17, v19);
2999 + v0 = vabsq_s16(v0);
3000 + v1 = vabsq_s16(v1);
3001 + v2 = vabsq_s16(v2);
3002 + v3 = vabsq_s16(v3);
3004 + v0 = vmaxq_u16(v0, v1);
3005 + v1 = vmaxq_u16(v2, v3);
3007 + v0 = vaddq_u16(v0, v1);
3008 + return vaddlvq_u16(v0);
3011 +static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
3014 + SUMSUB_AB (v2, v3, v0, v1);
3016 + v0 = vzip1q_s64(v2,v3);
3017 + v1 = vzip2q_s64(v2,v3);
3018 + SUMSUB_AB (v2, v3, v0, v1);
3020 + v0 = vtrn1q_s16(v2,v3);
3021 + v1 = vtrn2q_s16(v2,v3);
3022 + SUMSUB_AB (v2, v3, v0, v1);
3024 + v0 = vtrn1q_s32(v2,v3);
3025 + v1 = vtrn2q_s32(v2,v3);
3027 + v0 = vabsq_s16(v0);
3028 + v1 = vabsq_s16(v1);
3029 + v0 = vmaxq_u16(v0, v1);
3031 + return vaddlvq_s16(v0);
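The abs/max sequence at the end of these SATD helpers appears to rely on the identity |a+b| + |a-b| == 2*max(|a|,|b|): keeping only the element-wise maximum of the two butterfly outputs stands in for summing both and halving. A standalone check of the identity (not part of the patch):

    #include <assert.h>
    #include <stdlib.h>

    int main(void)
    {
        for (int a = -4; a <= 4; a++)
            for (int b = -4; b <= 4; b++)
                assert(abs(a + b) + abs(a - b) ==
                       2 * (abs(a) > abs(b) ? abs(a) : abs(b)));
        return 0;
    }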
3034 +static void _satd_8x4v_8x8h_neon(int16x8_t& v0,int16x8_t& v1, int16x8_t&v2,int16x8_t& v3,int16x8_t& v20,int16x8_t& v21, int16x8_t&v22,int16x8_t& v23)
3036 + int16x8_t v16,v17,v18,v19,v4,v5,v6,v7;
3038 + SUMSUB_AB(v16, v18, v0, v2);
3039 + SUMSUB_AB(v17, v19, v1, v3);
3041 + HADAMARD4_V (v20, v21, v22, v23, v0, v1, v2, v3);
3043 + transpose_8h( v0, v1, v16, v17);
3044 + transpose_8h( v2, v3, v18, v19);
3045 + transpose_8h( v4, v5, v20, v21);
3046 + transpose_8h( v6, v7, v22, v23);
3048 + SUMSUB_AB (v16, v17, v0, v1);
3049 + SUMSUB_AB (v18, v19, v2, v3);
3050 + SUMSUB_AB (v20, v21, v4, v5);
3051 + SUMSUB_AB (v22, v23, v6, v7);
3053 + transpose_4s( v0, v2, v16, v18);
3054 + transpose_4s( v1, v3, v17, v19);
3055 + transpose_4s( v4, v6, v20, v22);
3056 + transpose_4s( v5, v7, v21, v23);
3058 + v0 = vabsq_s16(v0);
3059 + v1 = vabsq_s16(v1);
3060 + v2 = vabsq_s16(v2);
3061 + v3 = vabsq_s16(v3);
3062 + v4 = vabsq_s16(v4);
3063 + v5 = vabsq_s16(v5);
3064 + v6 = vabsq_s16(v6);
3065 + v7 = vabsq_s16(v7);
3067 + v0 = vmaxq_u16(v0,v2);
3068 + v1 = vmaxq_u16(v1,v3);
3069 + v2 = vmaxq_u16(v4,v6);
3070 + v3 = vmaxq_u16(v5,v7);
3076 +#if (X265_DEPTH > 10)
3077 +static inline void transpose_2d(int32x4_t& t1, int32x4_t& t2, const int32x4_t s1, const int32x4_t s2)
3079 + t1 = vtrn1q_s64(s1, s2);
3080 + t2 = vtrn2q_s64(s1, s2);
3083 +static inline void ISUMSUB_AB(int32x4_t& sum, int32x4_t& sub, const int32x4_t a, const int32x4_t b)
3085 + sum = vaddq_s32(a,b);
3086 + sub = vsubq_s32(a,b);
3089 +static inline void ISUMSUB_AB_FROM_INT16(int32x4_t& suml, int32x4_t& sumh, int32x4_t& subl, int32x4_t& subh, const int16x8_t a, const int16x8_t b)
3091 + suml = vaddl_s16(vget_low_s16(a),vget_low_s16(b));
3092 + sumh = vaddl_high_s16(a,b);
3093 + subl = vsubl_s16(vget_low_s16(a),vget_low_s16(b));
3094 + subh = vsubl_high_s16(a, b);
3099 +static inline void _sub_8x8_fly(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2,
3100 + int16x8_t& v0,int16x8_t& v1, int16x8_t& v2,int16x8_t& v3,
3101 + int16x8_t& v20,int16x8_t& v21, int16x8_t& v22,int16x8_t& v23)
3103 + uint16x8_t r0,r1,r2,r3;
3104 + uint16x8_t t0,t1,t2,t3;
3105 + int16x8_t v16,v17;
3106 + int16x8_t v18,v19;
3108 + r0 = *(uint16x8_t*)(pix1 + 0*stride_pix1);
3109 + r1 = *(uint16x8_t*)(pix1 + 1*stride_pix1);
3110 + r2 = *(uint16x8_t*)(pix1 + 2*stride_pix1);
3111 + r3 = *(uint16x8_t*)(pix1 + 3*stride_pix1);
3113 + t0 = *(uint16x8_t*)(pix2 + 0*stride_pix2);
3114 + t1 = *(uint16x8_t*)(pix2 + 1*stride_pix2);
3115 + t2 = *(uint16x8_t*)(pix2 + 2*stride_pix2);
3116 + t3 = *(uint16x8_t*)(pix2 + 3*stride_pix2);
3118 + v16 = vsubq_u16(r0,t0);
3119 + v17 = vsubq_u16(r1,t1);
3120 + v18 = vsubq_u16(r2,t2);
3121 + v19 = vsubq_u16(r3,t3);
3123 + r0 = *(uint16x8_t*)(pix1 + 4*stride_pix1);
3124 + r1 = *(uint16x8_t*)(pix1 + 5*stride_pix1);
3125 + r2 = *(uint16x8_t*)(pix1 + 6*stride_pix1);
3126 + r3 = *(uint16x8_t*)(pix1 + 7*stride_pix1);
3128 + t0 = *(uint16x8_t*)(pix2 + 4*stride_pix2);
3129 + t1 = *(uint16x8_t*)(pix2 + 5*stride_pix2);
3130 + t2 = *(uint16x8_t*)(pix2 + 6*stride_pix2);
3131 + t3 = *(uint16x8_t*)(pix2 + 7*stride_pix2);
3133 + v20 = vsubq_u16(r0,t0);
3134 + v21 = vsubq_u16(r1,t1);
3135 + v22 = vsubq_u16(r2,t2);
3136 + v23 = vsubq_u16(r3,t3);
3138 + SUMSUB_AB (v0, v1, v16, v17);
3139 + SUMSUB_AB (v2, v3, v18, v19);
3146 +static void _satd_16x4_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2,
3147 + int16x8_t& v0,int16x8_t&v1, int16x8_t&v2,int16x8_t&v3)
3149 + uint16x8_t r0,r1,r2,r3;
3150 + uint16x8_t t0,t1,t2,t3;
3151 + int16x8_t v16,v17,v20,v21;
3152 + int16x8_t v18,v19,v22,v23;
3154 + r0 = *(int16x8_t*)(pix1 + 0*stride_pix1);
3155 + r1 = *(int16x8_t*)(pix1 + 1*stride_pix1);
3156 + r2 = *(int16x8_t*)(pix1 + 2*stride_pix1);
3157 + r3 = *(int16x8_t*)(pix1 + 3*stride_pix1);
3159 + t0 = *(int16x8_t*)(pix2 + 0*stride_pix2);
3160 + t1 = *(int16x8_t*)(pix2 + 1*stride_pix2);
3161 + t2 = *(int16x8_t*)(pix2 + 2*stride_pix2);
3162 + t3 = *(int16x8_t*)(pix2 + 3*stride_pix2);
3165 + v16 = vsubq_u16((r0),(t0) );
3166 + v17 = vsubq_u16((r1),(t1) );
3167 + v18 = vsubq_u16((r2),(t2) );
3168 + v19 = vsubq_u16((r3),(t3) );
3170 + r0 = *(int16x8_t*)(pix1 + 0*stride_pix1 + 8);
3171 + r1 = *(int16x8_t*)(pix1 + 1*stride_pix1 + 8);
3172 + r2 = *(int16x8_t*)(pix1 + 2*stride_pix1 + 8);
3173 + r3 = *(int16x8_t*)(pix1 + 3*stride_pix1 + 8);
3175 + t0 = *(int16x8_t*)(pix2 + 0*stride_pix2 + 8);
3176 + t1 = *(int16x8_t*)(pix2 + 1*stride_pix2 + 8);
3177 + t2 = *(int16x8_t*)(pix2 + 2*stride_pix2 + 8);
3178 + t3 = *(int16x8_t*)(pix2 + 3*stride_pix2 + 8);
3181 + v20 = vsubq_u16(r0,t0);
3182 + v21 = vsubq_u16(r1,t1);
3183 + v22 = vsubq_u16(r2,t2);
3184 + v23 = vsubq_u16(r3,t3);
3186 + SUMSUB_AB (v0, v1, v16, v17);
3187 + SUMSUB_AB (v2, v3, v18, v19);
3189 + _satd_8x4v_8x8h_neon(v0,v1,v2,v3,v20,v21,v22,v23);
3194 +int pixel_satd_4x4_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
3196 + uint64x2_t t0,t1,r0,r1;
3197 + t0[0] = *(uint64_t *)(pix1 + 0*stride_pix1);
3198 + t1[0] = *(uint64_t *)(pix1 + 1*stride_pix1);
3199 + t0[1] = *(uint64_t *)(pix1 + 2*stride_pix1);
3200 + t1[1] = *(uint64_t *)(pix1 + 3*stride_pix1);
3202 + r0[0] = *(uint64_t *)(pix2 + 0*stride_pix2);
3203 + r1[0] = *(uint64_t *)(pix2 + 1*stride_pix2);
3204 + r0[1] = *(uint64_t *)(pix2 + 2*stride_pix2);
3205 + r1[1] = *(uint64_t *)(pix2 + 3*stride_pix2);
3207 + return _satd_4x4_neon(vsubq_u16(t0,r0), vsubq_u16(r1,t1));
3215 +int pixel_satd_8x4_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
3217 + uint16x8_t i0,i1,i2,i3,i4,i5,i6,i7;
3219 + i0 = *(uint16x8_t *)(pix1 + 0*stride_pix1);
3220 + i1 = *(uint16x8_t *)(pix2 + 0*stride_pix2);
3221 + i2 = *(uint16x8_t *)(pix1 + 1*stride_pix1);
3222 + i3 = *(uint16x8_t *)(pix2 + 1*stride_pix2);
3223 + i4 = *(uint16x8_t *)(pix1 + 2*stride_pix1);
3224 + i5 = *(uint16x8_t *)(pix2 + 2*stride_pix2);
3225 + i6 = *(uint16x8_t *)(pix1 + 3*stride_pix1);
3226 + i7 = *(uint16x8_t *)(pix2 + 3*stride_pix2);
3228 + int16x8_t v0 = vsubq_u16(i0,i1);
3229 + int16x8_t v1 = vsubq_u16(i2,i3);
3230 + int16x8_t v2 = vsubq_u16(i4,i5);
3231 + int16x8_t v3 = vsubq_u16(i6,i7);
3233 + return _satd_4x8_8x4_end_neon(v0,v1,v2,v3);
3237 +int pixel_satd_16x16_neon(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
3239 + int32x4_t v30 = vdupq_n_u32(0),v31= vdupq_n_u32(0);
3240 + int16x8_t v0,v1,v2,v3;
3242 + _satd_16x4_neon(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3);
3243 + v30 = vpadalq_u16(v30,v0);
3244 + v30 = vpadalq_u16(v30,v1);
3245 + v31 = vpadalq_u16(v31,v2);
3246 + v31 = vpadalq_u16(v31,v3);
3248 + _satd_16x4_neon(pix1 + 4*stride_pix1,stride_pix1,pix2+4*stride_pix2,stride_pix2,v0,v1,v2,v3);
3249 + v30 = vpadalq_u16(v30,v0);
3250 + v30 = vpadalq_u16(v30,v1);
3251 + v31 = vpadalq_u16(v31,v2);
3252 + v31 = vpadalq_u16(v31,v3);
3254 + _satd_16x4_neon(pix1 + 8*stride_pix1,stride_pix1,pix2+8*stride_pix2,stride_pix2,v0,v1,v2,v3);
3255 + v30 = vpadalq_u16(v30,v0);
3256 + v30 = vpadalq_u16(v30,v1);
3257 + v31 = vpadalq_u16(v31,v2);
3258 + v31 = vpadalq_u16(v31,v3);
3260 + _satd_16x4_neon(pix1 + 12*stride_pix1,stride_pix1,pix2+12*stride_pix2,stride_pix2,v0,v1,v2,v3);
3261 + v30 = vpadalq_u16(v30,v0);
3262 + v30 = vpadalq_u16(v30,v1);
3263 + v31 = vpadalq_u16(v31,v2);
3264 + v31 = vpadalq_u16(v31,v3);
3266 + return vaddvq_s32(vaddq_s32(v30,v31));
3270 +#else //HIGH_BIT_DEPTH
3272 +static void _satd_16x4_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2,
3273 + int16x8_t& v0,int16x8_t&v1, int16x8_t&v2,int16x8_t&v3)
3275 + uint8x16_t r0,r1,r2,r3;
3276 + uint8x16_t t0,t1,t2,t3;
3277 + int16x8_t v16,v17,v20,v21;
3278 + int16x8_t v18,v19,v22,v23;
3280 + r0 = *(uint8x16_t*)(pix1 + 0*stride_pix1);
3281 + r1 = *(uint8x16_t*)(pix1 + 1*stride_pix1);
3282 + r2 = *(uint8x16_t*)(pix1 + 2*stride_pix1);
3283 + r3 = *(uint8x16_t*)(pix1 + 3*stride_pix1);
3285 + t0 = *(uint8x16_t*)(pix2 + 0*stride_pix2);
3286 + t1 = *(uint8x16_t*)(pix2 + 1*stride_pix2);
3287 + t2 = *(uint8x16_t*)(pix2 + 2*stride_pix2);
3288 + t3 = *(uint8x16_t*)(pix2 + 3*stride_pix2);
3292 + v16 = vsubl_u8(vget_low_u8(r0),vget_low_u8(t0) );
3293 + v20 = vsubl_high_u8(r0,t0);
3294 + v17 = vsubl_u8(vget_low_u8(r1),vget_low_u8(t1) );
3295 + v21 = vsubl_high_u8(r1,t1);
3296 + v18 = vsubl_u8(vget_low_u8(r2),vget_low_u8(t2) );
3297 + v22 = vsubl_high_u8(r2,t2);
3298 + v19 = vsubl_u8(vget_low_u8(r3),vget_low_u8(t3) );
3299 + v23 = vsubl_high_u8(r3,t3);
3301 + SUMSUB_AB (v0, v1, v16, v17);
3302 + SUMSUB_AB (v2, v3, v18, v19);
3304 + _satd_8x4v_8x8h_neon(v0,v1,v2,v3,v20,v21,v22,v23);
3309 +static inline void _sub_8x8_fly(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2,
3310 + int16x8_t& v0,int16x8_t& v1, int16x8_t& v2,int16x8_t& v3,
3311 + int16x8_t& v20,int16x8_t& v21, int16x8_t& v22,int16x8_t& v23)
3313 + uint8x8_t r0,r1,r2,r3;
3314 + uint8x8_t t0,t1,t2,t3;
3315 + int16x8_t v16,v17;
3316 + int16x8_t v18,v19;
3318 + r0 = *(uint8x8_t*)(pix1 + 0*stride_pix1);
3319 + r1 = *(uint8x8_t*)(pix1 + 1*stride_pix1);
3320 + r2 = *(uint8x8_t*)(pix1 + 2*stride_pix1);
3321 + r3 = *(uint8x8_t*)(pix1 + 3*stride_pix1);
3323 + t0 = *(uint8x8_t*)(pix2 + 0*stride_pix2);
3324 + t1 = *(uint8x8_t*)(pix2 + 1*stride_pix2);
3325 + t2 = *(uint8x8_t*)(pix2 + 2*stride_pix2);
3326 + t3 = *(uint8x8_t*)(pix2 + 3*stride_pix2);
3328 + v16 = vsubl_u8(r0,t0);
3329 + v17 = vsubl_u8(r1,t1);
3330 + v18 = vsubl_u8(r2,t2);
3331 + v19 = vsubl_u8(r3,t3);
3333 + r0 = *(uint8x8_t*)(pix1 + 4*stride_pix1);
3334 + r1 = *(uint8x8_t*)(pix1 + 5*stride_pix1);
3335 + r2 = *(uint8x8_t*)(pix1 + 6*stride_pix1);
3336 + r3 = *(uint8x8_t*)(pix1 + 7*stride_pix1);
3338 + t0 = *(uint8x8_t*)(pix2 + 4*stride_pix2);
3339 + t1 = *(uint8x8_t*)(pix2 + 5*stride_pix2);
3340 + t2 = *(uint8x8_t*)(pix2 + 6*stride_pix2);
3341 + t3 = *(uint8x8_t*)(pix2 + 7*stride_pix2);
3343 + v20 = vsubl_u8(r0,t0);
3344 + v21 = vsubl_u8(r1,t1);
3345 + v22 = vsubl_u8(r2,t2);
3346 + v23 = vsubl_u8(r3,t3);
3349 + SUMSUB_AB (v0, v1, v16, v17);
3350 + SUMSUB_AB (v2, v3, v18, v19);
3354 +int pixel_satd_4x4_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2)
3356 + uint32x2_t t0,t1,r0,r1;
3357 + t0[0] = *(uint32_t *)(pix1 + 0*stride_pix1);
3358 + t1[0] = *(uint32_t *)(pix1 + 1*stride_pix1);
3359 + t0[1] = *(uint32_t *)(pix1 + 2*stride_pix1);
3360 + t1[1] = *(uint32_t *)(pix1 + 3*stride_pix1);
3362 + r0[0] = *(uint32_t *)(pix2 + 0*stride_pix2);
3363 + r1[0] = *(uint32_t *)(pix2 + 1*stride_pix2);
3364 + r0[1] = *(uint32_t *)(pix2 + 2*stride_pix2);
3365 + r1[1] = *(uint32_t *)(pix2 + 3*stride_pix2);
3367 + return _satd_4x4_neon(vsubl_u8(t0,r0), vsubl_u8(r1,t1));
3371 +int pixel_satd_8x4_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2)
3373 + uint8x8_t i0,i1,i2,i3,i4,i5,i6,i7;
3375 + i0 = *(uint8x8_t *)(pix1 + 0*stride_pix1);
3376 + i1 = *(uint8x8_t *)(pix2 + 0*stride_pix2);
3377 + i2 = *(uint8x8_t *)(pix1 + 1*stride_pix1);
3378 + i3 = *(uint8x8_t *)(pix2 + 1*stride_pix2);
3379 + i4 = *(uint8x8_t *)(pix1 + 2*stride_pix1);
3380 + i5 = *(uint8x8_t *)(pix2 + 2*stride_pix2);
3381 + i6 = *(uint8x8_t *)(pix1 + 3*stride_pix1);
3382 + i7 = *(uint8x8_t *)(pix2 + 3*stride_pix2);
3384 + int16x8_t v0 = vsubl_u8(i0,i1);
3385 + int16x8_t v1 = vsubl_u8(i2,i3);
3386 + int16x8_t v2 = vsubl_u8(i4,i5);
3387 + int16x8_t v3 = vsubl_u8(i6,i7);
3389 + return _satd_4x8_8x4_end_neon(v0,v1,v2,v3);
3392 +int pixel_satd_16x16_neon(const uint8_t* pix1, intptr_t stride_pix1, const uint8_t* pix2, intptr_t stride_pix2)
3394 + int16x8_t v30,v31;
3395 + int16x8_t v0,v1,v2,v3;
3397 + _satd_16x4_neon(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3);
3398 + v30 = vaddq_s16(v0,v1);
3399 + v31 = vaddq_s16(v2,v3);
3401 + _satd_16x4_neon(pix1 + 4*stride_pix1,stride_pix1,pix2+4*stride_pix2,stride_pix2,v0,v1,v2,v3);
3402 + v0 = vaddq_s16(v0,v1);
3403 + v1 = vaddq_s16(v2,v3);
3404 + v30 = vaddq_s16(v30, v0);
3405 + v31 = vaddq_s16(v31, v1);
3407 + _satd_16x4_neon(pix1 + 8*stride_pix1,stride_pix1,pix2+8*stride_pix2,stride_pix2,v0,v1,v2,v3);
3408 + v0 = vaddq_s16(v0,v1);
3409 + v1 = vaddq_s16(v2,v3);
3410 + v30 = vaddq_s16(v30, v0);
3411 + v31 = vaddq_s16(v31, v1);
3413 + _satd_16x4_neon(pix1 + 12*stride_pix1,stride_pix1,pix2+12*stride_pix2,stride_pix2,v0,v1,v2,v3);
3414 + v0 = vaddq_s16(v0,v1);
3415 + v1 = vaddq_s16(v2,v3);
3416 + v30 = vaddq_s16(v30, v0);
3417 + v31 = vaddq_s16(v31, v1);
3419 + int32x4_t sum0 = vpaddlq_u16(v30);
3420 + int32x4_t sum1 = vpaddlq_u16(v31);
3421 + sum0 = vaddq_s32(sum0,sum1);
3422 + return vaddvq_s32(sum0);
3425 +#endif //HIGH_BIT_DEPTH
3428 +static inline void _sa8d_8x8_neon_end(int16x8_t& v0,int16x8_t& v1,int16x8_t v2,int16x8_t v3,
3429 + int16x8_t v20,int16x8_t v21,int16x8_t v22,int16x8_t v23)
3431 + int16x8_t v16,v17,v18,v19;
3432 + int16x8_t v4,v5,v6,v7;
3434 + SUMSUB_AB (v16, v18, v0, v2);
3435 + SUMSUB_AB (v17, v19, v1, v3);
3437 + HADAMARD4_V (v20, v21, v22, v23, v0, v1, v2, v3);
3439 + SUMSUB_AB (v0, v16, v16, v20);
3440 + SUMSUB_AB (v1, v17, v17, v21);
3441 + SUMSUB_AB (v2, v18, v18, v22);
3442 + SUMSUB_AB (v3, v19, v19, v23);
3444 + transpose_8h (v20, v21, v16, v17);
3445 + transpose_8h (v4, v5, v0, v1);
3446 + transpose_8h (v22, v23, v18, v19);
3447 + transpose_8h (v6, v7, v2, v3);
3449 +#if (X265_DEPTH <= 10)
3451 + int16x8_t v24,v25;
3453 + SUMSUB_AB (v2, v3, v20, v21);
3454 + SUMSUB_AB (v24, v25, v4, v5);
3455 + SUMSUB_AB (v0, v1, v22, v23);
3456 + SUMSUB_AB (v4, v5, v6, v7);
3458 + transpose_4s (v20, v22, v2, v0);
3459 + transpose_4s (v21, v23, v3, v1);
3460 + transpose_4s (v16, v18, v24, v4);
3461 + transpose_4s (v17, v19, v25, v5);
3463 + SUMSUB_AB (v0, v2, v20, v22);
3464 + SUMSUB_AB (v1, v3, v21, v23);
3465 + SUMSUB_AB (v4, v6, v16, v18);
3466 + SUMSUB_AB (v5, v7, v17, v19);
3468 + transpose_2d (v16, v20, v0, v4);
3469 + transpose_2d (v17, v21, v1, v5);
3470 + transpose_2d (v18, v22, v2, v6);
3471 + transpose_2d (v19, v23, v3, v7);
3474 + v16 = vabsq_s16(v16);
3475 + v17 = vabsq_s16(v17);
3476 + v18 = vabsq_s16(v18);
3477 + v19 = vabsq_s16(v19);
3478 + v20 = vabsq_s16(v20);
3479 + v21 = vabsq_s16(v21);
3480 + v22 = vabsq_s16(v22);
3481 + v23 = vabsq_s16(v23);
3483 + v16 = vmaxq_u16(v16,v20);
3484 + v17 = vmaxq_u16(v17,v21);
3485 + v18 = vmaxq_u16(v18,v22);
3486 + v19 = vmaxq_u16(v19,v23);
3489 + v0 = vpaddlq_u16(v16);
3490 + v1 = vpaddlq_u16(v17);
3491 + v0 = vpadalq_u16(v0,v18);
3492 + v1 = vpadalq_u16(v1,v19);
3494 +#else //HIGH_BIT_DEPTH
3496 + v0 = vaddq_u16(v16,v17);
3497 + v1 = vaddq_u16(v18,v19);
3499 +#endif //HIGH_BIT_DEPTH
3501 +#else // HIGH_BIT_DEPTH 12-bit only: switch math to int32; each int16x8 is up-converted to 2 int32x4 (low and high)
3503 + int32x4_t v2l,v2h,v3l,v3h,v24l,v24h,v25l,v25h,v0l,v0h,v1l,v1h;
3504 + int32x4_t v22l,v22h,v23l,v23h;
3505 + int32x4_t v4l,v4h,v5l,v5h;
3506 + int32x4_t v6l,v6h,v7l,v7h;
3507 + int32x4_t v16l,v16h,v17l,v17h;
3508 + int32x4_t v18l,v18h,v19l,v19h;
3509 + int32x4_t v20l,v20h,v21l,v21h;
3511 + ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
3512 + ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
3514 + v22l = vmovl_s16(vget_low_s16(v22));
3515 + v22h = vmovl_high_s16(v22);
3516 + v23l = vmovl_s16(vget_low_s16(v23));
3517 + v23h = vmovl_high_s16(v23);
3519 + ISUMSUB_AB(v0l, v1l, v22l, v23l);
3520 + ISUMSUB_AB(v0h, v1h, v22h, v23h);
3522 + v6l = vmovl_s16(vget_low_s16(v6));
3523 + v6h = vmovl_high_s16(v6);
3524 + v7l = vmovl_s16(vget_low_s16(v7));
3525 + v7h = vmovl_high_s16(v7);
3527 + ISUMSUB_AB (v4l, v5l, v6l, v7l);
3528 + ISUMSUB_AB (v4h, v5h, v6h, v7h);
3530 + transpose_2d (v20l, v22l, v2l, v0l);
3531 + transpose_2d (v21l, v23l, v3l, v1l);
3532 + transpose_2d (v16l, v18l, v24l, v4l);
3533 + transpose_2d (v17l, v19l, v25l, v5l);
3535 + transpose_2d (v20h, v22h, v2h, v0h);
3536 + transpose_2d (v21h, v23h, v3h, v1h);
3537 + transpose_2d (v16h, v18h, v24h, v4h);
3538 + transpose_2d (v17h, v19h, v25h, v5h);
3540 + ISUMSUB_AB (v0l, v2l, v20l, v22l);
3541 + ISUMSUB_AB (v1l, v3l, v21l, v23l);
3542 + ISUMSUB_AB (v4l, v6l, v16l, v18l);
3543 + ISUMSUB_AB (v5l, v7l, v17l, v19l);
3545 + ISUMSUB_AB (v0h, v2h, v20h, v22h);
3546 + ISUMSUB_AB (v1h, v3h, v21h, v23h);
3547 + ISUMSUB_AB (v4h, v6h, v16h, v18h);
3548 + ISUMSUB_AB (v5h, v7h, v17h, v19h);
3570 + v16l = vabsq_s32(v16l);
3571 + v17l = vabsq_s32(v17l);
3572 + v18l = vabsq_s32(v18l);
3573 + v19l = vabsq_s32(v19l);
3574 + v20l = vabsq_s32(v20l);
3575 + v21l = vabsq_s32(v21l);
3576 + v22l = vabsq_s32(v22l);
3577 + v23l = vabsq_s32(v23l);
3579 + v16h = vabsq_s32(v16h);
3580 + v17h = vabsq_s32(v17h);
3581 + v18h = vabsq_s32(v18h);
3582 + v19h = vabsq_s32(v19h);
3583 + v20h = vabsq_s32(v20h);
3584 + v21h = vabsq_s32(v21h);
3585 + v22h = vabsq_s32(v22h);
3586 + v23h = vabsq_s32(v23h);
3588 + v16l = vmaxq_u32(v16l,v20l);
3589 + v17l = vmaxq_u32(v17l,v21l);
3590 + v18l = vmaxq_u32(v18l,v22l);
3591 + v19l = vmaxq_u32(v19l,v23l);
3593 + v16h = vmaxq_u32(v16h,v20h);
3594 + v17h = vmaxq_u32(v17h,v21h);
3595 + v18h = vmaxq_u32(v18h,v22h);
3596 + v19h = vmaxq_u32(v19h,v23h);
3598 + v16l = vaddq_u32(v16l,v16h);
3599 + v17l = vaddq_u32(v17l,v17h);
3600 + v18l = vaddq_u32(v18l,v18h);
3601 + v19l = vaddq_u32(v19l,v19h);
3603 + v0 = vaddq_u32(v16l, v17l);
3604 + v1 = vaddq_u32(v18l,v19l);
3613 +static inline void _satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2,
3614 + int16x8_t& v0,int16x8_t&v1, int16x8_t&v2,int16x8_t&v3)
3617 + int16x8_t v20,v21,v22,v23;
3618 + _sub_8x8_fly(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3,v20,v21,v22,v23);
3619 + _satd_8x4v_8x8h_neon(v0,v1,v2,v3,v20,v21,v22,v23);
3625 +int pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3627 + int16x8_t v30,v31;
3628 + int16x8_t v0,v1,v2,v3;
3630 + _satd_8x8_neon(pix1,stride_pix1,pix2,stride_pix2,v0,v1,v2,v3);
3631 +#if !(HIGH_BIT_DEPTH)
3632 + v30 = vaddq_u16(v0,v1);
3633 + v31 = vaddq_u16(v2,v3);
3635 + uint16x8_t sum = vaddq_u16(v30,v31);
3636 + return vaddvq_s32(vpaddlq_u16(sum));
3639 + v30 = vaddq_u16(v0,v1);
3640 + v31 = vaddq_u16(v2,v3);
3642 + int32x4_t sum = vpaddlq_u16(v30);
3643 + sum = vpadalq_u16(sum, v31);
3644 + return vaddvq_s32(sum);
3649 +int pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3651 + int16x8_t v0,v1,v2,v3;
3652 + int16x8_t v20,v21,v22,v23;
3654 + _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3655 + _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3658 +//#if 1//HIGH_BIT_DEPTH
3659 + int32x4_t s = vaddq_u32(v0,v1);
3660 + return (vaddvq_u32(s) + 1) >> 1;
3662 + return (vaddlvq_s16(vaddq_u16(v0, v1)) + 1) >> 1;
3670 +int pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3672 + int16x8_t v0,v1,v2,v3;
3673 + int16x8_t v20,v21,v22,v23;
3674 + int32x4_t v30,v31;
3676 + _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3677 + _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3679 +#if !(HIGH_BIT_DEPTH)
3680 + v30 = vpaddlq_u16(v0);
3681 + v31 = vpaddlq_u16(v1);
3683 + v30 = vaddq_s32(v0,v1);
3686 + _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3687 + _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3689 +#if !(HIGH_BIT_DEPTH)
3690 + v30 = vpadalq_u16(v30,v0);
3691 + v31 = vpadalq_u16(v31,v1);
3693 + v31 = vaddq_s32(v0,v1);
3697 + _sub_8x8_fly(pix1 + 8*stride_pix1, stride_pix1, pix2 + 8*stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3698 + _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3700 +#if !(HIGH_BIT_DEPTH)
3701 + v30 = vpadalq_u16(v30,v0);
3702 + v31 = vpadalq_u16(v31,v1);
3704 + v30 = vaddq_s32(v30,v0);
3705 + v31 = vaddq_s32(v31,v1);
3708 + _sub_8x8_fly(pix1 + 8*stride_pix1 + 8, stride_pix1, pix2 + 8*stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
3709 + _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
3711 +#if !(HIGH_BIT_DEPTH)
3712 + v30 = vpadalq_u16(v30,v0);
3713 + v31 = vpadalq_u16(v31,v1);
3715 + v30 = vaddq_s32(v30,v0);
3716 + v31 = vaddq_s32(v31,v1);
3719 + v30 = vaddq_u32(v30,v31);
3721 + return (vaddvq_u32(v30) + 1) >> 1;
3732 +void blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
3734 + for (int y = 0; y < size; y++) {
3736 + int16x8_t v = vdupq_n_s16(val);
3737 + for (; (x + 8) <= size; x+=8) {
3738 + *(int16x8_t*)&dst[y * dstride + x] = v;
3740 + for (; x < size; x++) {
3741 + dst[y * dstride + x] = val;
3746 +template<int lx, int ly>
3747 +int sad_pp_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3752 + for (int y = 0; y < ly; y++)
3756 + uint16x8_t vsum16_1 = vdupq_n_u16(0);
3757 + for (; (x + 8) <= lx; x+=8) {
3758 + uint16x8_t p1 = *(uint16x8_t*)&pix1[x];
3759 + uint16x8_t p2 = *(uint16x8_t*)&pix2[x];
3760 + vsum16_1 = vabaq_s16(vsum16_1,p1,p2);
3764 + uint16x4_t p1 = *(uint16x4_t*)&pix1[x];
3765 + uint16x4_t p2 = *(uint16x4_t*)&pix2[x];
3766 + sum += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p2));
3770 + sum += vaddlvq_s16(vsum16_1);
3776 + uint16x8_t vsum16_1 = vdupq_n_u16(0);
3777 + uint16x8_t vsum16_2 = vdupq_n_u16(0);
3779 + for (; (x + 16) <= lx; x+=16) {
3780 + uint8x16_t p1 = *(uint8x16_t*)&pix1[x];
3781 + uint8x16_t p2 = *(uint8x16_t*)&pix2[x];
3782 + vsum16_1 = vabal_u8(vsum16_1,vget_low_u8(p1),vget_low_u8(p2));
3783 + vsum16_2 = vabal_high_u8(vsum16_2,p1,p2);
3786 + uint8x8_t p1 = *(uint8x8_t*)&pix1[x];
3787 + uint8x8_t p2 = *(uint8x8_t*)&pix2[x];
3788 + vsum16_1 = vabal_u8(vsum16_1,p1,p2);
3792 + uint32x2_t p1 = vdup_n_u32(0);
3793 + p1[0] = *(uint32_t*)&pix1[x];
3794 + uint32x2_t p2 = vdup_n_u32(0);
3795 + p2[0] = *(uint32_t*)&pix2[x];
3796 + vsum16_1 = vabal_u8(vsum16_1,p1,p2);
3800 + vsum16_1 = vaddq_u16(vsum16_1,vsum16_2);
3803 + sum += vaddvq_u16(vsum16_1);
3807 + if (lx & 3) for (; x < lx; x++) {
3808 + sum += abs(pix1[x] - pix2[x]);
3811 + pix1 += stride_pix1;
3812 + pix2 += stride_pix2;
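The SAD kernels accumulate widened absolute differences with vaba*/vabal*; a minimal standalone sketch of the core step (not part of the patch):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t a[8] = { 10, 0, 255, 3, 4, 5, 6, 7 };
        uint8_t b[8] = {  7, 2, 250, 3, 9, 1, 6, 0 };
        uint16x8_t acc = vdupq_n_u16(0);
        acc = vabal_u8(acc, vld1_u8(a), vld1_u8(b));  // acc[i] += |a[i] - b[i]|, widened to 16 bits
        printf("%u\n", (unsigned)vaddvq_u16(acc));    // prints 26
        return 0;
    }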
3818 +template<int lx, int ly>
3819 +void sad_x3_neon(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
3824 + for (int y = 0; y < ly; y++)
3827 + uint16x8_t vsum16_0 = vdupq_n_u16(0);
3828 + uint16x8_t vsum16_1 = vdupq_n_u16(0);
3829 + uint16x8_t vsum16_2 = vdupq_n_u16(0);
3831 + for (; (x + 8) <= lx; x+=8) {
3832 + uint16x8_t p1 = *(uint16x8_t*)&pix1[x];
3833 + uint16x8_t p2 = *(uint16x8_t*)&pix2[x];
3834 + uint16x8_t p3 = *(uint16x8_t*)&pix3[x];
3835 + uint16x8_t p4 = *(uint16x8_t*)&pix4[x];
3836 + vsum16_0 = vabaq_s16(vsum16_0,p1,p2);
3837 + vsum16_1 = vabaq_s16(vsum16_1,p1,p3);
3838 + vsum16_2 = vabaq_s16(vsum16_2,p1,p4);
3842 + uint16x4_t p1 = *(uint16x4_t*)&pix1[x];
3843 + uint16x4_t p2 = *(uint16x4_t*)&pix2[x];
3844 + uint16x4_t p3 = *(uint16x4_t*)&pix3[x];
3845 + uint16x4_t p4 = *(uint16x4_t*)&pix4[x];
3846 + res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p2));
3847 + res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p3));
3848 + res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p4));
3852 + res[0] += vaddlvq_s16(vsum16_0);
3853 + res[1] += vaddlvq_s16(vsum16_1);
3854 + res[2] += vaddlvq_s16(vsum16_2);
3858 + for (; (x + 16) <= lx; x+=16) {
3859 + uint8x16_t p1 = *(uint8x16_t*)&pix1[x];
3860 + uint8x16_t p2 = *(uint8x16_t*)&pix2[x];
3861 + uint8x16_t p3 = *(uint8x16_t*)&pix3[x];
3862 + uint8x16_t p4 = *(uint8x16_t*)&pix4[x];
3863 + vsum16_0 = vabal_u8(vsum16_0,vget_low_u8(p1),vget_low_u8(p2));
3864 + vsum16_0 = vabal_high_u8(vsum16_0,p1,p2);
3865 + vsum16_1 = vabal_u8(vsum16_1,vget_low_u8(p1),vget_low_u8(p3));
3866 + vsum16_1 = vabal_high_u8(vsum16_1,p1,p3);
3867 + vsum16_2 = vabal_u8(vsum16_2,vget_low_u8(p1),vget_low_u8(p4));
3868 + vsum16_2 = vabal_high_u8(vsum16_2,p1,p4);
3871 + uint8x8_t p1 = *(uint8x8_t*)&pix1[x];
3872 + uint8x8_t p2 = *(uint8x8_t*)&pix2[x];
3873 + uint8x8_t p3 = *(uint8x8_t*)&pix3[x];
3874 + uint8x8_t p4 = *(uint8x8_t*)&pix4[x];
3875 + vsum16_0 = vabal_u8(vsum16_0,p1,p2);
3876 + vsum16_1 = vabal_u8(vsum16_1,p1,p3);
3877 + vsum16_2 = vabal_u8(vsum16_2,p1,p4);
3881 + uint32x2_t p1 = vdup_n_u32(0);
3882 + p1[0] = *(uint32_t*)&pix1[x];
3883 + uint32x2_t p2 = vdup_n_u32(0);
3884 + p2[0] = *(uint32_t*)&pix2[x];
3885 + uint32x2_t p3 = vdup_n_u32(0);
3886 + p3[0] = *(uint32_t*)&pix3[x];
3887 + uint32x2_t p4 = vdup_n_u32(0);
3888 + p4[0] = *(uint32_t*)&pix4[x];
3889 + vsum16_0 = vabal_u8(vsum16_0,p1,p2);
3890 + vsum16_1 = vabal_u8(vsum16_1,p1,p3);
3891 + vsum16_2 = vabal_u8(vsum16_2,p1,p4);
3895 + res[0] += vaddvq_u16(vsum16_0);
3896 + res[1] += vaddvq_u16(vsum16_1);
3897 + res[2] += vaddvq_u16(vsum16_2);
3901 + if (lx & 3) for (; x < lx; x++)
3903 + res[0] += abs(pix1[x] - pix2[x]);
3904 + res[1] += abs(pix1[x] - pix3[x]);
3905 + res[2] += abs(pix1[x] - pix4[x]);
3908 + pix1 += FENC_STRIDE;
3909 + pix2 += frefstride;
3910 + pix3 += frefstride;
3911 + pix4 += frefstride;
3915 +template<int lx, int ly>
3916 +void sad_x4_neon(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
3922 + for (int y = 0; y < ly; y++)
3925 + uint16x8_t vsum16_0 = vdupq_n_u16(0);
3926 + uint16x8_t vsum16_1 = vdupq_n_u16(0);
3927 + uint16x8_t vsum16_2 = vdupq_n_u16(0);
3928 + uint16x8_t vsum16_3 = vdupq_n_u16(0);
3930 + for (; (x + 8) <= lx; x+=8) {
3931 + uint16x8_t p1 = *(uint16x8_t*)&pix1[x];
3932 + uint16x8_t p2 = *(uint16x8_t*)&pix2[x];
3933 + uint16x8_t p3 = *(uint16x8_t*)&pix3[x];
3934 + uint16x8_t p4 = *(uint16x8_t*)&pix4[x];
3935 + uint16x8_t p5 = *(uint16x8_t*)&pix5[x];
3936 + vsum16_0 = vabaq_s16(vsum16_0,p1,p2);
3937 + vsum16_1 = vabaq_s16(vsum16_1,p1,p3);
3938 + vsum16_2 = vabaq_s16(vsum16_2,p1,p4);
3939 + vsum16_3 = vabaq_s16(vsum16_3,p1,p5);
3943 + uint16x4_t p1 = *(uint16x4_t*)&pix1[x];
3944 + uint16x4_t p2 = *(uint16x4_t*)&pix2[x];
3945 + uint16x4_t p3 = *(uint16x4_t*)&pix3[x];
3946 + uint16x4_t p4 = *(uint16x4_t*)&pix4[x];
3947 + uint16x4_t p5 = *(uint16x4_t*)&pix5[x];
3948 + res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p2));
3949 + res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p3));
3950 + res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p4));
3951 + res[3] += vaddlv_s16(vaba_s16(vdup_n_s16(0),p1,p5));
3955 + res[0] += vaddlvq_s16(vsum16_0);
3956 + res[1] += vaddlvq_s16(vsum16_1);
3957 + res[2] += vaddlvq_s16(vsum16_2);
3958 + res[3] += vaddlvq_s16(vsum16_3);
3963 + for (; (x + 16) <= lx; x+=16) {
3964 + uint8x16_t p1 = *(uint8x16_t*)&pix1[x];
3965 + uint8x16_t p2 = *(uint8x16_t*)&pix2[x];
3966 + uint8x16_t p3 = *(uint8x16_t*)&pix3[x];
3967 + uint8x16_t p4 = *(uint8x16_t*)&pix4[x];
3968 + uint8x16_t p5 = *(uint8x16_t*)&pix5[x];
3969 + vsum16_0 = vabal_u8(vsum16_0,vget_low_u8(p1),vget_low_u8(p2));
3970 + vsum16_0 = vabal_high_u8(vsum16_0,p1,p2);
3971 + vsum16_1 = vabal_u8(vsum16_1,vget_low_u8(p1),vget_low_u8(p3));
3972 + vsum16_1 = vabal_high_u8(vsum16_1,p1,p3);
3973 + vsum16_2 = vabal_u8(vsum16_2,vget_low_u8(p1),vget_low_u8(p4));
3974 + vsum16_2 = vabal_high_u8(vsum16_2,p1,p4);
3975 + vsum16_3 = vabal_u8(vsum16_3,vget_low_u8(p1),vget_low_u8(p5));
3976 + vsum16_3 = vabal_high_u8(vsum16_3,p1,p5);
3979 + uint8x8_t p1 = *(uint8x8_t*)&pix1[x];
3980 + uint8x8_t p2 = *(uint8x8_t*)&pix2[x];
3981 + uint8x8_t p3 = *(uint8x8_t*)&pix3[x];
3982 + uint8x8_t p4 = *(uint8x8_t*)&pix4[x];
3983 + uint8x8_t p5 = *(uint8x8_t*)&pix5[x];
3984 + vsum16_0 = vabal_u8(vsum16_0,p1,p2);
3985 + vsum16_1 = vabal_u8(vsum16_1,p1,p3);
3986 + vsum16_2 = vabal_u8(vsum16_2,p1,p4);
3987 + vsum16_3 = vabal_u8(vsum16_3,p1,p5);
3991 + uint32x2_t p1 = vdup_n_u32(0);
3992 + p1[0] = *(uint32_t*)&pix1[x];
3993 + uint32x2_t p2 = vdup_n_u32(0);
3994 + p2[0] = *(uint32_t*)&pix2[x];
3995 + uint32x2_t p3 = vdup_n_u32(0);
3996 + p3[0] = *(uint32_t*)&pix3[x];
3997 + uint32x2_t p4 = vdup_n_u32(0);
3998 + p4[0] = *(uint32_t*)&pix4[x];
3999 + uint32x2_t p5 = vdup_n_u32(0);
4000 + p5[0] = *(uint32_t*)&pix5[x];
4001 + vsum16_0 = vabal_u8(vsum16_0,p1,p2);
4002 + vsum16_1 = vabal_u8(vsum16_1,p1,p3);
4003 + vsum16_2 = vabal_u8(vsum16_2,p1,p4);
4004 + vsum16_3 = vabal_u8(vsum16_3,p1,p5);
4008 + res[0] += vaddvq_u16(vsum16_0);
4009 + res[1] += vaddvq_u16(vsum16_1);
4010 + res[2] += vaddvq_u16(vsum16_2);
4011 + res[3] += vaddvq_u16(vsum16_3);
4015 + if (lx & 3) for (; x < lx; x++)
4017 + res[0] += abs(pix1[x] - pix2[x]);
4018 + res[1] += abs(pix1[x] - pix3[x]);
4019 + res[2] += abs(pix1[x] - pix4[x]);
4020 + res[3] += abs(pix1[x] - pix5[x]);
4023 + pix1 += FENC_STRIDE;
4024 + pix2 += frefstride;
4025 + pix3 += frefstride;
4026 + pix4 += frefstride;
4027 + pix5 += frefstride;
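For reference, a scalar sketch of what sad_x4 computes: four SADs of the same fenc block against four candidate references in one pass. The helper below is mine, not part of the patch; it takes the fenc stride as a parameter where the primitive above uses the fixed FENC_STRIDE.

    #include <cstdint>
    #include <cstdlib>

    // Scalar reference for the sad_x4 semantics: res[i] is the SAD of pix1
    // against the (i+2)-th pixel argument; 8-bit pixels assumed.
    template<int lx, int ly>
    void sad_x4_ref(const uint8_t* pix1, const uint8_t* pix2, const uint8_t* pix3,
                    const uint8_t* pix4, const uint8_t* pix5,
                    intptr_t fencstride, intptr_t frefstride, int32_t* res)
    {
        res[0] = res[1] = res[2] = res[3] = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                res[0] += abs(pix1[x] - pix2[x]);
                res[1] += abs(pix1[x] - pix3[x]);
                res[2] += abs(pix1[x] - pix4[x]);
                res[3] += abs(pix1[x] - pix5[x]);
            }
            pix1 += fencstride;
            pix2 += frefstride;
            pix3 += frefstride;
            pix4 += frefstride;
            pix5 += frefstride;
        }
    }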
4032 +template<int lx, int ly, class T1, class T2>
4033 +sse_t sse_neon(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
4037 + int32x4_t vsum1 = vdupq_n_s32(0);
4038 + int32x4_t vsum2 = vdupq_n_s32(0);
4039 + for (int y = 0; y < ly; y++)
4042 + for (; (x+8) <= lx; x+=8)
4045 + if (sizeof(T1) == 2 && sizeof(T2) == 2) {
4046 + tmp = vsubq_s16(*(int16x8_t *)&pix1[x],*(int16x8_t *)&pix2[x]);
4047 + } else if (sizeof(T1) == 1 && sizeof(T2) == 1){
4048 + tmp = vsubl_u8(*(uint8x8_t *)&pix1[x],*(uint8x8_t *)&pix2[x]);
4051 + X265_CHECK(false,"unsupported sse");
4053 + vsum1 = vmlal_s16(vsum1,vget_low_s16(tmp),vget_low_s16(tmp));
4054 + vsum2 = vmlal_high_s16(vsum2,tmp,tmp);
4056 + for (; x < lx; x++)
4058 + int tmp = pix1[x] - pix2[x];
4059 + sum += (tmp * tmp);
4062 + if (sizeof(T1) == 2 && sizeof(T2) == 2)
4064 + int32x4_t vsum = vaddq_u32(vsum1,vsum2);
4065 + sum += vaddvq_u32(vsum);
4066 + vsum1 = vsum2 = vdupq_n_u16(0);
4069 + pix1 += stride_pix1;
4070 + pix2 += stride_pix2;
4072 + int32x4_t vsum = vaddq_u32(vsum1,vsum2);
4074 + return sum + vaddvq_u32(vsum);
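The 16-bit branch above folds vsum1/vsum2 into the scalar sum once per row, presumably so the 32-bit lane accumulators cannot overflow at higher bit depths, while the 8-bit branch defers the reduction to the end. A scalar sketch of the value both paths compute (my reference, with uint64_t standing in for sse_t):

    #include <cstdint>

    // Scalar reference for sse_neon: sum of squared differences over an
    // lx-by-ly block with independent strides.
    template<int lx, int ly, class T1, class T2>
    uint64_t sse_ref(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
    {
        uint64_t sum = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                int d = (int)pix1[x] - (int)pix2[x];
                sum += (uint64_t)(d * d);
            }
            pix1 += stride_pix1;
            pix2 += stride_pix2;
        }
        return sum;
    }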
4078 +template<int bx, int by>
4079 +void blockcopy_ps_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
4081 + for (int y = 0; y < by; y++)
4084 + for (; (x + 8) <= bx; x+=8)
4087 + *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
4089 + *(int16x8_t *)&a[x] = vmovl_u8(*(uint8x8_t *)&b[x]);
4092 + for (; x < bx; x++) {
4093 + a[x] = (int16_t)b[x];
4102 +template<int bx, int by>
4103 +void blockcopy_pp_neon(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
4105 + for (int y = 0; y < by; y++)
4109 + for (; (x + 8) <= bx; x+=8)
4111 + *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
4115 + *(uint64_t *)&a[x] = *(uint64_t *)&b[x];
4119 + for (; (x + 16) <= bx; x+=16)
4121 + *(uint8x16_t *)&a[x] = *(uint8x16_t *)&b[x];
4125 + *(uint8x8_t *)&a[x] = *(uint8x8_t *)&b[x];
4130 + *(uint32_t *)&a[x] = *(uint32_t *)&b[x];
4134 + for (; x < bx; x++) {
4144 +template<int bx, int by>
4145 +void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
4147 + for (int y = 0; y < by; y++)
4150 + for (; (x + 8) <= bx; x+=8) {
4152 + *(int16x8_t *)&a[x] = vsubq_s16(*(int16x8_t *)&b0[x], *(int16x8_t *)&b1[x]);
4154 + *(int16x8_t *)&a[x] = vsubl_u8(*(uint8x8_t *)&b0[x], *(uint8x8_t *)&b1[x]);
4157 + for (; x < bx; x++)
4158 + a[x] = (int16_t)(b0[x] - b1[x]);
4166 +template<int bx, int by>
4167 +void pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
4169 + for (int y = 0; y < by; y++)
4172 + for (; (x + 8) <= bx; x+=8) {
4174 + int16x8_t b1e = *(int16x8_t *)&b1[x];
4177 + b0e = *(int16x8_t *)&b0[x];
4178 + t = vaddq_s16(b0e,b1e);
4179 + t = vminq_s16(t,vdupq_n_s16((1 << X265_DEPTH) - 1));
4180 + t = vmaxq_s16(t,vdupq_n_s16(0));
4181 + *(int16x8_t *)&a[x] = t;
4183 + b0e = vmovl_u8(*(uint8x8_t *)&b0[x]);
4184 + t = vaddq_s16(b0e,b1e);
4185 + *(uint8x8_t *)&a[x] = vqmovun_s16(t);
4188 + for (; x < bx; x++)
4189 + a[x] = (int16_t)x265_clip(b0[x] + b1[x]);
4197 +template<int bx, int by>
4198 +void addAvg_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
4201 + const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
4202 + const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
4204 + const int32x4_t addon = vdupq_n_s32(offset);
4205 + for (int y = 0; y < by; y++)
4209 + for (; (x + 8) <= bx; x += 8)
4211 + int16x8_t in0 = *(int16x8_t*)&src0[x];
4212 + int16x8_t in1 = *(int16x8_t*)&src1[x];
4213 + int32x4_t t1 = vaddl_s16(vget_low_s16(in0),vget_low_s16(in1));
4214 + int32x4_t t2 = vaddl_high_s16(in0,in1);
4215 + t1 = vaddq_s32(t1,addon);
4216 + t2 = vaddq_s32(t2,addon);
4217 + t1 = vshrq_n_s32(t1,shiftNum);
4218 + t2 = vshrq_n_s32(t2,shiftNum);
4219 + int16x8_t t = vuzp1q_s16(t1,t2);
4221 + t = vminq_s16(t,vdupq_n_s16((1 << X265_DEPTH) - 1));
4222 + t = vmaxq_s16(t,vdupq_n_s16(0));
4223 + *(int16x8_t *)&dst[x] = t;
4225 + *(uint8x8_t *)&dst[x] = vqmovun_s16(t);
4228 + for (; x < bx; x += 2)
4230 + dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
4231 + dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
4234 + src0 += src0Stride;
4235 + src1 += src1Stride;
4240 +template<int lx, int ly>
4241 +void pixelavg_pp_neon(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
4243 + for (int y = 0; y < ly; y++)
4246 + for (; (x+8) <= lx; x+=8) {
4248 + int16x8_t in0 = *(int16x8_t *)&src0[x];
4249 + int16x8_t in1 = *(int16x8_t *)&src1[x];
4250 + int16x8_t t = vaddq_s16(in0,in1);
4251 + t = vaddq_s16(t,vdupq_n_s16(1));
4252 + t = vshrq_n_s16(t,1);
4253 + *(int16x8_t *)&dst[x] = t;
4255 + int16x8_t in0 = vmovl_u8(*(uint8x8_t *)&src0[x]);
4256 + int16x8_t in1 = vmovl_u8(*(uint8x8_t *)&src1[x]);
4257 + int16x8_t t = vaddq_s16(in0,in1);
4258 + t = vaddq_s16(t,vdupq_n_s16(1));
4259 + t = vshrq_n_s16(t,1);
4260 + *(uint8x8_t *)&dst[x] = vmovn_u16(t);
4263 + for (; x < lx; x++)
4264 + dst[x] = (src0[x] + src1[x] + 1) >> 1;
4274 +void cpy1Dto2D_shl_neon(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4276 + X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
4277 + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
4278 + X265_CHECK(shift >= 0, "invalid shift\n");
4280 + for (int i = 0; i < size; i++)
4283 + for (; (j+8) <= size; j+=8)
4285 + *(int16x8_t *)&dst[j] = vshlq_s16(*(int16x8_t*)&src[j],vdupq_n_s16(shift));
4287 + for (; j < size; j++)
4289 + dst[j] = src[j] << shift;
4298 +uint64_t pixel_var_neon(const uint8_t* pix, intptr_t i_stride)
4300 + uint32_t sum = 0, sqr = 0;
4302 + int32x4_t vsqr = vdupq_n_s32(0);
4303 + for (int y = 0; y < size; y++)
4306 + int16x8_t vsum = vdupq_n_s16(0);
4307 + for (; (x + 8) <= size; x+=8)
4310 + in = vmovl_u8(*(uint8x8_t*)&pix[x]);
4311 + vsum = vaddq_u16(vsum,in);
4312 + vsqr = vmlal_s16(vsqr,vget_low_s16(in),vget_low_s16(in));
4313 + vsqr = vmlal_high_s16(vsqr,in,in);
4315 + for (; x < size; x++)
4318 + sqr += pix[x] * pix[x];
4320 + sum += vaddvq_s16(vsum);
4324 + sqr += vaddvq_u32(vsqr);
4325 + return sum + ((uint64_t)sqr << 32);
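pixel_var_neon packs the pixel sum into the low 32 bits of its return value and the sum of squares into the high 32 bits, which appears to follow the same packing as the C primitive it replaces. A sketch of how a caller could turn that into a variance; the helper name is mine:

    #include <cstdint>

    // Unpack the (sum, sum-of-squares) pair returned by pixel_var_neon<size>
    // and compute the block variance E[X^2] - E[X]^2 over size*size pixels.
    template<int size>
    uint32_t var_from_packed(uint64_t packed)
    {
        uint32_t sum = (uint32_t)packed;
        uint32_t ssq = (uint32_t)(packed >> 32);
        return ssq - (uint32_t)(((uint64_t)sum * sum) / (size * size));
    }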
4328 +template<int blockSize>
4329 +void getResidual_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
4331 + for (int y = 0; y < blockSize; y++)
4334 + for (; (x + 8) <= blockSize; x+=8) {
4335 + int16x8_t vfenc,vpred;
4337 + vfenc = *(int16x8_t *)&fenc[x];
4338 + vpred = *(int16x8_t *)&pred[x];
4340 + vfenc = vmovl_u8(*(uint8x8_t *)&fenc[x]);
4341 + vpred = vmovl_u8(*(uint8x8_t *)&pred[x]);
4343 + *(int16x8_t*)&residual[x] = vsubq_s16(vfenc,vpred);
4345 + for (; x < blockSize; x++) {
4346 + residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
4349 + residual += stride;
4354 +#if 1 // !(HIGH_BIT_DEPTH)
4356 +int psyCost_pp_neon(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
4358 + static pixel zeroBuf[8] /* = { 0 } */;
4362 + int dim = 1 << (size + 2);
4363 + uint32_t totEnergy = 0;
4364 + for (int i = 0; i < dim; i += 8)
4366 + for (int j = 0; j < dim; j+= 8)
4368 + /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
4369 + int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
4370 + (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
4371 + int reconEnergy = pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
4372 + (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
4374 + totEnergy += abs(sourceEnergy - reconEnergy);
4381 + /* 4x4 is too small for sa8d */
4382 + int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf, 0) >> 2);
4383 + int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
4384 + return abs(sourceEnergy - reconEnergy);
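Both branches measure the same quantity: a block's AC energy is its sa8d (satd for 4x4) against a zero block, i.e. AC plus DC energy, minus a quarter of its SAD against zero standing in for the DC term, and the psy cost is the absolute difference of that energy between source and reconstruction. A scalar sketch of the per-block arithmetic, with generic inputs rather than the patch's primitives:

    #include <cstdlib>

    // 'hadamard' is the block's sa8d/satd against zero, 'sad' its plain SAD
    // against zero; the result approximates the AC component of the energy.
    static inline int psyEnergy(int hadamard, int sad)
    {
        return hadamard - (sad >> 2);
    }

    static inline int psyCost(int srcHadamard, int srcSad, int recHadamard, int recSad)
    {
        return abs(psyEnergy(srcHadamard, srcSad) - psyEnergy(recHadamard, recSad));
    }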
4389 +template<int w, int h>
4390 +// Calculate sa8d in blocks of 8x8
4391 +int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
4395 + for (int y = 0; y < h; y += 8)
4396 + for (int x = 0; x < w; x += 8)
4397 + cost += pixel_sa8d_8x8_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
4402 +template<int w, int h>
4403 +// Calculate sa8d in blocks of 16x16
4404 +int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
4408 + for (int y = 0; y < h; y += 16)
4409 + for (int x = 0; x < w; x += 16)
4410 + cost += pixel_sa8d_16x16_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
4417 +void cpy2Dto1D_shl_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
4419 + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
4420 + X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
4421 + X265_CHECK(shift >= 0, "invalid shift\n");
4423 + for (int i = 0; i < size; i++)
4425 + for (int j = 0; j < size; j++)
4426 + dst[j] = src[j] << shift;
4434 +#if 1 // !(HIGH_BIT_DEPTH)
4435 +template<int w, int h>
4436 +// calculate satd in blocks of 4x4
4437 +int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
4441 + for (int row = 0; row < h; row += 4)
4442 + for (int col = 0; col < w; col += 4)
4443 + satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4444 + pix2 + row * stride_pix2 + col, stride_pix2);
4449 +template<int w, int h>
4450 +// calculate satd in blocks of 8x4
4451 +int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
4455 + if (((w | h) & 15) == 0)
4457 + for (int row = 0; row < h; row += 16)
4458 + for (int col = 0; col < w; col += 16)
4459 + satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4460 + pix2 + row * stride_pix2 + col, stride_pix2);
4464 + if (((w | h) & 7) == 0)
4466 + for (int row = 0; row < h; row += 8)
4467 + for (int col = 0; col < w; col += 8)
4468 + satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4469 + pix2 + row * stride_pix2 + col, stride_pix2);
4474 + for (int row = 0; row < h; row += 4)
4475 + for (int col = 0; col < w; col += 8)
4476 + satd += pixel_satd_8x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
4477 + pix2 + row * stride_pix2 + col, stride_pix2);
4485 +template<int blockSize>
4486 +void transpose_neon(pixel* dst, const pixel* src, intptr_t stride)
4488 + for (int k = 0; k < blockSize; k++)
4489 + for (int l = 0; l < blockSize; l++)
4490 + dst[k * blockSize + l] = src[l * stride + k];
4495 +void transpose_neon<8>(pixel* dst, const pixel* src, intptr_t stride)
4497 + transpose8x8(dst,src,8,stride);
4501 +void transpose_neon<16>(pixel* dst, const pixel* src, intptr_t stride)
4503 + transpose16x16(dst,src,16,stride);
4507 +void transpose_neon<32>(pixel* dst, const pixel* src, intptr_t stride)
4509 + transpose32x32(dst,src,32,stride);
4514 +void transpose_neon<64>(pixel* dst, const pixel* src, intptr_t stride)
4516 + transpose32x32(dst,src,64,stride);
4517 + transpose32x32(dst+32*64+32,src+32*stride+32,64,stride);
4518 + transpose32x32(dst+32*64,src+32,64,stride);
4519 + transpose32x32(dst+32,src+32*stride,64,stride);
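The 64x64 specialization composes the transpose from four 32x32 tile transposes: the diagonal tiles stay in their own quadrant and the off-diagonal tiles swap quadrants. A scalar sketch of that tiling for even N, assuming 8-bit pixels and plain loops instead of transpose32x32:

    #include <cstdint>

    // Transpose an N x N block by transposing four N/2 x N/2 tiles; destination
    // quadrant (qr, qc) receives the transpose of source quadrant (qc, qr).
    template<int N>
    void transpose_tiled_ref(unsigned char* dst, const unsigned char* src, intptr_t stride)
    {
        const int H = N / 2;
        for (int qr = 0; qr < 2; qr++)
            for (int qc = 0; qc < 2; qc++)
                for (int k = 0; k < H; k++)
                    for (int l = 0; l < H; l++)
                        dst[(qr * H + k) * N + (qc * H + l)] =
                            src[(qc * H + l) * stride + (qr * H + k)];
    }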
4524 +sse_t pixel_ssd_s_neon(const int16_t* a, intptr_t dstride)
4529 + int32x4_t vsum = vdupq_n_s32(0);
4531 + for (int y = 0; y < size; y++)
4535 + for (; (x + 8) <= size; x+=8) {
4536 + int16x8_t in = *(int16x8_t*)&a[x];
4537 + vsum = vmlal_s16(vsum,vget_low_s16(in),vget_low_s16(in));
4538 + vsum = vmlal_high_s16(vsum,(in),(in));
4540 + for (; x < size; x++) {
4541 + sum += a[x] * a[x];
4546 + return sum + vaddvq_s32(vsum);
4555 +namespace X265_NS {
4558 +void setupPixelPrimitives_neon(EncoderPrimitives &p)
4560 + #define LUMA_PU(W, H) \
4561 + p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4562 + p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
4563 + p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
4564 + p.pu[LUMA_ ## W ## x ## H].sad = sad_pp_neon<W, H>; \
4565 + p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_neon<W, H>; \
4566 + p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
4567 + p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
4568 + p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
4570 +#if !(HIGH_BIT_DEPTH)
4572 +#define LUMA_CU(W, H) \
4573 + p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
4574 + p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
4575 + p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
4576 + p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
4577 + p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4578 + p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>; \
4579 + p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_neon<W>; \
4580 + p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
4581 + p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
4582 + p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
4583 + p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
4584 + p.cu[BLOCK_ ## W ## x ## H].transpose = transpose_neon<W>; \
4585 + p.cu[BLOCK_ ## W ## x ## H].var = pixel_var_neon<W>; \
4586 + p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED] = getResidual_neon<W>; \
4587 + p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED] = getResidual_neon<W>; \
4591 + #define LUMA_CU(W, H) \
4592 + p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
4593 + p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
4594 + p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
4595 + p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4596 + p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
4598 + p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>; \
4599 + p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_neon<W>; \
4600 + p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
4601 + p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
4602 + p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
4603 + p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
4604 + p.cu[BLOCK_ ## W ## x ## H].transpose = transpose_neon<W>; \
4605 + /*p.cu[BLOCK_ ## W ## x ## H].var = pixel_var_neon<W>;*/ \
4606 + p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED] = getResidual_neon<W>; \
4607 + p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED] = getResidual_neon<W>; \
4640 + p.pu[LUMA_4x4].satd = pixel_satd_4x4_neon;
4641 + p.pu[LUMA_8x8].satd = satd8<8, 8>;
4642 + p.pu[LUMA_8x4].satd = pixel_satd_8x4_neon;
4643 + p.pu[LUMA_4x8].satd = satd4<4, 8>;
4644 + p.pu[LUMA_16x16].satd = satd8<16, 16>;
4645 + p.pu[LUMA_16x8].satd = satd8<16, 8>;
4646 + p.pu[LUMA_8x16].satd = satd8<8, 16>;
4647 + p.pu[LUMA_16x12].satd = satd8<16, 12>;
4648 + p.pu[LUMA_12x16].satd = satd4<12, 16>;
4649 + p.pu[LUMA_16x4].satd = satd8<16, 4>;
4650 + p.pu[LUMA_4x16].satd = satd4<4, 16>;
4651 + p.pu[LUMA_32x32].satd = satd8<32, 32>;
4652 + p.pu[LUMA_32x16].satd = satd8<32, 16>;
4653 + p.pu[LUMA_16x32].satd = satd8<16, 32>;
4654 + p.pu[LUMA_32x24].satd = satd8<32, 24>;
4655 + p.pu[LUMA_24x32].satd = satd8<24, 32>;
4656 + p.pu[LUMA_32x8].satd = satd8<32, 8>;
4657 + p.pu[LUMA_8x32].satd = satd8<8, 32>;
4658 + p.pu[LUMA_64x64].satd = satd8<64, 64>;
4659 + p.pu[LUMA_64x32].satd = satd8<64, 32>;
4660 + p.pu[LUMA_32x64].satd = satd8<32, 64>;
4661 + p.pu[LUMA_64x48].satd = satd8<64, 48>;
4662 + p.pu[LUMA_48x64].satd = satd8<48, 64>;
4663 + p.pu[LUMA_64x16].satd = satd8<64, 16>;
4664 + p.pu[LUMA_16x64].satd = satd8<16, 64>;
4674 + p.cu[BLOCK_4x4].sa8d = pixel_satd_4x4_neon;
4675 + p.cu[BLOCK_8x8].sa8d = pixel_sa8d_8x8_neon;
4676 + p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
4677 + p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
4678 + p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
4681 + #define CHROMA_PU_420(W, H) \
4682 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
4683 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
4684 + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4687 + CHROMA_PU_420(4, 4);
4688 + CHROMA_PU_420(8, 8);
4689 + CHROMA_PU_420(16, 16);
4690 + CHROMA_PU_420(32, 32);
4691 + CHROMA_PU_420(4, 2);
4692 + CHROMA_PU_420(8, 4);
4693 + CHROMA_PU_420(4, 8);
4694 + CHROMA_PU_420(8, 6);
4695 + CHROMA_PU_420(6, 8);
4696 + CHROMA_PU_420(8, 2);
4697 + CHROMA_PU_420(2, 8);
4698 + CHROMA_PU_420(16, 8);
4699 + CHROMA_PU_420(8, 16);
4700 + CHROMA_PU_420(16, 12);
4701 + CHROMA_PU_420(12, 16);
4702 + CHROMA_PU_420(16, 4);
4703 + CHROMA_PU_420(4, 16);
4704 + CHROMA_PU_420(32, 16);
4705 + CHROMA_PU_420(16, 32);
4706 + CHROMA_PU_420(32, 24);
4707 + CHROMA_PU_420(24, 32);
4708 + CHROMA_PU_420(32, 8);
4709 + CHROMA_PU_420(8, 32);
4713 + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd = NULL;
4714 + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = pixel_satd_4x4_neon;
4715 + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8<8, 8>;
4716 + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8<16, 16>;
4717 + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8<32, 32>;
4719 + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
4720 + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd = NULL;
4721 + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = pixel_satd_8x4_neon;
4722 + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = satd4<4, 8>;
4723 + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8<16, 8>;
4724 + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8<8, 16>;
4725 + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8<32, 16>;
4726 + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8<16, 32>;
4728 + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
4729 + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
4730 + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
4731 + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd = NULL;
4732 + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4<16, 12>;
4733 + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4<12, 16>;
4734 + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd4<16, 4>;
4735 + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = satd4<4, 16>;
4736 + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8<32, 24>;
4737 + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8<24, 32>;
4738 + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8<32, 8>;
4739 + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8<8, 32>;
4742 + #define CHROMA_CU_420(W, H) \
4743 + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
4744 + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4745 + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
4746 + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
4747 + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
4748 + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
4751 + CHROMA_CU_420(4, 4)
4752 + CHROMA_CU_420(8, 8)
4753 + CHROMA_CU_420(16, 16)
4754 + CHROMA_CU_420(32, 32)
4757 + p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
4758 + p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
4759 + p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
4760 + p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
4763 + #define CHROMA_PU_422(W, H) \
4764 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
4765 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
4766 + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4769 + CHROMA_PU_422(4, 8);
4770 + CHROMA_PU_422(8, 16);
4771 + CHROMA_PU_422(16, 32);
4772 + CHROMA_PU_422(32, 64);
4773 + CHROMA_PU_422(4, 4);
4774 + CHROMA_PU_422(2, 8);
4775 + CHROMA_PU_422(8, 8);
4776 + CHROMA_PU_422(4, 16);
4777 + CHROMA_PU_422(8, 12);
4778 + CHROMA_PU_422(6, 16);
4779 + CHROMA_PU_422(8, 4);
4780 + CHROMA_PU_422(2, 16);
4781 + CHROMA_PU_422(16, 16);
4782 + CHROMA_PU_422(8, 32);
4783 + CHROMA_PU_422(16, 24);
4784 + CHROMA_PU_422(12, 32);
4785 + CHROMA_PU_422(16, 8);
4786 + CHROMA_PU_422(4, 32);
4787 + CHROMA_PU_422(32, 32);
4788 + CHROMA_PU_422(16, 64);
4789 + CHROMA_PU_422(32, 48);
4790 + CHROMA_PU_422(24, 64);
4791 + CHROMA_PU_422(32, 16);
4792 + CHROMA_PU_422(8, 64);
4795 + p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
4796 + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4<4, 8>;
4797 + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8<8, 16>;
4798 + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8<16, 32>;
4799 + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8<32, 64>;
4801 + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = pixel_satd_4x4_neon;
4802 + p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd = NULL;
4803 + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = satd8<8, 8>;
4804 + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4<4, 16>;
4805 + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8<16, 16>;
4806 + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = satd8<8, 32>;
4807 + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8<32, 32>;
4808 + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8<16, 64>;
4810 + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd4<8, 12>;
4811 + p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
4812 + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd4<8, 4>;
4813 + p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
4814 + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8<16, 24>;
4815 + p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4<12, 32>;
4816 + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8<16, 8>;
4817 + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4<4, 32>;
4818 + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8<32, 48>;
4819 + p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8<24, 64>;
4820 + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8<32, 16>;
4821 + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = satd8<8, 64>;
4824 + #define CHROMA_CU_422(W, H) \
4825 + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
4826 + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
4827 + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
4828 + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
4829 + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
4830 + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
4833 + CHROMA_CU_422(4, 8)
4834 + CHROMA_CU_422(8, 16)
4835 + CHROMA_CU_422(16, 32)
4836 + CHROMA_CU_422(32, 64)
4838 + p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
4839 + p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
4840 + p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
4841 + p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
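For context, a hedged sketch of how these primitives might be wired in: the function below, its cpuMask parameter, and the include path are assumptions of mine (the real call site is not part of this hunk); only setupPixelPrimitives_neon, EncoderPrimitives, and X265_CPU_NEON come from the patch itself.

    #include "primitives.h"
    #include "arm64/pixel-prim.h"   // path assumed from the new arm64/ sources

    namespace X265_NS {
    // Hypothetical call site: populate the NEON pixel primitives once CPU
    // detection has reported NEON support.
    void setupArm64Primitives(EncoderPrimitives &p, int cpuMask)
    {
        if (cpuMask & X265_CPU_NEON)
            setupPixelPrimitives_neon(p);
    }
    }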
4852 diff -Naur ./source/common/arm64/pixel-prim.h ../x265_apple_patch/source/common/arm64/pixel-prim.h
4853 --- ./source/common/arm64/pixel-prim.h 1970-01-01 01:00:00.000000000 +0100
4854 +++ ../x265_apple_patch/source/common/arm64/pixel-prim.h 2021-05-08 13:08:01.000000000 +0100
4856 +#ifndef PIXEL_PRIM_NEON_H__
4857 +#define PIXEL_PRIM_NEON_H__
4859 +#include "common.h"
4860 +#include "slicetype.h" // LOWRES_COST_MASK
4861 +#include "primitives.h"
4866 +namespace X265_NS {
4870 +void setupPixelPrimitives_neon(EncoderPrimitives &p);
4878 diff -Naur ./source/common/arm64/pixel.h ../x265_apple_patch/source/common/arm64/pixel.h
4879 --- ./source/common/arm64/pixel.h 1970-01-01 01:00:00.000000000 +0100
4880 +++ ../x265_apple_patch/source/common/arm64/pixel.h 2021-05-08 13:08:01.000000000 +0100
4882 +/*****************************************************************************
4883 + * pixel.h: aarch64 pixel metrics
4884 + *****************************************************************************
4885 + * Copyright (C) 2009-2019 x265 project
4887 + * Authors: David Conrad <lessen42@gmail.com>
4888 + * Janne Grunau <janne-x265@jannau.net>
4890 + * This program is free software; you can redistribute it and/or modify
4891 + * it under the terms of the GNU General Public License as published by
4892 + * the Free Software Foundation; either version 2 of the License, or
4893 + * (at your option) any later version.
4895 + * This program is distributed in the hope that it will be useful,
4896 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
4897 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4898 + * GNU General Public License for more details.
4900 + * You should have received a copy of the GNU General Public License
4901 + * along with this program; if not, write to the Free Software
4902 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
4904 + * This program is also available under a commercial proprietary license.
4905 + * For more information, contact us at licensing@x265.com.
4906 + *****************************************************************************/
4908 +#ifndef x265_AARCH64_PIXEL_H
4909 +#define x265_AARCH64_PIXEL_H
4911 +#define x265_pixel_sad_16x16_neon x265_template(pixel_sad_16x16_neon)
4912 +#define x265_pixel_sad_16x8_neon x265_template(pixel_sad_16x8_neon)
4913 +#define x265_pixel_sad_4x16_neon x265_template(pixel_sad_4x16_neon)
4914 +#define x265_pixel_sad_4x4_neon x265_template(pixel_sad_4x4_neon)
4915 +#define x265_pixel_sad_4x8_neon x265_template(pixel_sad_4x8_neon)
4916 +#define x265_pixel_sad_8x16_neon x265_template(pixel_sad_8x16_neon)
4917 +#define x265_pixel_sad_8x4_neon x265_template(pixel_sad_8x4_neon)
4918 +#define x265_pixel_sad_8x8_neon x265_template(pixel_sad_8x8_neon)
4919 +#define x265_pixel_sad_x3_16x16_neon x265_template(pixel_sad_x3_16x16_neon)
4920 +#define x265_pixel_sad_x3_16x8_neon x265_template(pixel_sad_x3_16x8_neon)
4921 +#define x265_pixel_sad_x3_4x4_neon x265_template(pixel_sad_x3_4x4_neon)
4922 +#define x265_pixel_sad_x3_4x8_neon x265_template(pixel_sad_x3_4x8_neon)
4923 +#define x265_pixel_sad_x3_8x16_neon x265_template(pixel_sad_x3_8x16_neon)
4924 +#define x265_pixel_sad_x3_8x4_neon x265_template(pixel_sad_x3_8x4_neon)
4925 +#define x265_pixel_sad_x3_8x8_neon x265_template(pixel_sad_x3_8x8_neon)
4926 +#define x265_pixel_sad_x4_16x16_neon x265_template(pixel_sad_x4_16x16_neon)
4927 +#define x265_pixel_sad_x4_16x8_neon x265_template(pixel_sad_x4_16x8_neon)
4928 +#define x265_pixel_sad_x4_4x4_neon x265_template(pixel_sad_x4_4x4_neon)
4929 +#define x265_pixel_sad_x4_4x8_neon x265_template(pixel_sad_x4_4x8_neon)
4930 +#define x265_pixel_sad_x4_8x16_neon x265_template(pixel_sad_x4_8x16_neon)
4931 +#define x265_pixel_sad_x4_8x4_neon x265_template(pixel_sad_x4_8x4_neon)
4932 +#define x265_pixel_sad_x4_8x8_neon x265_template(pixel_sad_x4_8x8_neon)
4933 +#define x265_pixel_satd_16x16_neon x265_template(pixel_satd_16x16_neon)
4934 +#define x265_pixel_satd_16x8_neon x265_template(pixel_satd_16x8_neon)
4935 +#define x265_pixel_satd_4x16_neon x265_template(pixel_satd_4x16_neon)
4936 +#define x265_pixel_satd_4x4_neon x265_template(pixel_satd_4x4_neon)
4937 +#define x265_pixel_satd_4x8_neon x265_template(pixel_satd_4x8_neon)
4938 +#define x265_pixel_satd_8x16_neon x265_template(pixel_satd_8x16_neon)
4939 +#define x265_pixel_satd_8x4_neon x265_template(pixel_satd_8x4_neon)
4940 +#define x265_pixel_satd_8x8_neon x265_template(pixel_satd_8x8_neon)
4941 +#define x265_pixel_ssd_16x16_neon x265_template(pixel_ssd_16x16_neon)
4942 +#define x265_pixel_ssd_16x8_neon x265_template(pixel_ssd_16x8_neon)
4943 +#define x265_pixel_ssd_4x16_neon x265_template(pixel_ssd_4x16_neon)
4944 +#define x265_pixel_ssd_4x4_neon x265_template(pixel_ssd_4x4_neon)
4945 +#define x265_pixel_ssd_4x8_neon x265_template(pixel_ssd_4x8_neon)
4946 +#define x265_pixel_ssd_8x16_neon x265_template(pixel_ssd_8x16_neon)
4947 +#define x265_pixel_ssd_8x4_neon x265_template(pixel_ssd_8x4_neon)
4948 +#define x265_pixel_ssd_8x8_neon x265_template(pixel_ssd_8x8_neon)
4949 +#define DECL_PIXELS( ret, name, suffix, args ) \
4950 + ret x265_pixel_##name##_16x16_##suffix args;\
4951 + ret x265_pixel_##name##_16x8_##suffix args;\
4952 + ret x265_pixel_##name##_8x16_##suffix args;\
4953 + ret x265_pixel_##name##_8x8_##suffix args;\
4954 + ret x265_pixel_##name##_8x4_##suffix args;\
4955 + ret x265_pixel_##name##_4x16_##suffix args;\
4956 + ret x265_pixel_##name##_4x8_##suffix args;\
4957 + ret x265_pixel_##name##_4x4_##suffix args;\
4959 +#define DECL_X1( name, suffix ) \
4960 + DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
4962 +#define DECL_X4( name, suffix ) \
4963 + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
4964 + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
4966 +DECL_X1( sad, neon )
4967 +DECL_X4( sad, neon )
4968 +DECL_X1( satd, neon )
4969 +DECL_X1( ssd, neon )
4972 +#define x265_pixel_ssd_nv12_core_neon x265_template(pixel_ssd_nv12_core_neon)
4973 +void x265_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
4975 +#define x265_pixel_vsad_neon x265_template(pixel_vsad_neon)
4976 +int x265_pixel_vsad_neon( uint8_t *, intptr_t, int );
4978 +#define x265_pixel_sa8d_8x8_neon x265_template(pixel_sa8d_8x8_neon)
4979 +int x265_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
4980 +#define x265_pixel_sa8d_16x16_neon x265_template(pixel_sa8d_16x16_neon)
4981 +int x265_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
4982 +#define x265_pixel_sa8d_satd_16x16_neon x265_template(pixel_sa8d_satd_16x16_neon)
4983 +uint64_t x265_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
4985 +#define x265_pixel_var_8x8_neon x265_template(pixel_var_8x8_neon)
4986 +uint64_t x265_pixel_var_8x8_neon ( uint8_t *, intptr_t );
4987 +#define x265_pixel_var_8x16_neon x265_template(pixel_var_8x16_neon)
4988 +uint64_t x265_pixel_var_8x16_neon ( uint8_t *, intptr_t );
4989 +#define x265_pixel_var_16x16_neon x265_template(pixel_var_16x16_neon)
4990 +uint64_t x265_pixel_var_16x16_neon( uint8_t *, intptr_t );
4991 +#define x265_pixel_var2_8x8_neon x265_template(pixel_var2_8x8_neon)
4992 +int x265_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
4993 +#define x265_pixel_var2_8x16_neon x265_template(pixel_var2_8x16_neon)
4994 +int x265_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
4996 +#define x265_pixel_hadamard_ac_8x8_neon x265_template(pixel_hadamard_ac_8x8_neon)
4997 +uint64_t x265_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
4998 +#define x265_pixel_hadamard_ac_8x16_neon x265_template(pixel_hadamard_ac_8x16_neon)
4999 +uint64_t x265_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
5000 +#define x265_pixel_hadamard_ac_16x8_neon x265_template(pixel_hadamard_ac_16x8_neon)
5001 +uint64_t x265_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
5002 +#define x265_pixel_hadamard_ac_16x16_neon x265_template(pixel_hadamard_ac_16x16_neon)
5003 +uint64_t x265_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
5005 +#define x265_pixel_ssim_4x4x2_core_neon x265_template(pixel_ssim_4x4x2_core_neon)
5006 +void x265_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
5007 + const uint8_t *, intptr_t,
5009 +#define x265_pixel_ssim_end4_neon x265_template(pixel_ssim_end4_neon)
5010 +float x265_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
5012 +#define x265_pixel_asd8_neon x265_template(pixel_asd8_neon)
5013 +int x265_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
5016 diff -Naur ./source/common/cpu.cpp ../x265_apple_patch/source/common/cpu.cpp
5017 --- ./source/common/cpu.cpp 2021-05-08 13:06:22.000000000 +0100
5018 +++ ../x265_apple_patch/source/common/cpu.cpp 2021-05-08 13:08:01.000000000 +0100
5020 { "ARMv6", X265_CPU_ARMV6 },
5021 { "NEON", X265_CPU_NEON },
5022 { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
5024 +#elif X265_ARCH_ARM64
5025 + { "NEON", X265_CPU_NEON },
5026 #elif X265_ARCH_POWER8
5027 { "Altivec", X265_CPU_ALTIVEC },
5029 @@ -374,6 +375,18 @@
5030 #endif // if HAVE_ARMV6
5033 +#elif X265_ARCH_ARM64
5035 +uint32_t cpu_detect(bool benableavx512)
5040 + flags |= X265_CPU_NEON;
5046 #elif X265_ARCH_POWER8
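Only the signature and the NEON flag line of the ARM64 cpu_detect() survive in this excerpt; the elided lines are left as they are, but as an illustration the function presumably reduces to something like the sketch below, since every AArch64 core implements Advanced SIMD.

    // Sketch only, not the patch's exact body: report NEON unconditionally on
    // ARM64; the benableavx512 parameter is only meaningful on x86.
    uint32_t cpu_detect(bool benableavx512)
    {
        (void)benableavx512;
        uint32_t flags = 0;
        flags |= X265_CPU_NEON;
        return flags;
    }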
5048 diff -Naur ./source/common/pixel.cpp ../x265_apple_patch/source/common/pixel.cpp
5049 --- ./source/common/pixel.cpp 2021-05-08 13:06:22.000000000 +0100
5050 +++ ../x265_apple_patch/source/common/pixel.cpp 2021-05-08 13:08:01.000000000 +0100
5055 -#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
5056 +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && 0
5057 pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
5064 -#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
5065 +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && 0
5066 pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
5069 diff -Naur ./source/common/version.cpp ../x265_apple_patch/source/common/version.cpp
5070 --- ./source/common/version.cpp 2021-05-08 13:06:22.000000000 +0100
5071 +++ ../x265_apple_patch/source/common/version.cpp 2021-05-08 13:47:38.000000000 +0100
5074 #if defined(__clang__)
5075 #define COMPILEDBY "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]"
5077 +#if defined(__IA64__) || defined(__arm64__) || defined(__aarch64__)
5078 #define ONARCH "[on 64-bit] "
5080 #define ONARCH "[on 32-bit] "
5082 #define ONOS "[Unk-OS]"
5086 +#if X86_64 || __arm64__ || __aarch64__
5087 #define BITS "[64 bit]"
5089 #define BITS "[32 bit]"
5090 diff -Naur ./source/test/testharness.h ../x265_apple_patch/source/test/testharness.h
5091 --- ./source/test/testharness.h 2021-05-08 13:06:22.000000000 +0100
5092 +++ ../x265_apple_patch/source/test/testharness.h 2021-05-08 13:08:01.000000000 +0100
5102 #include <x86intrin.h>
5103 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
5104 #include <arm_neon.h>
5105 -#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
5107 /* fallback for older GCC/MinGW */
5108 static inline uint32_t __rdtsc(void)
5112 // TO-DO: replace clock() function with appropriate ARM cpu instructions
5114 +#elif X265_ARCH_ARM64
5115 + // TO-DO: verify the following inline asm for reading the CPU timestamp counter on ARM
5116 + // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
5118 + // TO-DO: replace clock() function with appropriate ARM cpu instructions
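The testharness.h.orig hunk later in this patch reads the AArch64 virtual counter with mrs %0, cntvct_el0; a self-contained sketch of that approach for the TO-DO above, with a function name of my choosing:

    #include <cstdint>

    // Read CNTVCT_EL0, the AArch64 virtual counter: not a cycle counter, but
    // monotonic, readable from user space, and sufficient for the relative
    // timings REPORT_SPEEDUP produces.
    static inline uint64_t read_aarch64_counter(void)
    {
        uint64_t cnt;
        asm volatile("mrs %0, cntvct_el0" : "=r"(cnt));
        return cnt;
    }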
5124 * needs an explicit asm check because it only sometimes crashes in normal use. */
5125 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
5126 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
5127 -#elif X265_ARCH_ARM == 0
5128 +#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
5129 #define PFX(stack_pagealign)(func, align) func()
5132 diff -Naur ./source/test/testharness.h.orig ../x265_apple_patch/source/test/testharness.h.orig
5133 --- ./source/test/testharness.h.orig 1970-01-01 01:00:00.000000000 +0100
5134 +++ ../x265_apple_patch/source/test/testharness.h.orig 2021-05-08 13:08:01.000000000 +0100
5136 +/*****************************************************************************
5137 + * Copyright (C) 2013-2020 MulticoreWare, Inc
5139 + * Authors: Steve Borho <steve@borho.org>
5140 + * Min Chen <chenm003@163.com>
5141 + * Yimeng Su <yimeng.su@huawei.com>
5143 + * This program is free software; you can redistribute it and/or modify
5144 + * it under the terms of the GNU General Public License as published by
5145 + * the Free Software Foundation; either version 2 of the License, or
5146 + * (at your option) any later version.
5148 + * This program is distributed in the hope that it will be useful,
5149 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
5150 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
5151 + * GNU General Public License for more details.
5153 + * You should have received a copy of the GNU General Public License
5154 + * along with this program; if not, write to the Free Software
5155 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
5157 + * This program is also available under a commercial proprietary license.
5158 + * For more information, contact us at license @ x265.com.
5159 + *****************************************************************************/
5161 +#ifndef _TESTHARNESS_H_
5162 +#define _TESTHARNESS_H_ 1
5164 +#include "common.h"
5165 +#include "primitives.h"
5168 +#pragma warning(disable: 4324) // structure was padded due to __declspec(align())
5171 +#define PIXEL_MIN 0
5172 +#define SHORT_MAX 32767
5173 +#define SHORT_MIN -32767
5174 +#define UNSIGNED_SHORT_MAX 65535
5176 +using namespace X265_NS;
5178 +extern const char* lumaPartStr[NUM_PU_SIZES];
5179 +extern const char* const* chromaPartStr[X265_CSP_COUNT];
5187 + virtual ~TestHarness() {}
5189 + virtual bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) = 0;
5191 + virtual void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) = 0;
5193 + virtual const char *getName() const = 0;
5197 + /* Temporary variables for stack checks */
5204 +#include <intrin.h>
5206 +#include <intrin.h>
5207 +#elif (!defined(__APPLE__) && (defined (__GNUC__) && (defined(__x86_64__) || defined(__i386__))))
5208 +#include <x86intrin.h>
5209 +#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
5210 +#include <arm_neon.h>
5211 +#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
5212 +/* fallback for older GCC/MinGW */
5213 +static inline uint32_t __rdtsc(void)
5218 + asm volatile("rdtsc" : "=a" (a) ::"edx");
5219 +#elif X265_ARCH_ARM
5220 +#if X265_ARCH_ARM64
5221 + asm volatile("mrs %0, cntvct_el0" : "=r"(a));
5223 + // TO-DO: verify the following inline asm for reading the CPU timestamp counter on ARM
5224 + // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
5226 + // TO-DO: replace clock() function with appropriate ARM cpu instructions
5232 +#endif // ifdef _MSC_VER
5234 +#define BENCH_RUNS 2000
5236 +/* Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
5237 + * and discards invalid times. Repeats BENCH_RUNS times to get a good average.
5238 + * Then measures the C reference with BENCH_RUNS / 4 runs and reports X factor and average cycles.*/
5239 +#define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) \
5241 + uint32_t cycles = 0; int runs = 0; \
5242 + RUNOPT(__VA_ARGS__); \
5243 + for (int ti = 0; ti < BENCH_RUNS; ti++) { \
5244 + uint32_t t0 = (uint32_t)__rdtsc(); \
5245 + RUNOPT(__VA_ARGS__); \
5246 + RUNOPT(__VA_ARGS__); \
5247 + RUNOPT(__VA_ARGS__); \
5248 + RUNOPT(__VA_ARGS__); \
5249 + uint32_t t1 = (uint32_t)__rdtsc() - t0; \
5250 + if (t1 * runs <= cycles * 4 && ti > 0) { cycles += t1; runs++; } \
5252 + uint32_t refcycles = 0; int refruns = 0; \
5253 + RUNREF(__VA_ARGS__); \
5254 + for (int ti = 0; ti < BENCH_RUNS / 4; ti++) { \
5255 + uint32_t t0 = (uint32_t)__rdtsc(); \
5256 + RUNREF(__VA_ARGS__); \
5257 + RUNREF(__VA_ARGS__); \
5258 + RUNREF(__VA_ARGS__); \
5259 + RUNREF(__VA_ARGS__); \
5260 + uint32_t t1 = (uint32_t)__rdtsc() - t0; \
5261 + if (t1 * refruns <= refcycles * 4 && ti > 0) { refcycles += t1; refruns++; } \
5264 + float optperf = (10.0f * cycles / runs) / 4; \
5265 + float refperf = (10.0f * refcycles / refruns) / 4; \
5266 + printf("\t%3.2fx ", refperf / optperf); \
5267 + printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
5272 +int PFX(stack_pagealign)(int (*func)(), int align);
5274 +/* detect when callee-saved regs aren't saved
5275 + * needs an explicit asm check because it only sometimes crashes in normal use. */
5276 +intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
5277 +float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
5278 +#elif X265_ARCH_ARM == 0
5279 +#define PFX(stack_pagealign)(func, align) func()
5284 +/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
5285 + * This is done by clobbering the stack with junk around the stack pointer and calling the
5286 + * assembly function through x265_checkasm_call with added dummy arguments which forces all
5287 + * real arguments to be passed on the stack and not in registers. For 32-bit argument the
5288 + * upper half of the 64-bit register location on the stack will now contain junk. Note that
5289 + * this is dependent on compiler behavior and that interrupts etc. at the wrong time may
5290 + * overwrite the junk written to the stack so there's no guarantee that it will always
5291 + * detect all functions that assumes zero-extension.
5293 +void PFX(checkasm_stack_clobber)(uint64_t clobber, ...);
5294 +#define checked(func, ...) ( \
5295 + m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
5296 + PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5297 + m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5298 + m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
5299 + PFX(checkasm_call)((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
5301 +#define checked_float(func, ...) ( \
5302 + m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
5303 + PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5304 + m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
5305 + m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
5306 + PFX(checkasm_call_float)((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
5307 +#define reportfail() if (!m_ok) { fflush(stdout); fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); }
5309 +#define checked(func, ...) PFX(checkasm_call)((intptr_t(*)())func, &m_ok, __VA_ARGS__);
5310 +#define checked_float(func, ...) PFX(checkasm_call_float)((float(*)())func, &m_ok, __VA_ARGS__);
5313 +#define checked(func, ...) func(__VA_ARGS__)
5314 +#define checked_float(func, ...) func(__VA_ARGS__)
5315 +#define reportfail()
5316 +#endif // if X86_64
5319 +#endif // ifndef _TESTHARNESS_H_