diff --git a/src/arm/64/satd.S b/src/arm/64/satd.S
index 31a53976cc..261cfde03a 100644
--- a/src/arm/64/satd.S
+++ b/src/arm/64/satd.S
@@ -140,7 +140,7 @@ function satd4x4_neon, export=1
 #undef dst_stride
 endfunc
 
-.macro DOUBLE_HADAMARD_4X4
+.macro DOUBLE_HADAMARD_4X4 hbd=0
 
     // Horizontal transform
     butterfly       v2, v3, v0, v1
@@ -163,8 +163,13 @@ endfunc
     interleave      v0, v1, v2, v3
     interleave      v4, v5, v6, v7
 
+.if \hbd == 0
     butterfly       v2, v3, v0, v1
     butterfly       v6, v7, v4, v5
+.else
+    butterflyw      v2, v3, v16, v17, v0, v1
+    butterflyw      v6, v7, v18, v19, v4, v5
+.endif
 .endm
 
 .macro SUM_DOUBLE_HADAMARD_4X4
@@ -1364,3 +1369,105 @@ function satd4x16_hbd_neon, export=1
     mov             w10, wzr
     b               L(satd_w4_hbd)
 endfunc
+
+.macro SUM_DOUBLE_HADAMARD_4X4_HBD \
+       a0 a1 a2 a3 c0 c1 c2 c3
+
+    // absolute value of transform coefficients
+    abs             v\a0\().4s, v\a0\().4s
+    abs             v\a1\().4s, v\a1\().4s
+    abs             v\a2\().4s, v\a2\().4s
+    abs             v\a3\().4s, v\a3\().4s
+    abs             v\c0\().4s, v\c0\().4s
+    abs             v\c1\().4s, v\c1\().4s
+    abs             v\c2\().4s, v\c2\().4s
+    abs             v\c3\().4s, v\c3\().4s
+
+    // stage 1 sum
+    add             v\a0\().4s, v\a0\().4s, v\a1\().4s
+    add             v\a2\().4s, v\a2\().4s, v\a3\().4s
+    add             v\c0\().4s, v\c0\().4s, v\c1\().4s
+    add             v\c2\().4s, v\c2\().4s, v\c3\().4s
+
+    // stage 2 sum
+    add             v\a0\().4s, v\a0\().4s, v\a2\().4s
+    add             v\c0\().4s, v\c0\().4s, v\c2\().4s
+
+    // stage 3 sum
+    add             v0.4s, v\a0\().4s, v\c0\().4s
+    addv            s0, v0.4s
+.endm
+
+function satd8x4_hbd_neon, export=1
+    #define src         x0
+    #define src_stride  x1
+    #define dst         x2
+    #define dst_stride  x3
+
+    #define subtotal    w9
+    #define total       w10
+    #define width       w12
+
+    mov             width, 8
+    mov             total, wzr
+
+L(satd_h4_hbd):
+    ldr             q0, [src]
+    ldr             q1, [dst]
+    sub             v0.8h, v0.8h, v1.8h
+
+    ldr             q1, [src, src_stride]
+    ldr             q2, [dst, dst_stride]
+    sub             v1.8h, v1.8h, v2.8h
+
+    lsl             x8, src_stride, 1
+    lsl             x9, dst_stride, 1
+
+    ldr             q2, [src, x8]
+    ldr             q3, [dst, x9]
+    sub             v2.8h, v2.8h, v3.8h
+
+    add             x8, src_stride, src_stride, lsl 1
+    add             x9, dst_stride, dst_stride, lsl 1
+
+    ldr             q3, [src, x8]
+    ldr             q4, [dst, x9]
+    sub             v3.8h, v3.8h, v4.8h
+
+    ext             v4.16b, v0.16b, v0.16b, 8
+    ext             v5.16b, v1.16b, v1.16b, 8
+    mov             v0.d[1], v2.d[0]
+    mov             v1.d[1], v3.d[0]
+    mov             v4.d[1], v2.d[1]
+    mov             v5.d[1], v3.d[1]
+
+    DOUBLE_HADAMARD_4X4 hbd=1
+    SUM_DOUBLE_HADAMARD_4X4_HBD 2, 3, 16, 17, 6, 7, 18, 19
+
+    fmov            subtotal, s0
+    add             total, subtotal, total
+
+    add             src, src, #16
+    add             dst, dst, #16
+    subs            width, width, #8
+    bne             L(satd_h4_hbd)
+
+    mov             w0, total
+    normalize_4
+    ret
+
+    #undef src
+    #undef src_stride
+    #undef dst
+    #undef dst_stride
+
+    #undef subtotal
+    #undef total
+    #undef width
+endfunc
+
+function satd16x4_hbd_neon, export=1
+    mov             w12, 16
+    mov             w10, wzr
+    b               L(satd_h4_hbd)
+endfunc
diff --git a/src/asm/aarch64/dist.rs b/src/asm/aarch64/dist.rs
index e805c4de12..148046dfe1 100644
--- a/src/asm/aarch64/dist.rs
+++ b/src/asm/aarch64/dist.rs
@@ -89,9 +89,11 @@ declare_asm_dist_fn![
   (rav1e_satd4x4_hbd_neon, u16),
   (rav1e_satd4x8_hbd_neon, u16),
   (rav1e_satd4x16_hbd_neon, u16),
+  (rav1e_satd8x4_hbd_neon, u16),
   (rav1e_satd8x8_hbd_neon, u16),
   (rav1e_satd8x16_hbd_neon, u16),
   (rav1e_satd8x32_hbd_neon, u16),
+  (rav1e_satd16x4_hbd_neon, u16),
   (rav1e_satd16x8_hbd_neon, u16),
   (rav1e_satd16x16_hbd_neon, u16),
   (rav1e_satd16x32_hbd_neon, u16),
@@ -280,6 +282,8 @@ static SATD_HBD_FNS_NEON: [Option<SatdHBDFn>; DIST_FNS_LENGTH] = {
   out[BLOCK_4X4 as usize] = Some(rav1e_satd4x4_hbd_neon);
   out[BLOCK_4X8 as usize] = Some(rav1e_satd4x8_hbd_neon);
   out[BLOCK_4X16 as usize] = Some(rav1e_satd4x16_hbd_neon);
+  out[BLOCK_8X4 as usize] = Some(rav1e_satd8x4_hbd_neon);
+  out[BLOCK_16X4 as usize] = Some(rav1e_satd16x4_hbd_neon);
 
   out[BLOCK_8X8 as usize] = Some(rav1e_satd8x8_hbd_neon);
   out[BLOCK_8X16 as usize] = Some(rav1e_satd8x16_hbd_neon);
@@ -428,9 +432,11 @@ mod test {
       (4, 4),
       (4, 8),
       (4, 16),
+      (8, 4),
       (8, 8),
       (8, 16),
       (8, 32),
+      (16, 4),
       (16, 8),
       (16, 16),
       (16, 32),
@@ -455,9 +461,11 @@ mod test {
       (4, 4),
       (4, 8),
       (4, 16),
+      (8, 4),
       (8, 8),
       (8, 16),
       (8, 32),
+      (16, 4),
       (16, 8),
       (16, 16),
       (16, 32),
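
For reviewers, here is what the new `satd8x4_hbd_neon`/`satd16x4_hbd_neon` paths compute: SATD is the sum of absolute coefficients of a 4x4 Hadamard transform applied to the source/prediction residual, rounded by the transform size (the `normalize_4` macro adds 2 and shifts right by 2). The scalar Rust sketch below models one 4x4 block; it is illustrative only (the `satd_4x4_hbd` helper name and signature are made up, not rav1e's actual `rust` fallback). It also shows why the `hbd=1` macro path needs the widening `butterflyw`: a 12-bit residual (up to ±4095) can grow 16x through the 2-D transform, so the final butterfly stage can reach ±65520 and would overflow 16-bit lanes.

```rust
// Illustrative scalar model of 4x4 HBD SATD; name and signature are
// hypothetical, not a rav1e API.
fn satd_4x4_hbd(
  src: &[u16], src_stride: usize, dst: &[u16], dst_stride: usize,
) -> u32 {
  // Residual in i32: after both 1-D transform passes, coefficients of a
  // 12-bit residual exceed i16 range, which is why the NEON .else path
  // widens its last butterfly stage with `butterflyw`.
  let mut m = [[0i32; 4]; 4];
  for y in 0..4 {
    for x in 0..4 {
      m[y][x] = src[y * src_stride + x] as i32 - dst[y * dst_stride + x] as i32;
    }
  }
  // 4-point unnormalized Hadamard butterfly (two stages of add/sub).
  let hadamard4 = |a: i32, b: i32, c: i32, d: i32| {
    let (t0, t1, t2, t3) = (a + b, a - b, c + d, c - d);
    (t0 + t2, t1 + t3, t0 - t2, t1 - t3)
  };
  // Horizontal pass (rows), then vertical pass (columns).
  for row in m.iter_mut() {
    let (a, b, c, d) = hadamard4(row[0], row[1], row[2], row[3]);
    *row = [a, b, c, d];
  }
  for x in 0..4 {
    let (a, b, c, d) = hadamard4(m[0][x], m[1][x], m[2][x], m[3][x]);
    m[0][x] = a;
    m[1][x] = b;
    m[2][x] = c;
    m[3][x] = d;
  }
  // Sum of absolute transform coefficients, then round-divide by the
  // transform size, matching the `normalize_4` macro (add 2, shift by 2).
  let sum: i32 = m.iter().flatten().map(|c| c.abs()).sum();
  ((sum + 2) >> 2) as u32
}
```

The NEON code amortizes this over two 4x4 blocks per loop iteration (the "DOUBLE" in `DOUBLE_HADAMARD_4X4`), which is why each `ldr q` pulls a full 8-pixel row; `satd16x4_hbd_neon` reuses the same loop by branching to `L(satd_h4_hbd)` with `width` set to 16 so the 8-wide body runs twice.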