Port folder renaming changes from AOM

Manually cherry-picked commits: ceef058 libvpx->libaom part2 3d26d91 libvpx -> libaom cfea7dd vp10/ -> av1/ 3a8eff7 Fix a build issue for a test bf4202e Rename vpx to aom Change-Id: I1b0eb5a40796e3aaf41c58984b4229a439a597dc
author: Yaowu Xu <yaowu@google.com> 2016-08-22 16:08:15 -0700
committer: Yaowu Xu <yaowu@google.com> 2016-08-31 17:26:24 -0700
commit: c27fc14b02dd41f677290022b0de2d296f5e6dca (patch)
tree: 4efe2eb5549de86dc09449e694be310236f867cf /aom_dsp/arm
parent: b1fb998c460e22d9b119d5b66b6609c4a544876f (diff)
52 files changed, 15493 insertions, 0 deletions
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
new file mode 100644
index 000000000..ff9cbb997
--- /dev/null
+++ b/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,253 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "aom/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  const uint32x4_t a = vpaddlq_u16(v_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
+unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t v_s0 = vld1_u8(s);
+  const uint8x8_t v_s1 = vld1_u8(s + p);
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+  v_s0 = vld1_u8(s + 2 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vpx_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
+
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+  const int shift_factor = ((height >> 5) + 3) * -1;
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    const uint8x16_t vec_row1 = vld1q_u8(ref);
+    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+    ref += ref_stride * 8;
+  }
+
+  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
+
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  int i;
+  uint16x8_t vec_sum = vdupq_n_u16(0);
+
+  for (i = 0; i < width; i += 16) {
+    const uint8x16_t vec_row = vld1q_u8(ref);
+    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+    ref += 16;
+  }
+
+  return horizontal_add_u16x8(vec_sum);
+}
+
+// ref, src = [0, 510] - max diff = 16-bits
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+  int width = 4 << bwl;
+  int32x4_t sse = vdupq_n_s32(0);
+  int16x8_t total = vdupq_n_s16(0);
+
+  assert(width >= 8);
+  assert((width % 8) == 0);
+
+  do {
+    const int16x8_t r = vld1q_s16(ref);
+    const int16x8_t s = vld1q_s16(src);
+    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
+    sse = vmlal_s16(sse, diff_hi, diff_hi);
+    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  {
+    // Note: 'total''s pairwise addition could be implemented similarly to
+    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
+    // with the summation of 'sse' performed better on a Cortex-A15.
+    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
+    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    const int32x2_t t2 = vpadd_s32(t1, t1);
+    const int t = vget_lane_s32(t2, 0);
+    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int s = vget_lane_s32(s1, 0);
+    const int shift_factor = bwl + 2;
+    return s - ((t * t) >> shift_factor);
+  }
+}
+
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                         int b_stride, int *min, int *max) {
+  // Load and concatenate.
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+  const uint8x16_t a23 =
+      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 =
+      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 =
+      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+  const uint8x16_t b23 =
+      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 =
+      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 =
+      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
+
+  // Absolute difference.
+  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+  // Max values between the Q vectors.
+  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+  // Split to D and start doing pairwise.
+  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propogate the max/min values to every position.
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u8((uint8_t *)max, ab_max, 0);
+  vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}
diff --git a/aom_dsp/arm/bilinear_filter_media.asm b/aom_dsp/arm/bilinear_filter_media.asm
new file mode 100644
index 000000000..f3f9754c1
--- /dev/null
+++ b/aom_dsp/arm/bilinear_filter_media.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_filter_block2d_bil_first_pass_media|
+    EXPORT  |vpx_filter_block2d_bil_second_pass_media|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;-------------------------------------
+; The output is transposed stroed in output array to make it easy for second pass filtering.
+|vpx_filter_block2d_bil_first_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parellel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |vpx_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;---------------------------------
+|vpx_filter_block2d_bil_second_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vpx_filter_block2d_second_pass_media|
+
+    END
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 000000000..4763cdb87
--- /dev/null
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "aom_dsp/txfm_common.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+  int i;
+  // stage 1
+  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  for (i = 0; i < 2; ++i) {
+    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+    // fdct4(step, step);
+    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+    // fdct4(step, step);
+    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+    }
+    // Stage 2
+    v_x0 = vsubq_s16(v_s6, v_s5);
+    v_x1 = vaddq_s16(v_s6, v_s5);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x8_t ab = vcombine_s16(a, b);
+      const int16x8_t cd = vcombine_s16(c, d);
+      // Stage 3
+      v_x0 = vaddq_s16(v_s4, ab);
+      v_x1 = vsubq_s16(v_s4, ab);
+      v_x2 = vsubq_s16(v_s7, cd);
+      v_x3 = vaddq_s16(v_s7, cd);
+    }
+    // Stage 4
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+    }
+    // transpose 8x8
+    {
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      // 04 05 06 07 44 45 46 47
+      // 14 15 16 17 54 55 56 57
+      // 24 25 26 27 64 65 66 67
+      // 34 35 36 37 74 75 76 77
+      const int32x4x2_t r02_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
+      const int16x8x2_t r01_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                    vreinterpretq_s16_s32(r13_s32.val[0]));
+      const int16x8x2_t r23_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                    vreinterpretq_s16_s32(r13_s32.val[1]));
+      const int16x8x2_t r45_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                    vreinterpretq_s16_s32(r57_s32.val[0]));
+      const int16x8x2_t r67_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                    vreinterpretq_s16_s32(r57_s32.val[1]));
+      input_0 = r01_s16.val[0];
+      input_1 = r01_s16.val[1];
+      input_2 = r23_s16.val[0];
+      input_3 = r23_s16.val[1];
+      input_4 = r45_s16.val[0];
+      input_5 = r45_s16.val[1];
+      input_6 = r67_s16.val[0];
+      input_7 = r67_s16.val[1];
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }  // for
+  {
+    // from vpx_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+    input_0 = vhsubq_s16(input_0, sign_in0);
+    input_1 = vhsubq_s16(input_1, sign_in1);
+    input_2 = vhsubq_s16(input_2, sign_in2);
+    input_3 = vhsubq_s16(input_3, sign_in3);
+    input_4 = vhsubq_s16(input_4, sign_in4);
+    input_5 = vhsubq_s16(input_5, sign_in5);
+    input_6 = vhsubq_s16(input_6, sign_in6);
+    input_7 = vhsubq_s16(input_7, sign_in7);
+    // store results
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+  int r;
+  int16x8_t sum = vld1q_s16(&input[0]);
+  for (r = 1; r < 8; ++r) {
+    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+    sum = vaddq_s16(sum, input_00);
+  }
+  {
+    const int32x4_t a = vpaddlq_s16(sum);
+    const int64x2_t b = vpaddlq_s32(a);
+    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                                 vreinterpret_s32_s64(vget_high_s64(b)));
+    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+    output[1] = 0;
+  }
+}
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 000000000..46b2755ea
--- /dev/null
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,199 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  const int16x8_t b0 = vaddq_s16(*a0, *a1);
+  const int16x8_t b1 = vsubq_s16(*a0, *a1);
+  const int16x8_t b2 = vaddq_s16(*a2, *a3);
+  const int16x8_t b3 = vsubq_s16(*a2, *a3);
+  const int16x8_t b4 = vaddq_s16(*a4, *a5);
+  const int16x8_t b5 = vsubq_s16(*a4, *a5);
+  const int16x8_t b6 = vaddq_s16(*a6, *a7);
+  const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);
+  const int16x8_t c4 = vaddq_s16(b4, b6);
+  const int16x8_t c5 = vaddq_s16(b5, b7);
+  const int16x8_t c6 = vsubq_s16(b4, b6);
+  const int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a1 = vsubq_s16(c2, c6);
+  *a2 = vsubq_s16(c0, c4);
+  *a3 = vaddq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+  *a6 = vsubq_s16(c1, c5);
+  *a7 = vaddq_s16(c1, c5);
+}
+
+// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
+// reversing transpose order which may make it easier for the compiler to
+// reconcile the vtrn.64 moves.
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                         int16x8_t *a6, int16x8_t *a7) {
+  // Swap 64 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 08 09 10 11 12 13 14 15
+  // a2: 16 17 18 19 20 21 22 23
+  // a3: 24 25 26 27 28 29 30 31
+  // a4: 32 33 34 35 36 37 38 39
+  // a5: 40 41 42 43 44 45 46 47
+  // a6: 48 49 50 51 52 53 54 55
+  // a7: 56 57 58 59 60 61 62 63
+  // to:
+  // a04_lo: 00 01 02 03 32 33 34 35
+  // a15_lo: 08 09 10 11 40 41 42 43
+  // a26_lo: 16 17 18 19 48 49 50 51
+  // a37_lo: 24 25 26 27 56 57 58 59
+  // a04_hi: 04 05 06 07 36 37 38 39
+  // a15_hi: 12 13 14 15 44 45 46 47
+  // a26_hi: 20 21 22 23 52 53 54 55
+  // a37_hi: 28 29 30 31 60 61 62 63
+  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
+  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
+  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
+  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
+  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
+  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
+  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
+  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
+
+  // Swap 32 bit elements resulting in:
+  // a0246_lo:
+  // 00 01 16 17 32 33 48 49
+  // 02 03 18 19 34 35 50 51
+  // a1357_lo:
+  // 08 09 24 25 40 41 56 57
+  // 10 11 26 27 42 43 58 59
+  // a0246_hi:
+  // 04 05 20 21 36 37 52 53
+  // 06 07 22 23 38 39 54 55
+  // a1657_hi:
+  // 12 13 28 29 44 45 60 61
+  // 14 15 30 31 46 47 62 63
+  const int32x4x2_t a0246_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+  const int32x4x2_t a1357_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+  const int32x4x2_t a0246_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+  const int32x4x2_t a1357_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
+
+  // Swap 16 bit elements resulting in:
+  // b0:
+  // 00 08 16 24 32 40 48 56
+  // 01 09 17 25 33 41 49 57
+  // b1:
+  // 02 10 18 26 34 42 50 58
+  // 03 11 19 27 35 43 51 59
+  // b2:
+  // 04 12 20 28 36 44 52 60
+  // 05 13 21 29 37 45 53 61
+  // b3:
+  // 06 14 22 30 38 46 54 62
+  // 07 15 23 31 39 47 55 63
+  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
+  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
+  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
+  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[1]));
+
+  *a0 = b0.val[0];
+  *a1 = b0.val[1];
+  *a2 = b1.val[0];
+  *a3 = b1.val[1];
+  *a4 = b2.val[0];
+  *a5 = b2.val[1];
+  *a6 = b3.val[0];
+  *a7 = b3.val[1];
+}
+
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+                           int16_t *coeff) {
+  int16x8_t a0 = vld1q_s16(src_diff);
+  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  // Skip the second transpose because it is not required.
+
+  vst1q_s16(coeff + 0, a0);
+  vst1q_s16(coeff + 8, a1);
+  vst1q_s16(coeff + 16, a2);
+  vst1q_s16(coeff + 24, a3);
+  vst1q_s16(coeff + 32, a4);
+  vst1q_s16(coeff + 40, a5);
+  vst1q_s16(coeff + 48, a6);
+  vst1q_s16(coeff + 56, a7);
+}
+
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int i;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = vld1q_s16(coeff + 0);
+    const int16x8_t a1 = vld1q_s16(coeff + 64);
+    const int16x8_t a2 = vld1q_s16(coeff + 128);
+    const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    vst1q_s16(coeff + 0, c0);
+    vst1q_s16(coeff + 64, c1);
+    vst1q_s16(coeff + 128, c2);
+    vst1q_s16(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
diff --git a/aom_dsp/arm/idct16x16_1_add_neon.asm b/aom_dsp/arm/idct16x16_1_add_neon.asm
new file mode 100644
index 000000000..dc459e20d
--- /dev/null
+++ b/aom_dsp/arm/idct16x16_1_add_neon.asm
@@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vpx_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8
+    sub              r2, #8
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vpx_idct16x16_1_add_neon|
+
+    END
diff --git a/aom_dsp/arm/idct16x16_1_add_neon.c b/aom_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 000000000..a37e53c93
--- /dev/null
+++ b/aom_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d2u8, d3u8, d30u8, d31u8;
+  uint64x1_t d2u64, d3u64, d4u64, d5u64;
+  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, j, a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  q0s16 = vdupq_n_s16(a1);
+  q0u16 = vreinterpretq_u16_s16(q0s16);
+
+  for (d1 = d2 = dest, i = 0; i < 4; i++) {
+    for (j = 0; j < 2; j++) {
+      d2u64 = vld1_u64((const uint64_t *)d1);
+      d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+      d1 += dest_stride;
+      d4u64 = vld1_u64((const uint64_t *)d1);
+      d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+      d1 += dest_stride;
+
+      q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+      q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+      q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+      q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+      d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+      d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+      d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+      d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+      d2 += dest_stride;
+      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+      d2 += dest_stride;
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/idct16x16_add_neon.asm b/aom_dsp/arm/idct16x16_add_neon.asm
new file mode 100644
index 000000000..22a0c9594
--- /dev/null
+++ b/aom_dsp/arm/idct16x16_add_neon.asm
@@ -0,0 +1,1179 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct16x16_256_add_neon_pass1|
+    EXPORT  |vpx_idct16x16_256_add_neon_pass2|
+    EXPORT  |vpx_idct16x16_10_add_neon_pass1|
+    EXPORT  |vpx_idct16x16_10_add_neon_pass2|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input,
+;                                          int16_t *output, int output_stride)
+;
+; r0  int16_t input
+; r1  int16_t *output
+; r2  int  output_stride)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vpx_idct16x16_256_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!
+    vmov.s16        q15, q1
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0xc00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r12, #0x3e00
+    add             r12, #0xc5
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r12                   ; duplicate cospi_4_64
+
+    ; preloading to avoid stall
+    ; generate cospi_12_64 = 13623
+    mov             r3, #0x3500
+    add             r3, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r12, #0x2300
+    add             r12, #0x8e
+
+    ; step2[4] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; step2[4] * cospi_4_64
+    vmull.s16       q5, d18, d1
+    vmull.s16       q6, d19, d1
+
+    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+    vmlal.s16       q5, d30, d0
+    vmlal.s16       q6, d31, d0
+
+    vdup.16         d2, r3                    ; duplicate cospi_12_64
+    vdup.16         d3, r12                   ; duplicate cospi_20_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q5, #14              ; >> 14
+    vqrshrn.s32     d15, q6, #14              ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; step2[5] * cospi_12_64
+    vmull.s16       q2, d26, d2
+    vmull.s16       q3, d27, d2
+
+    ; step2[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q15, d27, d3
+
+    ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q2, d22, d3
+    vmlsl.s16       q3, d23, d3
+
+    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q15, d23, d2
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q2, #14              ; >> 14
+    vqrshrn.s32     d11, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q15, #14             ; >> 14
+
+    ; stage 4
+    vdup.16         d30, r3                   ; cospi_16_64
+
+    ; step1[0] * cospi_16_64
+    vmull.s16       q2, d16, d30
+    vmull.s16       q11, d17, d30
+
+    ; step1[1] * cospi_16_64
+    vmull.s16       q0, d24, d30
+    vmull.s16       q1, d25, d30
+
+    ; generate cospi_8_64 = 15137
+    mov             r3, #0x3b00
+    add             r3, #0x21
+
+    vdup.16         d30, r12                  ; duplicate cospi_24_64
+    vdup.16         d31, r3                   ; duplicate cospi_8_64
+
+    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
+    vadd.s32        q3, q2, q0
+    vadd.s32        q12, q11, q1
+
+    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
+    vsub.s32        q13, q2, q0
+    vsub.s32        q1, q11, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d16, q3, #14              ; >> 14
+    vqrshrn.s32     d17, q12, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d18, q13, #14             ; >> 14
+    vqrshrn.s32     d19, q1, #14              ; >> 14
+
+    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    ; step1[2] * cospi_8_64
+    vmull.s16       q0, d20, d31
+    vmull.s16       q1, d21, d31
+
+    ; step1[2] * cospi_24_64
+    vmull.s16       q12, d20, d30
+    vmull.s16       q13, d21, d30
+
+    ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q0, d28, d30
+    vmlal.s16       q1, d29, d30
+
+    ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q12, d28, d31
+    vmlsl.s16       q13, d29, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d22, q0, #14              ; >> 14
+    vqrshrn.s32     d23, q1, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d20, q12, #14             ; >> 14
+    vqrshrn.s32     d21, q13, #14             ; >> 14
+
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
+    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; stage 5
+    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
+    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
+    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
+    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
+
+    vdup.16         d16, r3;                  ; duplicate cospi_16_64
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q11, d26, d16
+    vmull.s16       q12, d27, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q6, q9, q11
+    vsub.s32        q13, q10, q12
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+    vqrshrn.s32     d11, q13, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6
+    vadd.s16        q8, q0, q15                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
+
+    ; store the data
+    vst1.64         {d16}, [r1], r2
+    vst1.64         {d17}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass1|
+
+;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,
+;                                        int16_t *output,
+;                                        int16_t *pass1Output,
+;                                        int16_t skip_adding,
+;                                        uint8_t *dest,
+;                                        int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding,
+; r4  uint8_t *dest,
+; r5  int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vpx_idct16x16_256_add_neon_pass2| PROC
+    push            {r3-r9}
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!
+    vmov.s16        q15, q0;
+
+    ; generate  cospi_30_64 = 1606
+    mov             r3, #0x0600
+    add             r3, #0x46
+
+    ; generate cospi_2_64  = 16305
+    mov             r12, #0x3f00
+    add             r12, #0xb1
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d12, r3                   ; duplicate cospi_30_64
+    vdup.16         d13, r12                  ; duplicate cospi_2_64
+
+    ; preloading to avoid stall
+    ; generate cospi_14_64 = 12665
+    mov             r3, #0x3100
+    add             r3, #0x79
+
+    ; generate cospi_18_64 = 10394
+    mov             r12, #0x2800
+    add             r12, #0x9a
+
+    ; step1[8] * cospi_30_64
+    vmull.s16       q2, d16, d12
+    vmull.s16       q3, d17, d12
+
+    ; step1[8] * cospi_2_64
+    vmull.s16       q1, d16, d13
+    vmull.s16       q4, d17, d13
+
+    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
+    vmlsl.s16       q2, d30, d13
+    vmlsl.s16       q3, d31, d13
+
+    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
+    vmlal.s16       q1, d30, d12
+    vmlal.s16       q4, d31, d12
+
+    vdup.16         d30, r3                   ; duplicate cospi_14_64
+    vdup.16         d31, r12                  ; duplicate cospi_18_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d0, q2, #14               ; >> 14
+    vqrshrn.s32     d1, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q1, #14              ; >> 14
+    vqrshrn.s32     d15, q4, #14              ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_22_64 = 7723
+    mov             r3, #0x1e00
+    add             r3, #0x2b
+
+    ; generate cospi_10_64 = 14449
+    mov             r12, #0x3800
+    add             r12, #0x71
+
+    ; step1[9] * cospi_14_64
+    vmull.s16       q2, d24, d30
+    vmull.s16       q3, d25, d30
+
+    ; step1[9] * cospi_18_64
+    vmull.s16       q4, d24, d31
+    vmull.s16       q5, d25, d31
+
+    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
+    vmlsl.s16       q2, d22, d31
+    vmlsl.s16       q3, d23, d31
+
+    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
+    vmlal.s16       q4, d22, d30
+    vmlal.s16       q5, d23, d30
+
+    vdup.16         d30, r3                   ; duplicate cospi_22_64
+    vdup.16         d31, r12                  ; duplicate cospi_10_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q2, #14               ; >> 14
+    vqrshrn.s32     d3, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q4, #14              ; >> 14
+    vqrshrn.s32     d13, q5, #14              ; >> 14
+
+    ; step1[10] * cospi_22_64
+    vmull.s16       q11, d20, d30
+    vmull.s16       q12, d21, d30
+
+    ; step1[10] * cospi_10_64
+    vmull.s16       q4, d20, d31
+    vmull.s16       q5, d21, d31
+
+    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
+    vmlsl.s16       q11, d26, d31
+    vmlsl.s16       q12, d27, d31
+
+    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
+    vmlal.s16       q4, d26, d30
+    vmlal.s16       q5, d27, d30
+
+    ; preloading to avoid stall
+    ; generate cospi_6_64 = 15679
+    mov             r3, #0x3d00
+    add             r3, #0x3f
+
+    ; generate cospi_26_64 = 4756
+    mov             r12, #0x1200
+    add             r12, #0x94
+
+    vdup.16         d30, r3                   ; duplicate cospi_6_64
+    vdup.16         d31, r12                  ; duplicate cospi_26_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d11, q5, #14              ; >> 14
+    vqrshrn.s32     d10, q4, #14              ; >> 14
+
+    ; step1[11] * cospi_6_64
+    vmull.s16       q10, d28, d30
+    vmull.s16       q11, d29, d30
+
+    ; step1[11] * cospi_26_64
+    vmull.s16       q12, d28, d31
+    vmull.s16       q13, d29, d31
+
+    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
+    vmlsl.s16       q10, d18, d31
+    vmlsl.s16       q11, d19, d31
+
+    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
+    vmlal.s16       q12, d18, d30
+    vmlal.s16       q13, d19, d30
+
+    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
+    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q11, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q12, #14              ; >> 14
+    vqrshrn.s32     d9, q13, #14              ; >> 14
+
+    ; stage 3
+    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
+    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
+    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
+    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
+    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+tep2[15]
+    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+
+    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d18, d31
+    vmull.s16       q3, d19, d31
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q4, d28, d31
+    vmull.s16       q5, d29, d31
+
+    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d28, d30
+    vmlal.s16       q3, d29, d30
+
+    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q4, d18, d30
+    vmlsl.s16       q5, d19, d30
+
+    rsb             r12, #0
+    vdup.16         d30, r12                  ; duplicate -cospi_8_64
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q4, #14               ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    vmov.s16        q3, q11
+    vmov.s16        q4, q12
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q11, d26, d30
+    vmull.s16       q12, d27, d30
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d20, d30
+    vmull.s16       q9, d21, d30
+
+    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlsl.s16       q11, d20, d31
+    vmlsl.s16       q12, d21, d31
+
+    ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d26, d31
+    vmlal.s16       q9, d27, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6.
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d14, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d14
+    vmull.s16       q4, d27, d14
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q0, d20, d14
+    vmull.s16       q1, d21, d14
+
+    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+    vsub.s32        q5, q3, q0
+    vsub.s32        q6, q4, q1
+
+    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q10, q3, q0
+    vadd.s32        q4, q4, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q10, #14             ; >> 14
+    vqrshrn.s32     d11, q4, #14              ; >> 14
+
+    ; step1[11] * cospi_16_64
+    vmull.s16       q0, d22, d14
+    vmull.s16       q1, d23, d14
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d14
+    vmull.s16       q6, d25, d14
+
+    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q0
+    vsub.s32        q4, q6, q1
+
+    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q0
+    vadd.s32        q6, q6, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q6, #14               ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding
+    cmp              r3, #0                   ; check if need adding dest data
+    beq              skip_adding_dest
+
+    ldr              r7, [sp, #28]            ; dest used to save element 0-7
+    mov              r9, r7                   ; save dest pointer for later use
+    ldr              r8, [sp, #32]            ; load dest_stride
+
+    ; stage 7
+    ; load the data in pass1
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
+    vaddw.u8        q8, q8, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q8                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q9, q9, #6
+    vaddw.u8        q9, q9, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q9                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q2, q2, #6
+    vaddw.u8        q2, q2, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q2                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q3, q3, #6
+    vaddw.u8        q3, q3, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q3                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q4, q4, #6
+    vaddw.u8        q4, q4, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q4                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q5, q5, #6
+    vaddw.u8        q5, q5, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q5                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q14, q14, #6
+    vaddw.u8        q14, q14, d12             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q14                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destinatoin data
+    vrshr.s16       q15, q15, #6
+    vaddw.u8        q15, q15, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q15                  ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    b               end_idct16x16_pass2
+
+skip_adding_dest
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24
+    mov              r3, #8
+
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vst1.64         {d16}, [r1], r3
+    vst1.64         {d17}, [r1], r5
+    vst1.64         {d18}, [r1], r3
+    vst1.64         {d19}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d28}, [r1], r3
+    vst1.64         {d29}, [r1], r5
+    vst1.64         {d30}, [r1], r3
+    vst1.64         {d31}, [r1], r5
+end_idct16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass2|
+
+;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input,
+;                                             int16_t *output, int output_stride)
+;
+; r0  int16_t input
+; r1  int16_t *output
+; r2  int  output_stride)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vpx_idct16x16_10_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!
+    vmov.s16        q15, q1
+
+    ; generate  cospi_28_64*2 = 6392
+    mov             r3, #0x1800
+    add             r3, #0xf8
+
+    ; generate cospi_4_64*2  = 32138
+    mov             r12, #0x7d00
+    add             r12, #0x8a
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply,
+    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
+    ; the constant will change this to >> 14.
+    ; dct_const_round_shift(step2[4] * cospi_28_64);
+    vqrdmulh.s16    q4, q9, q0
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64*2 = 23170
+    mov             r3, #0x5a00
+    add             r3, #0x82
+
+    ; dct_const_round_shift(step2[4] * cospi_4_64);
+    vqrdmulh.s16    q7, q9, q1
+
+    ; stage 4
+    vdup.16         q1, r3                    ; cospi_16_64*2
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    vdup.16         d4, r3;                   ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(step1[0] * cospi_16_64)
+    vqrdmulh.s16    q8, q8, q1
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d14, d4
+    vmull.s16       q10, d15, d4
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q12, d9, d4
+    vmull.s16       q11, d8, d4
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q15, q10, q12
+    vsub.s32        q6, q9, q11
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d11, q15, #14             ; >> 14
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6
+    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
+    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
+    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
+
+    ; store the data
+    vst1.64         {d4}, [r1], r2
+    vst1.64         {d5}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vpx_idct16x16_10_add_neon_pass1|
+
+;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,
+;                                           int16_t *output,
+;                                           int16_t *pass1Output,
+;                                           int16_t skip_adding,
+;                                           uint8_t *dest,
+;                                           int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding,
+; r4  uint8_t *dest,
+; r5  int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vpx_idct16x16_10_add_neon_pass2| PROC
+    push            {r3-r9}
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!
+    vmov.s16        q15, q0;
+
+    ; generate 2*cospi_30_64 = 3212
+    mov             r3, #0xc00
+    add             r3, #0x8c
+
+    ; generate 2*cospi_2_64  = 32610
+    mov             r12, #0x7f00
+    add             r12, #0x62
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
+
+    ; dct_const_round_shift(step1[8] * cospi_30_64)
+    vqrdmulh.s16    q0, q8, q6
+
+    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
+
+    ; dct_const_round_shift(step1[8] * cospi_2_64)
+    vqrdmulh.s16    q7, q8, q6
+
+    ; preloading to avoid stall
+    ; generate 2*cospi_26_64 = 9512
+    mov             r12, #0x2500
+    add             r12, #0x28
+    rsb             r12, #0
+    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
+
+    ; generate 2*cospi_6_64 = 31358
+    mov             r3, #0x7a00
+    add             r3, #0x7e
+    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
+
+    ; dct_const_round_shift(- step1[12] * cospi_26_64)
+    vqrdmulh.s16    q3, q9, q15
+
+    ; dct_const_round_shift(step1[12] * cospi_6_64)
+    vqrdmulh.s16    q4, q9, q14
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q12, d14, d31
+    vmull.s16       q5, d15, d31
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d0, d31
+    vmull.s16       q11, d1, d31
+
+    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q12, d0, d30
+    vmlsl.s16       q5, d1, d30
+
+    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d14, d30
+    vmlal.s16       q11, d15, d30
+
+    rsb              r12, #0
+    vdup.16          d30, r12                 ; duplicate -cospi_8_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q12, #14              ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q11, #14             ; >> 14
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q10, d8, d30
+    vmull.s16       q13, d9, d30
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d6, d30
+    vmull.s16       q9, d7, d30
+
+    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+    vmlsl.s16       q10, d6, d31
+    vmlsl.s16       q13, d7, d31
+
+    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d8, d31
+    vmlal.s16       q9, d9, d31
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q10, #14              ; >> 14
+    vqrshrn.s32     d5, q13, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6.
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d14, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d14
+    vmull.s16       q4, d27, d14
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q0, d20, d14
+    vmull.s16       q1, d21, d14
+
+    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+    vsub.s32        q5, q3, q0
+    vsub.s32        q6, q4, q1
+
+    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q0, q3, q0
+    vadd.s32        q1, q4, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q0, #14              ; >> 14
+    vqrshrn.s32     d11, q1, #14              ; >> 14
+
+    ; step1[11] * cospi_16_64
+    vmull.s16       q0, d22, d14
+    vmull.s16       q1, d23, d14
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d14
+    vmull.s16       q6, d25, d14
+
+    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q0
+    vsub.s32        q4, q6, q1
+
+    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q0
+    vadd.s32        q6, q6, q1
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q6, #14               ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding
+
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24
+    mov              r3, #8
+
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vst1.64         {d16}, [r1], r3
+    vst1.64         {d17}, [r1], r5
+    vst1.64         {d18}, [r1], r3
+    vst1.64         {d19}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d28}, [r1], r3
+    vst1.64         {d29}, [r1], r5
+    vst1.64         {d30}, [r1], r3
+    vst1.64         {d31}, [r1], r5
+end_idct10_16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vpx_idct16x16_10_add_neon_pass2|
+    END
diff --git a/aom_dsp/arm/idct16x16_add_neon.c b/aom_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 000000000..2bb92c683
--- /dev/null
+++ b/aom_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,1294 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "aom_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+                                int16x8_t *q10s16, int16x8_t *q11s16,
+                                int16x8_t *q12s16, int16x8_t *q13s16,
+                                int16x8_t *q14s16, int16x8_t *q15s16) {
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
+  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
+  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+  *q12s16 = vcombine_s16(d17s16, d25s16);
+  *q13s16 = vcombine_s16(d19s16, d27s16);
+  *q14s16 = vcombine_s16(d21s16, d29s16);
+  *q15s16 = vcombine_s16(d23s16, d31s16);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+  q1x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+  q2x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+  q3x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+  *q8s16 = q0x2s16.val[0];
+  *q9s16 = q0x2s16.val[1];
+  *q10s16 = q1x2s16.val[0];
+  *q11s16 = q1x2s16.val[1];
+  *q12s16 = q2x2s16.val[0];
+  *q13s16 = q2x2s16.val[1];
+  *q14s16 = q3x2s16.val[0];
+  *q15s16 = q3x2s16.val[1];
+  return;
+}
+
+void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
+                                      int output_stride) {
+  int16x4_t d0s16, d1s16, d2s16, d3s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+  int16x8x2_t q0x2s16;
+
+  q0x2s16 = vld2q_s16(in);
+  q8s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q9s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q10s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q11s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q12s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q13s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q14s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q15s16 = q0x2s16.val[0];
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+  d20s16 = vget_low_s16(q10s16);
+  d21s16 = vget_high_s16(q10s16);
+  d22s16 = vget_low_s16(q11s16);
+  d23s16 = vget_high_s16(q11s16);
+  d24s16 = vget_low_s16(q12s16);
+  d25s16 = vget_high_s16(q12s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+  d28s16 = vget_low_s16(q14s16);
+  d29s16 = vget_high_s16(q14s16);
+  d30s16 = vget_low_s16(q15s16);
+  d31s16 = vget_high_s16(q15s16);
+
+  // stage 3
+  d0s16 = vdup_n_s16(cospi_28_64);
+  d1s16 = vdup_n_s16(cospi_4_64);
+
+  q2s32 = vmull_s16(d18s16, d0s16);
+  q3s32 = vmull_s16(d19s16, d0s16);
+  q5s32 = vmull_s16(d18s16, d1s16);
+  q6s32 = vmull_s16(d19s16, d1s16);
+
+  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+  d2s16 = vdup_n_s16(cospi_12_64);
+  d3s16 = vdup_n_s16(cospi_20_64);
+
+  d8s16 = vqrshrn_n_s32(q2s32, 14);
+  d9s16 = vqrshrn_n_s32(q3s32, 14);
+  d14s16 = vqrshrn_n_s32(q5s32, 14);
+  d15s16 = vqrshrn_n_s32(q6s32, 14);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+  q7s16 = vcombine_s16(d14s16, d15s16);
+
+  q2s32 = vmull_s16(d26s16, d2s16);
+  q3s32 = vmull_s16(d27s16, d2s16);
+  q9s32 = vmull_s16(d26s16, d3s16);
+  q15s32 = vmull_s16(d27s16, d3s16);
+
+  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+  d10s16 = vqrshrn_n_s32(q2s32, 14);
+  d11s16 = vqrshrn_n_s32(q3s32, 14);
+  d12s16 = vqrshrn_n_s32(q9s32, 14);
+  d13s16 = vqrshrn_n_s32(q15s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  // stage 4
+  d30s16 = vdup_n_s16(cospi_16_64);
+
+  q2s32 = vmull_s16(d16s16, d30s16);
+  q11s32 = vmull_s16(d17s16, d30s16);
+  q0s32 = vmull_s16(d24s16, d30s16);
+  q1s32 = vmull_s16(d25s16, d30s16);
+
+  d30s16 = vdup_n_s16(cospi_24_64);
+  d31s16 = vdup_n_s16(cospi_8_64);
+
+  q3s32 = vaddq_s32(q2s32, q0s32);
+  q12s32 = vaddq_s32(q11s32, q1s32);
+  q13s32 = vsubq_s32(q2s32, q0s32);
+  q1s32 = vsubq_s32(q11s32, q1s32);
+
+  d16s16 = vqrshrn_n_s32(q3s32, 14);
+  d17s16 = vqrshrn_n_s32(q12s32, 14);
+  d18s16 = vqrshrn_n_s32(q13s32, 14);
+  d19s16 = vqrshrn_n_s32(q1s32, 14);
+  q8s16 = vcombine_s16(d16s16, d17s16);
+  q9s16 = vcombine_s16(d18s16, d19s16);
+
+  q0s32 = vmull_s16(d20s16, d31s16);
+  q1s32 = vmull_s16(d21s16, d31s16);
+  q12s32 = vmull_s16(d20s16, d30s16);
+  q13s32 = vmull_s16(d21s16, d30s16);
+
+  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+  d22s16 = vqrshrn_n_s32(q0s32, 14);
+  d23s16 = vqrshrn_n_s32(q1s32, 14);
+  d20s16 = vqrshrn_n_s32(q12s32, 14);
+  d21s16 = vqrshrn_n_s32(q13s32, 14);
+  q10s16 = vcombine_s16(d20s16, d21s16);
+  q11s16 = vcombine_s16(d22s16, d23s16);
+
+  q13s16 = vsubq_s16(q4s16, q5s16);
+  q4s16 = vaddq_s16(q4s16, q5s16);
+  q14s16 = vsubq_s16(q7s16, q6s16);
+  q15s16 = vaddq_s16(q6s16, q7s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+  d28s16 = vget_low_s16(q14s16);
+  d29s16 = vget_high_s16(q14s16);
+
+  // stage 5
+  q0s16 = vaddq_s16(q8s16, q11s16);
+  q1s16 = vaddq_s16(q9s16, q10s16);
+  q2s16 = vsubq_s16(q9s16, q10s16);
+  q3s16 = vsubq_s16(q8s16, q11s16);
+
+  d16s16 = vdup_n_s16(cospi_16_64);
+
+  q11s32 = vmull_s16(d26s16, d16s16);
+  q12s32 = vmull_s16(d27s16, d16s16);
+  q9s32 = vmull_s16(d28s16, d16s16);
+  q10s32 = vmull_s16(d29s16, d16s16);
+
+  q6s32 = vsubq_s32(q9s32, q11s32);
+  q13s32 = vsubq_s32(q10s32, q12s32);
+  q9s32 = vaddq_s32(q9s32, q11s32);
+  q10s32 = vaddq_s32(q10s32, q12s32);
+
+  d10s16 = vqrshrn_n_s32(q6s32, 14);
+  d11s16 = vqrshrn_n_s32(q13s32, 14);
+  d12s16 = vqrshrn_n_s32(q9s32, 14);
+  d13s16 = vqrshrn_n_s32(q10s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  // stage 6
+  q8s16 = vaddq_s16(q0s16, q15s16);
+  q9s16 = vaddq_s16(q1s16, q6s16);
+  q10s16 = vaddq_s16(q2s16, q5s16);
+  q11s16 = vaddq_s16(q3s16, q4s16);
+  q12s16 = vsubq_s16(q3s16, q4s16);
+  q13s16 = vsubq_s16(q2s16, q5s16);
+  q14s16 = vsubq_s16(q1s16, q6s16);
+  q15s16 = vsubq_s16(q0s16, q15s16);
+
+  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+  // store the data
+  output_stride >>= 1;  // output_stride / 2, out is int16_t
+  vst1_u64((uint64_t *)out, d16u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d17u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d18u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d19u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d20u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d21u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d22u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d23u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d24u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d25u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d26u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d27u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d28u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d29u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d30u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d31u64);
+  return;
+}
+
+void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
+                                      int16_t *pass1Output, int16_t skip_adding,
+                                      uint8_t *dest, int dest_stride) {
+  uint8_t *d;
+  uint8x8_t d12u8, d13u8;
+  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  uint64x1_t d24u64, d25u64, d26u64, d27u64;
+  int64x1_t d12s64, d13s64;
+  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32;
+  int16x8x2_t q0x2s16;
+
+  q0x2s16 = vld2q_s16(src);
+  q8s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q9s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q10s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q11s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q12s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q13s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q14s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q15s16 = q0x2s16.val[0];
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+  d20s16 = vget_low_s16(q10s16);
+  d21s16 = vget_high_s16(q10s16);
+  d22s16 = vget_low_s16(q11s16);
+  d23s16 = vget_high_s16(q11s16);
+  d24s16 = vget_low_s16(q12s16);
+  d25s16 = vget_high_s16(q12s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+  d28s16 = vget_low_s16(q14s16);
+  d29s16 = vget_high_s16(q14s16);
+  d30s16 = vget_low_s16(q15s16);
+  d31s16 = vget_high_s16(q15s16);
+
+  // stage 3
+  d12s16 = vdup_n_s16(cospi_30_64);
+  d13s16 = vdup_n_s16(cospi_2_64);
+
+  q2s32 = vmull_s16(d16s16, d12s16);
+  q3s32 = vmull_s16(d17s16, d12s16);
+  q1s32 = vmull_s16(d16s16, d13s16);
+  q4s32 = vmull_s16(d17s16, d13s16);
+
+  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+  d0s16 = vqrshrn_n_s32(q2s32, 14);
+  d1s16 = vqrshrn_n_s32(q3s32, 14);
+  d14s16 = vqrshrn_n_s32(q1s32, 14);
+  d15s16 = vqrshrn_n_s32(q4s32, 14);
+  q0s16 = vcombine_s16(d0s16, d1s16);
+  q7s16 = vcombine_s16(d14s16, d15s16);
+
+  d30s16 = vdup_n_s16(cospi_14_64);
+  d31s16 = vdup_n_s16(cospi_18_64);
+
+  q2s32 = vmull_s16(d24s16, d30s16);
+  q3s32 = vmull_s16(d25s16, d30s16);
+  q4s32 = vmull_s16(d24s16, d31s16);
+  q5s32 = vmull_s16(d25s16, d31s16);
+
+  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+  d2s16 = vqrshrn_n_s32(q2s32, 14);
+  d3s16 = vqrshrn_n_s32(q3s32, 14);
+  d12s16 = vqrshrn_n_s32(q4s32, 14);
+  d13s16 = vqrshrn_n_s32(q5s32, 14);
+  q1s16 = vcombine_s16(d2s16, d3s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  d30s16 = vdup_n_s16(cospi_22_64);
+  d31s16 = vdup_n_s16(cospi_10_64);
+
+  q11s32 = vmull_s16(d20s16, d30s16);
+  q12s32 = vmull_s16(d21s16, d30s16);
+  q4s32 = vmull_s16(d20s16, d31s16);
+  q5s32 = vmull_s16(d21s16, d31s16);
+
+  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+  d4s16 = vqrshrn_n_s32(q11s32, 14);
+  d5s16 = vqrshrn_n_s32(q12s32, 14);
+  d11s16 = vqrshrn_n_s32(q5s32, 14);
+  d10s16 = vqrshrn_n_s32(q4s32, 14);
+  q2s16 = vcombine_s16(d4s16, d5s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  d30s16 = vdup_n_s16(cospi_6_64);
+  d31s16 = vdup_n_s16(cospi_26_64);
+
+  q10s32 = vmull_s16(d28s16, d30s16);
+  q11s32 = vmull_s16(d29s16, d30s16);
+  q12s32 = vmull_s16(d28s16, d31s16);
+  q13s32 = vmull_s16(d29s16, d31s16);
+
+  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+  d6s16 = vqrshrn_n_s32(q10s32, 14);
+  d7s16 = vqrshrn_n_s32(q11s32, 14);
+  d8s16 = vqrshrn_n_s32(q12s32, 14);
+  d9s16 = vqrshrn_n_s32(q13s32, 14);
+  q3s16 = vcombine_s16(d6s16, d7s16);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+
+  // stage 3
+  q9s16 = vsubq_s16(q0s16, q1s16);
+  q0s16 = vaddq_s16(q0s16, q1s16);
+  q10s16 = vsubq_s16(q3s16, q2s16);
+  q11s16 = vaddq_s16(q2s16, q3s16);
+  q12s16 = vaddq_s16(q4s16, q5s16);
+  q13s16 = vsubq_s16(q4s16, q5s16);
+  q14s16 = vsubq_s16(q7s16, q6s16);
+  q7s16 = vaddq_s16(q6s16, q7s16);
+
+  // stage 4
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+  d20s16 = vget_low_s16(q10s16);
+  d21s16 = vget_high_s16(q10s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+  d28s16 = vget_low_s16(q14s16);
+  d29s16 = vget_high_s16(q14s16);
+
+  d30s16 = vdup_n_s16(cospi_8_64);
+  d31s16 = vdup_n_s16(cospi_24_64);
+
+  q2s32 = vmull_s16(d18s16, d31s16);
+  q3s32 = vmull_s16(d19s16, d31s16);
+  q4s32 = vmull_s16(d28s16, d31s16);
+  q5s32 = vmull_s16(d29s16, d31s16);
+
+  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+  d12s16 = vqrshrn_n_s32(q2s32, 14);
+  d13s16 = vqrshrn_n_s32(q3s32, 14);
+  d2s16 = vqrshrn_n_s32(q4s32, 14);
+  d3s16 = vqrshrn_n_s32(q5s32, 14);
+  q1s16 = vcombine_s16(d2s16, d3s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  q3s16 = q11s16;
+  q4s16 = q12s16;
+
+  d30s16 = vdup_n_s16(-cospi_8_64);
+  q11s32 = vmull_s16(d26s16, d30s16);
+  q12s32 = vmull_s16(d27s16, d30s16);
+  q8s32 = vmull_s16(d20s16, d30s16);
+  q9s32 = vmull_s16(d21s16, d30s16);
+
+  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+  d4s16 = vqrshrn_n_s32(q11s32, 14);
+  d5s16 = vqrshrn_n_s32(q12s32, 14);
+  d10s16 = vqrshrn_n_s32(q8s32, 14);
+  d11s16 = vqrshrn_n_s32(q9s32, 14);
+  q2s16 = vcombine_s16(d4s16, d5s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  // stage 5
+  q8s16 = vaddq_s16(q0s16, q3s16);
+  q9s16 = vaddq_s16(q1s16, q2s16);
+  q10s16 = vsubq_s16(q1s16, q2s16);
+  q11s16 = vsubq_s16(q0s16, q3s16);
+  q12s16 = vsubq_s16(q7s16, q4s16);
+  q13s16 = vsubq_s16(q6s16, q5s16);
+  q14s16 = vaddq_s16(q6s16, q5s16);
+  q15s16 = vaddq_s16(q7s16, q4s16);
+
+  // stage 6
+  d20s16 = vget_low_s16(q10s16);
+  d21s16 = vget_high_s16(q10s16);
+  d22s16 = vget_low_s16(q11s16);
+  d23s16 = vget_high_s16(q11s16);
+  d24s16 = vget_low_s16(q12s16);
+  d25s16 = vget_high_s16(q12s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+
+  d14s16 = vdup_n_s16(cospi_16_64);
+
+  q3s32 = vmull_s16(d26s16, d14s16);
+  q4s32 = vmull_s16(d27s16, d14s16);
+  q0s32 = vmull_s16(d20s16, d14s16);
+  q1s32 = vmull_s16(d21s16, d14s16);
+
+  q5s32 = vsubq_s32(q3s32, q0s32);
+  q6s32 = vsubq_s32(q4s32, q1s32);
+  q10s32 = vaddq_s32(q3s32, q0s32);
+  q4s32 = vaddq_s32(q4s32, q1s32);
+
+  d4s16 = vqrshrn_n_s32(q5s32, 14);
+  d5s16 = vqrshrn_n_s32(q6s32, 14);
+  d10s16 = vqrshrn_n_s32(q10s32, 14);
+  d11s16 = vqrshrn_n_s32(q4s32, 14);
+  q2s16 = vcombine_s16(d4s16, d5s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  q0s32 = vmull_s16(d22s16, d14s16);
+  q1s32 = vmull_s16(d23s16, d14s16);
+  q13s32 = vmull_s16(d24s16, d14s16);
+  q6s32 = vmull_s16(d25s16, d14s16);
+
+  q10s32 = vsubq_s32(q13s32, q0s32);
+  q4s32 = vsubq_s32(q6s32, q1s32);
+  q13s32 = vaddq_s32(q13s32, q0s32);
+  q6s32 = vaddq_s32(q6s32, q1s32);
+
+  d6s16 = vqrshrn_n_s32(q10s32, 14);
+  d7s16 = vqrshrn_n_s32(q4s32, 14);
+  d8s16 = vqrshrn_n_s32(q13s32, 14);
+  d9s16 = vqrshrn_n_s32(q6s32, 14);
+  q3s16 = vcombine_s16(d6s16, d7s16);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+
+  // stage 7
+  if (skip_adding != 0) {
+    d = dest;
+    // load the data in pass1
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    d13s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+
+    q12s16 = vaddq_s16(q0s16, q15s16);
+    q13s16 = vaddq_s16(q1s16, q14s16);
+    q12s16 = vrshrq_n_s16(q12s16, 6);
+    q13s16 = vrshrq_n_s16(q13s16, 6);
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+    d += dest_stride;
+    q14s16 = vsubq_s16(q1s16, q14s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    d13s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q12s16 = vaddq_s16(q10s16, q5s16);
+    q13s16 = vaddq_s16(q11s16, q4s16);
+    q12s16 = vrshrq_n_s16(q12s16, 6);
+    q13s16 = vrshrq_n_s16(q13s16, 6);
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+    d += dest_stride;
+    q4s16 = vsubq_s16(q11s16, q4s16);
+    q5s16 = vsubq_s16(q10s16, q5s16);
+
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    d13s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q12s16 = vaddq_s16(q0s16, q3s16);
+    q13s16 = vaddq_s16(q1s16, q2s16);
+    q12s16 = vrshrq_n_s16(q12s16, 6);
+    q13s16 = vrshrq_n_s16(q13s16, 6);
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+    d += dest_stride;
+    q2s16 = vsubq_s16(q1s16, q2s16);
+    q3s16 = vsubq_s16(q0s16, q3s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    d13s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q12s16 = vaddq_s16(q10s16, q9s16);
+    q13s16 = vaddq_s16(q11s16, q8s16);
+    q12s16 = vrshrq_n_s16(q12s16, 6);
+    q13s16 = vrshrq_n_s16(q13s16, 6);
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+    d += dest_stride;
+    q8s16 = vsubq_s16(q11s16, q8s16);
+    q9s16 = vsubq_s16(q10s16, q9s16);
+
+    // store the data  out 8,9,10,11,12,13,14,15
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q8s16 = vrshrq_n_s16(q8s16, 6);
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q9s16 = vrshrq_n_s16(q9s16, 6);
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q2s16 = vrshrq_n_s16(q2s16, 6);
+    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q3s16 = vrshrq_n_s16(q3s16, 6);
+    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q4s16 = vrshrq_n_s16(q4s16, 6);
+    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q5s16 = vrshrq_n_s16(q5s16, 6);
+    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    dest += dest_stride;
+    q14s16 = vrshrq_n_s16(q14s16, 6);
+    q14u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    d += dest_stride;
+
+    d12s64 = vld1_s64((int64_t *)dest);
+    q15s16 = vrshrq_n_s16(q15s16, 6);
+    q15u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
+    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+  } else {  // skip_adding_dest
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q15s16);
+    q13s16 = vaddq_s16(q1s16, q14s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q14s16 = vsubq_s16(q1s16, q14s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q10s16, q5s16);
+    q13s16 = vaddq_s16(q11s16, q4s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q4s16 = vsubq_s16(q11s16, q4s16);
+    q5s16 = vsubq_s16(q10s16, q5s16);
+
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q3s16);
+    q13s16 = vaddq_s16(q1s16, q2s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q2s16 = vsubq_s16(q1s16, q2s16);
+    q3s16 = vsubq_s16(q0s16, q3s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q10s16, q9s16);
+    q13s16 = vaddq_s16(q11s16, q8s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q8s16 = vsubq_s16(q11s16, q8s16);
+    q9s16 = vsubq_s16(q10s16, q9s16);
+
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+    out += 12;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+    out += 4;
+    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+  }
+  return;
+}
+
+void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
+                                     int output_stride) {
+  int16x4_t d4s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int32x4_t q6s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q15s32;
+  int16x8x2_t q0x2s16;
+
+  q0x2s16 = vld2q_s16(in);
+  q8s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q9s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q10s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q11s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q12s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q13s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q14s16 = q0x2s16.val[0];
+  in += 16;
+  q0x2s16 = vld2q_s16(in);
+  q15s16 = q0x2s16.val[0];
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  // stage 3
+  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+  q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+  // stage 4
+  q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+  d4s16 = vdup_n_s16(cospi_16_64);
+
+  q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+  d8s16 = vget_low_s16(q4s16);
+  d9s16 = vget_high_s16(q4s16);
+  d14s16 = vget_low_s16(q7s16);
+  d15s16 = vget_high_s16(q7s16);
+  q9s32 = vmull_s16(d14s16, d4s16);
+  q10s32 = vmull_s16(d15s16, d4s16);
+  q12s32 = vmull_s16(d9s16, d4s16);
+  q11s32 = vmull_s16(d8s16, d4s16);
+
+  q15s32 = vsubq_s32(q10s32, q12s32);
+  q6s32 = vsubq_s32(q9s32, q11s32);
+  q9s32 = vaddq_s32(q9s32, q11s32);
+  q10s32 = vaddq_s32(q10s32, q12s32);
+
+  d11s16 = vqrshrn_n_s32(q15s32, 14);
+  d10s16 = vqrshrn_n_s32(q6s32, 14);
+  d12s16 = vqrshrn_n_s32(q9s32, 14);
+  d13s16 = vqrshrn_n_s32(q10s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  // stage 6
+  q2s16 = vaddq_s16(q8s16, q7s16);
+  q9s16 = vaddq_s16(q8s16, q6s16);
+  q10s16 = vaddq_s16(q8s16, q5s16);
+  q11s16 = vaddq_s16(q8s16, q4s16);
+  q12s16 = vsubq_s16(q8s16, q4s16);
+  q13s16 = vsubq_s16(q8s16, q5s16);
+  q14s16 = vsubq_s16(q8s16, q6s16);
+  q15s16 = vsubq_s16(q8s16, q7s16);
+
+  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+  // store the data
+  output_stride >>= 1;  // output_stride / 2, out is int16_t
+  vst1_u64((uint64_t *)out, d4u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d5u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d18u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d19u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d20u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d21u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d22u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d23u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d24u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d25u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d26u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d27u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d28u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d29u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d30u64);
+  out += output_stride;
+  vst1_u64((uint64_t *)out, d31u64);
+  return;
+}
+
+void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
+                                     int16_t *pass1Output, int16_t skip_adding,
+                                     uint8_t *dest, int dest_stride) {
+  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+  uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+  uint64x1_t d16u64, d17u64, d18u64, d19u64;
+  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32;
+  int16x8x2_t q0x2s16;
+  (void)skip_adding;
+  (void)dest;
+  (void)dest_stride;
+
+  q0x2s16 = vld2q_s16(src);
+  q8s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q9s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q10s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q11s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q12s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q13s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q14s16 = q0x2s16.val[0];
+  src += 16;
+  q0x2s16 = vld2q_s16(src);
+  q15s16 = q0x2s16.val[0];
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  // stage 3
+  q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+  q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+  q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+  q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+  q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+  q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+  q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+  q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+  // stage 4
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+  d6s16 = vget_low_s16(q3s16);
+  d7s16 = vget_high_s16(q3s16);
+  d8s16 = vget_low_s16(q4s16);
+  d9s16 = vget_high_s16(q4s16);
+  d14s16 = vget_low_s16(q7s16);
+  d15s16 = vget_high_s16(q7s16);
+
+  d30s16 = vdup_n_s16(cospi_8_64);
+  d31s16 = vdup_n_s16(cospi_24_64);
+
+  q12s32 = vmull_s16(d14s16, d31s16);
+  q5s32 = vmull_s16(d15s16, d31s16);
+  q2s32 = vmull_s16(d0s16, d31s16);
+  q11s32 = vmull_s16(d1s16, d31s16);
+
+  q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+  q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+  q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+  q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+  d2s16 = vqrshrn_n_s32(q12s32, 14);
+  d3s16 = vqrshrn_n_s32(q5s32, 14);
+  d12s16 = vqrshrn_n_s32(q2s32, 14);
+  d13s16 = vqrshrn_n_s32(q11s32, 14);
+  q1s16 = vcombine_s16(d2s16, d3s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  d30s16 = vdup_n_s16(-cospi_8_64);
+  q10s32 = vmull_s16(d8s16, d30s16);
+  q13s32 = vmull_s16(d9s16, d30s16);
+  q8s32 = vmull_s16(d6s16, d30s16);
+  q9s32 = vmull_s16(d7s16, d30s16);
+
+  q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+  q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+  q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+  q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+  d4s16 = vqrshrn_n_s32(q10s32, 14);
+  d5s16 = vqrshrn_n_s32(q13s32, 14);
+  d10s16 = vqrshrn_n_s32(q8s32, 14);
+  d11s16 = vqrshrn_n_s32(q9s32, 14);
+  q2s16 = vcombine_s16(d4s16, d5s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  // stage 5
+  q8s16 = vaddq_s16(q0s16, q3s16);
+  q9s16 = vaddq_s16(q1s16, q2s16);
+  q10s16 = vsubq_s16(q1s16, q2s16);
+  q11s16 = vsubq_s16(q0s16, q3s16);
+  q12s16 = vsubq_s16(q7s16, q4s16);
+  q13s16 = vsubq_s16(q6s16, q5s16);
+  q14s16 = vaddq_s16(q6s16, q5s16);
+  q15s16 = vaddq_s16(q7s16, q4s16);
+
+  // stage 6
+  d20s16 = vget_low_s16(q10s16);
+  d21s16 = vget_high_s16(q10s16);
+  d22s16 = vget_low_s16(q11s16);
+  d23s16 = vget_high_s16(q11s16);
+  d24s16 = vget_low_s16(q12s16);
+  d25s16 = vget_high_s16(q12s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+
+  d14s16 = vdup_n_s16(cospi_16_64);
+  q3s32 = vmull_s16(d26s16, d14s16);
+  q4s32 = vmull_s16(d27s16, d14s16);
+  q0s32 = vmull_s16(d20s16, d14s16);
+  q1s32 = vmull_s16(d21s16, d14s16);
+
+  q5s32 = vsubq_s32(q3s32, q0s32);
+  q6s32 = vsubq_s32(q4s32, q1s32);
+  q0s32 = vaddq_s32(q3s32, q0s32);
+  q4s32 = vaddq_s32(q4s32, q1s32);
+
+  d4s16 = vqrshrn_n_s32(q5s32, 14);
+  d5s16 = vqrshrn_n_s32(q6s32, 14);
+  d10s16 = vqrshrn_n_s32(q0s32, 14);
+  d11s16 = vqrshrn_n_s32(q4s32, 14);
+  q2s16 = vcombine_s16(d4s16, d5s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  q0s32 = vmull_s16(d22s16, d14s16);
+  q1s32 = vmull_s16(d23s16, d14s16);
+  q13s32 = vmull_s16(d24s16, d14s16);
+  q6s32 = vmull_s16(d25s16, d14s16);
+
+  q10s32 = vsubq_s32(q13s32, q0s32);
+  q4s32 = vsubq_s32(q6s32, q1s32);
+  q13s32 = vaddq_s32(q13s32, q0s32);
+  q6s32 = vaddq_s32(q6s32, q1s32);
+
+  d6s16 = vqrshrn_n_s32(q10s32, 14);
+  d7s16 = vqrshrn_n_s32(q4s32, 14);
+  d8s16 = vqrshrn_n_s32(q13s32, 14);
+  d9s16 = vqrshrn_n_s32(q6s32, 14);
+  q3s16 = vcombine_s16(d6s16, d7s16);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+
+  // stage 7
+  q0s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q1s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q12s16 = vaddq_s16(q0s16, q15s16);
+  q13s16 = vaddq_s16(q1s16, q14s16);
+  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+  vst1_u64((uint64_t *)out, d24u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d25u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d26u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d27u64);
+  out += 12;
+  q14s16 = vsubq_s16(q1s16, q14s16);
+  q15s16 = vsubq_s16(q0s16, q15s16);
+
+  q10s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q11s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q12s16 = vaddq_s16(q10s16, q5s16);
+  q13s16 = vaddq_s16(q11s16, q4s16);
+  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+  vst1_u64((uint64_t *)out, d24u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d25u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d26u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d27u64);
+  out += 12;
+  q4s16 = vsubq_s16(q11s16, q4s16);
+  q5s16 = vsubq_s16(q10s16, q5s16);
+
+  q0s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q1s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q12s16 = vaddq_s16(q0s16, q3s16);
+  q13s16 = vaddq_s16(q1s16, q2s16);
+  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+  vst1_u64((uint64_t *)out, d24u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d25u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d26u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d27u64);
+  out += 12;
+  q2s16 = vsubq_s16(q1s16, q2s16);
+  q3s16 = vsubq_s16(q0s16, q3s16);
+
+  q10s16 = vld1q_s16(pass1Output);
+  pass1Output += 8;
+  q11s16 = vld1q_s16(pass1Output);
+  q12s16 = vaddq_s16(q10s16, q9s16);
+  q13s16 = vaddq_s16(q11s16, q8s16);
+  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+  vst1_u64((uint64_t *)out, d24u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d25u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d26u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d27u64);
+  out += 12;
+  q8s16 = vsubq_s16(q11s16, q8s16);
+  q9s16 = vsubq_s16(q10s16, q9s16);
+
+  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+  d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+  d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+  d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+  d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+  d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+  d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+  vst1_u64((uint64_t *)out, d16u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d17u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d18u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d19u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d4u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d5u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d6u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d7u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d8u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d9u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d10u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d11u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d28u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d29u64);
+  out += 12;
+  vst1_u64((uint64_t *)out, d30u64);
+  out += 4;
+  vst1_u64((uint64_t *)out, d31u64);
+  return;
+}
diff --git a/aom_dsp/arm/idct16x16_neon.c b/aom_dsp/arm/idct16x16_neon.c
new file mode 100644
index 000000000..e205056a0
--- /dev/null
+++ b/aom_dsp/arm/idct16x16_neon.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "aom_dsp/vpx_dsp_common.h"
+
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
+                                      int output_stride);
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
+                                      int16_t *pass1Output, int16_t skip_adding,
+                                      uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
+                                     int output_stride);
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
+                                     int16_t *pass1Output, int16_t skip_adding,
+                                     uint8_t *dest, int dest_stride);
+
+#if HAVE_NEON_ASM
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vpx_push_neon(int64_t *store);
+extern void vpx_pop_neon(int64_t *store);
+#endif  // HAVE_NEON_ASM
+
+void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {
+#if HAVE_NEON_ASM
+  int64_t store_reg[8];
+#endif
+  int16_t pass1_output[16 * 16] = { 0 };
+  int16_t row_idct_output[16 * 16] = { 0 };
+
+#if HAVE_NEON_ASM
+  // save d8-d15 register values.
+  vpx_push_neon(store_reg);
+#endif
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+                                   dest, dest_stride);
+
+  /* Parallel idct on the lower 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
+                                   pass1_output, 0, dest, dest_stride);
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+                                   pass1_output, 1, dest, dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+                                   row_idct_output + 8, pass1_output, 1,
+                                   dest + 8, dest_stride);
+
+#if HAVE_NEON_ASM
+  // restore d8-d15 register values.
+  vpx_pop_neon(store_reg);
+#endif
+
+  return;
+}
+
+void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+#if HAVE_NEON_ASM
+  int64_t store_reg[8];
+#endif
+  int16_t pass1_output[16 * 16] = { 0 };
+  int16_t row_idct_output[16 * 16] = { 0 };
+
+#if HAVE_NEON_ASM
+  // save d8-d15 register values.
+  vpx_push_neon(store_reg);
+#endif
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+                                  dest, dest_stride);
+
+  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+                                   pass1_output, 1, dest, dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+                                   row_idct_output + 8, pass1_output, 1,
+                                   dest + 8, dest_stride);
+
+#if HAVE_NEON_ASM
+  // restore d8-d15 register values.
+  vpx_pop_neon(store_reg);
+#endif
+
+  return;
+}
diff --git a/aom_dsp/arm/idct32x32_1_add_neon.asm b/aom_dsp/arm/idct32x32_1_add_neon.asm
new file mode 100644
index 000000000..96d276b4d
--- /dev/null
+++ b/aom_dsp/arm/idct32x32_1_add_neon.asm
@@ -0,0 +1,144 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct32x32_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ;TODO(hkuang): put the following macros in a seperate
+    ;file so other idct function could also use them.
+    MACRO
+    LD_16x8          $src, $stride
+    vld1.8           {q8}, [$src], $stride
+    vld1.8           {q9}, [$src], $stride
+    vld1.8           {q10}, [$src], $stride
+    vld1.8           {q11}, [$src], $stride
+    vld1.8           {q12}, [$src], $stride
+    vld1.8           {q13}, [$src], $stride
+    vld1.8           {q14}, [$src], $stride
+    vld1.8           {q15}, [$src], $stride
+    MEND
+
+    MACRO
+    ADD_DIFF_16x8    $diff
+    vqadd.u8         q8, q8, $diff
+    vqadd.u8         q9, q9, $diff
+    vqadd.u8         q10, q10, $diff
+    vqadd.u8         q11, q11, $diff
+    vqadd.u8         q12, q12, $diff
+    vqadd.u8         q13, q13, $diff
+    vqadd.u8         q14, q14, $diff
+    vqadd.u8         q15, q15, $diff
+    MEND
+
+    MACRO
+    SUB_DIFF_16x8    $diff
+    vqsub.u8         q8, q8, $diff
+    vqsub.u8         q9, q9, $diff
+    vqsub.u8         q10, q10, $diff
+    vqsub.u8         q11, q11, $diff
+    vqsub.u8         q12, q12, $diff
+    vqsub.u8         q13, q13, $diff
+    vqsub.u8         q14, q14, $diff
+    vqsub.u8         q15, q15, $diff
+    MEND
+
+    MACRO
+    ST_16x8          $dst, $stride
+    vst1.8           {q8}, [$dst], $stride
+    vst1.8           {q9}, [$dst], $stride
+    vst1.8           {q10},[$dst], $stride
+    vst1.8           {q11},[$dst], $stride
+    vst1.8           {q12},[$dst], $stride
+    vst1.8           {q13},[$dst], $stride
+    vst1.8           {q14},[$dst], $stride
+    vst1.8           {q15},[$dst], $stride
+    MEND
+
+;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+;                              int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vpx_idct32x32_1_add_neon| PROC
+    push             {lr}
+    pld              [r1]
+    add              r3, r1, #16               ; r3 dest + 16 for second loop
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asrs             r0, r0, #6                ; >> 6
+    bge              diff_positive_32_32
+
+diff_negative_32_32
+    neg              r0, r0
+    usat             r0, #8, r0
+    vdup.u8          q0, r0
+    mov              r0, #4
+
+diff_negative_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    cmp              r0, #2
+    moveq            r1, r3
+    moveq            r12, r3
+    cmp              r0, #0
+    bne              diff_negative_32_32_loop
+    pop              {pc}
+
+diff_positive_32_32
+    usat             r0, #8, r0
+    vdup.u8          q0, r0
+    mov              r0, #4
+
+diff_positive_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    cmp              r0, #2
+    moveq            r1, r3
+    moveq            r12, r3
+    cmp              r0, #0
+    bne              diff_positive_32_32_loop
+    pop              {pc}
+
+    ENDP             ; |vpx_idct32x32_1_add_neon|
+    END
diff --git a/aom_dsp/arm/idct32x32_1_add_neon.c b/aom_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 000000000..35bfc6677
--- /dev/null
+++ b/aom_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
+  *q8u8 = vld1q_u8(d);
+  d += d_stride;
+  *q9u8 = vld1q_u8(d);
+  d += d_stride;
+  *q10u8 = vld1q_u8(d);
+  d += d_stride;
+  *q11u8 = vld1q_u8(d);
+  d += d_stride;
+  *q12u8 = vld1q_u8(d);
+  d += d_stride;
+  *q13u8 = vld1q_u8(d);
+  d += d_stride;
+  *q14u8 = vld1q_u8(d);
+  d += d_stride;
+  *q15u8 = vld1q_u8(d);
+  return;
+}
+
+static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
+  *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+  *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+  *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+  *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+  *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+  *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+  *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+  *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+  return;
+}
+
+static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
+  *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+  *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+  *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+  *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+  *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+  *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+  *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+  *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+  return;
+}
+
+static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
+  vst1q_u8(d, *q8u8);
+  d += d_stride;
+  vst1q_u8(d, *q9u8);
+  d += d_stride;
+  vst1q_u8(d, *q10u8);
+  d += d_stride;
+  vst1q_u8(d, *q11u8);
+  d += d_stride;
+  vst1q_u8(d, *q12u8);
+  d += d_stride;
+  vst1q_u8(d, *q13u8);
+  d += d_stride;
+  vst1q_u8(d, *q14u8);
+  d += d_stride;
+  vst1q_u8(d, *q15u8);
+  return;
+}
+
+void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+  int i, j, dest_stride8;
+  uint8_t *d;
+  int16_t a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  dest_stride8 = dest_stride * 8;
+  if (a1 >= 0) {  // diff_positive_32_32
+    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+    q0u8 = vdupq_n_u8(a1);
+    for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
+      d = dest;
+      for (j = 0; j < 4; j++) {
+        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
+        ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                      &q14u8, &q15u8);
+        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
+        d += dest_stride8;
+      }
+    }
+  } else {  // diff_negative_32_32
+    a1 = -a1;
+    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+    q0u8 = vdupq_n_u8(a1);
+    for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
+      d = dest;
+      for (j = 0; j < 4; j++) {
+        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
+        SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                      &q14u8, &q15u8);
+        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
+        d += dest_stride8;
+      }
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/idct32x32_add_neon.asm b/aom_dsp/arm/idct32x32_add_neon.asm
new file mode 100644
index 000000000..7483ee77e
--- /dev/null
+++ b/aom_dsp/arm/idct32x32_add_neon.asm
@@ -0,0 +1,1299 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;TODO(cd): adjust these constant to be able to use vqdmulh for faster
+;          dct_const_round_shift(a * b) within butterfly calculations.
+cospi_1_64  EQU 16364
+cospi_2_64  EQU 16305
+cospi_3_64  EQU 16207
+cospi_4_64  EQU 16069
+cospi_5_64  EQU 15893
+cospi_6_64  EQU 15679
+cospi_7_64  EQU 15426
+cospi_8_64  EQU 15137
+cospi_9_64  EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU  9760
+cospi_20_64 EQU  9102
+cospi_21_64 EQU  8423
+cospi_22_64 EQU  7723
+cospi_23_64 EQU  7005
+cospi_24_64 EQU  6270
+cospi_25_64 EQU  5520
+cospi_26_64 EQU  4756
+cospi_27_64 EQU  3981
+cospi_28_64 EQU  3196
+cospi_29_64 EQU  2404
+cospi_30_64 EQU  1606
+cospi_31_64 EQU   804
+
+
+    EXPORT  |vpx_idct32x32_1024_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY
+
+    ; --------------------------------------------------------------------------
+    ; Load from transposed_buffer
+    ;   q13 = transposed_buffer[first_offset]
+    ;   q14 = transposed_buffer[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   transposed_buffer must be passed in. use 0 for first use.
+    MACRO
+    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+    ; address calculation with proper stride and loading
+    add r0, #($first_offset  - $prev_offset )*8*2
+    vld1.s16        {q14}, [r0]
+    add r0, #($second_offset - $first_offset)*8*2
+    vld1.s16        {q13}, [r0]
+    ; (used) two registers (q14, q13)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Load from output (used as temporary storage)
+    ;   reg1 = output[first_offset]
+    ;   reg2 = output[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output, whether reading or storing) must be passed in. use 0 for first
+    ;   use.
+    MACRO
+    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation with proper stride and loading
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vld1.s16        {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vld1.s16        {$reg2}, [r1]
+    ; (used) two registers ($reg1, $reg2)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Store into output (sometimes as as temporary storage)
+    ;   output[first_offset] = reg1
+    ;   output[second_offset] = reg2
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output, whether reading or storing) must be passed in. use 0 for first
+    ;   use.
+    MACRO
+    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation with proper stride and storing
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vst1.16 {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vst1.16 {$reg2}, [r1]
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q6-q9 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_CENTER_RESULTS
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d8}, [r10], r2
+    vld1.s16        {d11}, [r9], r11
+    vld1.s16        {d9}, [r10]
+    vld1.s16        {d10}, [r9]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
+    vrshr.s16       q6, q6, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q7, q7, d9
+    vaddw.u8        q8, q8, d10
+    vaddw.u8        q9, q9, d11
+    vaddw.u8        q6, q6, d8
+    ; clip pixel
+    vqmovun.s16     d9,  q7
+    vqmovun.s16     d10, q8
+    vqmovun.s16     d11, q9
+    vqmovun.s16     d8,  q6
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d9}, [r10], r11
+    vst1.16         {d10}, [r9], r2
+    vst1.16         {d8}, [r10]
+    vst1.16         {d11}, [r9]
+    ; update pointers (by dest_stride * 2)
+    sub r9,  r9,  r2, lsl #1
+    add r10, r10, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q6-q9 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_CENTER_RESULTS_LAST
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d8}, [r10], r2
+    vld1.s16        {d11}, [r9], r11
+    vld1.s16        {d9}, [r10]
+    vld1.s16        {d10}, [r9]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
+    vrshr.s16       q6, q6, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q7, q7, d9
+    vaddw.u8        q8, q8, d10
+    vaddw.u8        q9, q9, d11
+    vaddw.u8        q6, q6, d8
+    ; clip pixel
+    vqmovun.s16     d9,  q7
+    vqmovun.s16     d10, q8
+    vqmovun.s16     d11, q9
+    vqmovun.s16     d8,  q6
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d9}, [r10], r11
+    vst1.16         {d10}, [r9], r2
+    vst1.16         {d8}, [r10]!
+    vst1.16         {d11}, [r9]!
+    ; update pointers (by dest_stride * 2)
+    sub r9,  r9,  r2, lsl #1
+    add r10, r10, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q4-q7 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_EXTREME_RESULTS
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d4}, [r7], r2
+    vld1.s16        {d7}, [r6], r11
+    vld1.s16        {d5}, [r7]
+    vld1.s16        {d6}, [r6]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q5, q5, #6
+    vrshr.s16       q6, q6, #6
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q4, q4, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q5, q5, d5
+    vaddw.u8        q6, q6, d6
+    vaddw.u8        q7, q7, d7
+    vaddw.u8        q4, q4, d4
+    ; clip pixel
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    vqmovun.s16     d4, q4
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d5}, [r7], r11
+    vst1.16         {d6}, [r6], r2
+    vst1.16         {d7}, [r6]
+    vst1.16         {d4}, [r7]
+    ; update pointers (by dest_stride * 2)
+    sub r6, r6, r2, lsl #1
+    add r7, r7, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q4-q7 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_EXTREME_RESULTS_LAST
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d4}, [r7], r2
+    vld1.s16        {d7}, [r6], r11
+    vld1.s16        {d5}, [r7]
+    vld1.s16        {d6}, [r6]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q5, q5, #6
+    vrshr.s16       q6, q6, #6
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q4, q4, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q5, q5, d5
+    vaddw.u8        q6, q6, d6
+    vaddw.u8        q7, q7, d7
+    vaddw.u8        q4, q4, d4
+    ; clip pixel
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    vqmovun.s16     d4, q4
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d5}, [r7], r11
+    vst1.16         {d6}, [r6], r2
+    vst1.16         {d7}, [r6]!
+    vst1.16         {d4}, [r7]!
+    ; update pointers (by dest_stride * 2)
+    sub r6, r6, r2, lsl #1
+    add r7, r7, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    ; TODO(cd): have special case to re-use constants when they are similar for
+    ;           consecutive butterflies
+    ; TODO(cd): have special case when both constants are the same, do the
+    ;           additions/subtractions before the multiplies.
+    ; generate the constants
+    ;   generate scalar constants
+    mov             r8,  #$first_constant  & 0xFF00
+    mov             r12, #$second_constant & 0xFF00
+    add             r8,  #$first_constant  & 0x00FF
+    add             r12, #$second_constant & 0x00FF
+    ;   generate vector constants
+    vdup.16         d30, r8
+    vdup.16         d31, r12
+    ; (used) two for inputs (regA-regD), one for constants (q15)
+    ; do some multiplications (ordered for maximum latency hiding)
+    vmull.s16 q8,  $regC, d30
+    vmull.s16 q10, $regA, d31
+    vmull.s16 q9,  $regD, d30
+    vmull.s16 q11, $regB, d31
+    vmull.s16 q12, $regC, d31
+    ; (used) five for intermediate (q8-q12), one for constants (q15)
+    ; do some addition/subtractions (to get back two register)
+    vsub.s32  q8, q8, q10
+    vsub.s32  q9, q9, q11
+    ; do more multiplications (ordered for maximum latency hiding)
+    vmull.s16 q10, $regD, d31
+    vmull.s16 q11, $regA, d30
+    vmull.s16 q15, $regB, d30
+    ; (used) six for intermediate (q8-q12, q15)
+    ; do more addition/subtractions
+    vadd.s32  q11, q12, q11
+    vadd.s32  q10, q10, q15
+    ; (used) four for intermediate (q8-q11)
+    ; dct_const_round_shift
+    vqrshrn.s32 $reg1, q8,  #14
+    vqrshrn.s32 $reg2, q9,  #14
+    vqrshrn.s32 $reg3, q11, #14
+    vqrshrn.s32 $reg4, q10, #14
+    ; (used) two for results, well four d registers
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    MEND
+    ; --------------------------------------------------------------------------
+
+;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;
+;   r0  int16_t *input,
+;   r1  uint8_t *dest,
+;   r2  int dest_stride)
+; loop counters
+;   r4  bands loop counter
+;   r5  pass loop counter
+;   r8  transpose loop counter
+; combine-add pointers
+;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
+;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
+;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
+;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
+
+|vpx_idct32x32_1024_add_neon| PROC
+    ; This function does one pass of idct32x32 transform.
+    ;
+    ; This is done by transposing the input and then doing a 1d transform on
+    ; columns. In the first pass, the transposed columns are the original
+    ; rows. In the second pass, after the transposition, the colums are the
+    ; original columns.
+    ; The 1d transform is done by looping over bands of eight columns (the
+    ; idct32_bands loop). For each band, the transform input transposition
+    ; is done on demand, one band of four 8x8 matrices at a time. The four
+    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+    push  {r4-r11}
+    vpush {d8-d15}
+    ; stack operation
+    ; internal buffer used to transpose 8 lines into before transforming them
+    ;   int16_t transpose_buffer[32 * 8];
+    ;   at sp + [4096, 4607]
+    ; results of the first pass (transpose and transform rows)
+    ;   int16_t pass1[32 * 32];
+    ;   at sp + [0, 2047]
+    ; results of the second pass (transpose and transform columns)
+    ;   int16_t pass2[32 * 32];
+    ;   at sp + [2048, 4095]
+    sub sp, sp, #512+2048+2048
+
+    ; r6  = dest + 31 * dest_stride
+    ; r7  = dest +  0 * dest_stride
+    ; r9  = dest + 15 * dest_stride
+    ; r10 = dest + 16 * dest_stride
+    rsb r6,  r2, r2, lsl #5
+    rsb r9,  r2, r2, lsl #4
+    add r10, r1, r2, lsl #4
+    mov r7, r1
+    add r6, r6, r1
+    add r9, r9, r1
+    ; r11 = -dest_stride
+    neg r11, r2
+    ; r3 = input
+    mov r3, r0
+    ; parameters for first pass
+      ; r0 = transpose_buffer[32 * 8]
+    add r0, sp, #4096
+      ; r1 = pass1[32 * 32]
+    mov r1, sp
+
+    mov r5, #0          ; initialize pass loop counter
+idct32_pass_loop
+    mov r4, #4          ; initialize bands loop counter
+idct32_bands_loop
+    mov r8, #2          ; initialize transpose loop counter
+idct32_transpose_pair_loop
+    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
+    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
+    ; adjusted to 32 because of the two post-increments.
+    vld1.s16        {q8},  [r3]!
+    vld1.s16        {q0},  [r3]!
+    add r3, #32
+    vld1.s16        {q9},  [r3]!
+    vld1.s16        {q1},  [r3]!
+    add r3, #32
+    vld1.s16        {q10}, [r3]!
+    vld1.s16        {q2},  [r3]!
+    add r3, #32
+    vld1.s16        {q11}, [r3]!
+    vld1.s16        {q3},  [r3]!
+    add r3, #32
+    vld1.s16        {q12}, [r3]!
+    vld1.s16        {q4},  [r3]!
+    add r3, #32
+    vld1.s16        {q13}, [r3]!
+    vld1.s16        {q5},  [r3]!
+    add r3, #32
+    vld1.s16        {q14}, [r3]!
+    vld1.s16        {q6},  [r3]!
+    add r3, #32
+    vld1.s16        {q15}, [r3]!
+    vld1.s16        {q7},  [r3]!
+
+    ; Transpose the two 8x8 16bit data matrices.
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vswp            d1,  d8
+    vswp            d7,  d14
+    vswp            d5,  d12
+    vswp            d3,  d10
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.32         q0,  q2
+    vtrn.32         q1,  q3
+    vtrn.32         q4,  q6
+    vtrn.32         q5,  q7
+    vtrn.16         q8,  q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.16         q0,  q1
+    vtrn.16         q2,  q3
+    vtrn.16         q4,  q5
+    vtrn.16         q6,  q7
+
+    ; Store both matrices after each other. There is a stride of 32, which
+    ; adjusts to nothing because of the post-increments.
+    vst1.16        {q8},  [r0]!
+    vst1.16        {q9},  [r0]!
+    vst1.16        {q10}, [r0]!
+    vst1.16        {q11}, [r0]!
+    vst1.16        {q12}, [r0]!
+    vst1.16        {q13}, [r0]!
+    vst1.16        {q14}, [r0]!
+    vst1.16        {q15}, [r0]!
+    vst1.16        {q0},  [r0]!
+    vst1.16        {q1},  [r0]!
+    vst1.16        {q2},  [r0]!
+    vst1.16        {q3},  [r0]!
+    vst1.16        {q4},  [r0]!
+    vst1.16        {q5},  [r0]!
+    vst1.16        {q6},  [r0]!
+    vst1.16        {q7},  [r0]!
+
+    ; increment pointers by adjusted stride (not necessary for r0/out)
+    ;   go back by 7*32 for the seven lines moved fully by read and add
+    ;   go back by 32 for the eigth line only read
+    ;   advance by 16*2 to go the next pair
+    sub r3,  r3,  #7*32*2 + 32 - 16*2
+    ; transpose pair loop processing
+    subs r8, r8, #1
+    bne idct32_transpose_pair_loop
+
+    ; restore r0/input to its original value
+    sub r0, r0, #32*8*2
+
+    ; Instead of doing the transforms stage by stage, it is done by loading
+    ; some input values and doing as many stages as possible to minimize the
+    ; storing/loading of intermediate results. To fit within registers, the
+    ; final coefficients are cut into four blocks:
+    ; BLOCK A: 16-19,28-31
+    ; BLOCK B: 20-23,24-27
+    ; BLOCK C: 8-10,11-15
+    ; BLOCK D: 0-3,4-7
+    ; Blocks A and C are straight calculation through the various stages. In
+    ; block B, further calculations are performed using the results from
+    ; block A. In block D, further calculations are performed using the results
+    ; from block C and then the final calculations are done using results from
+    ; block A and B which have been combined at the end of block B.
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK A: 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; generate 16,17,30,31
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
+    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
+    ;step1b[16][i] = dct_const_round_shift(temp1);
+    ;step1b[31][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 0, 1, 31
+    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+    ;step1b[17][i] = dct_const_round_shift(temp1);
+    ;step1b[30][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 31, 17, 15
+    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[16] =  step1b[16][i] + step1b[17][i];
+    ;step2[17] =  step1b[16][i] - step1b[17][i];
+    ;step2[30] = -step1b[30][i] + step1b[31][i];
+    ;step2[31] =  step1b[30][i] + step1b[31][i];
+    vadd.s16  q4, q0, q1
+    vsub.s16  q13, q0, q1
+    vadd.s16  q6, q2, q3
+    vsub.s16  q14, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+    ;temp2 = step1b[30][i] * cospi_4_64  - step1b[17][i] * cospi_28_64;
+    ;step3[17] = dct_const_round_shift(temp1);
+    ;step3[30] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; generate 18,19,28,29
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
+    ;step1b[18][i] = dct_const_round_shift(temp1);
+    ;step1b[29][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 15, 9, 23
+    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
+    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+    ;step1b[19][i] = dct_const_round_shift(temp1);
+    ;step1b[28][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 23, 25, 7
+    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[18] = -step1b[18][i] + step1b[19][i];
+    ;step2[19] =  step1b[18][i] + step1b[19][i];
+    ;step2[28] =  step1b[28][i] + step1b[29][i];
+    ;step2[29] =  step1b[28][i] - step1b[29][i];
+    vsub.s16  q13, q3, q2
+    vadd.s16  q3,  q3, q2
+    vsub.s16  q14, q1, q0
+    vadd.s16  q2,  q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
+    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+    ;step3[29] = dct_const_round_shift(temp1);
+    ;step3[18] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+    ; --------------------------------------------------------------------------
+    ; combine 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[16] = step1b[16][i] + step1b[19][i];
+    ;step1[17] = step1b[17][i] + step1b[18][i];
+    ;step1[18] = step1b[17][i] - step1b[18][i];
+    ;step1[29] = step1b[30][i] - step1b[29][i];
+    ;step1[30] = step1b[30][i] + step1b[29][i];
+    ;step1[31] = step1b[31][i] + step1b[28][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q0
+    vadd.s16  q10, q7, q1
+    vadd.s16  q15, q6, q3
+    vsub.s16  q13, q5, q0
+    vsub.s16  q14, q7, q1
+    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
+    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
+    ;step2[18] = dct_const_round_shift(temp1);
+    ;step2[29] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+    STORE_IN_OUTPUT 30, 29, 18, q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[19] = step1b[16][i] - step1b[19][i];
+    ;step1[28] = step1b[31][i] - step1b[28][i];
+    vsub.s16  q13, q4, q2
+    vsub.s16  q14, q6, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
+    ;step2[19] = dct_const_round_shift(temp1);
+    ;step2[28] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+    STORE_IN_OUTPUT 18, 19, 28, q4, q6
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK B: 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; generate 20,21,26,27
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
+    ;step1b[20][i] = dct_const_round_shift(temp1);
+    ;step1b[27][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 7, 5, 27
+    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+    ;step1b[21][i] = dct_const_round_shift(temp1);
+    ;step1b[26][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 27, 21, 11
+    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[20] =  step1b[20][i] + step1b[21][i];
+    ;step2[21] =  step1b[20][i] - step1b[21][i];
+    ;step2[26] = -step1b[26][i] + step1b[27][i];
+    ;step2[27] =  step1b[26][i] + step1b[27][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+    ;step3[21] = dct_const_round_shift(temp1);
+    ;step3[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 22,23,24,25
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+    ;step1b[22][i] = dct_const_round_shift(temp1);
+    ;step1b[25][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 11, 13, 19
+    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
+    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+    ;step1b[23][i] = dct_const_round_shift(temp1);
+    ;step1b[24][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 19, 29, 3
+    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[22] = -step1b[22][i] + step1b[23][i];
+    ;step2[23] =  step1b[22][i] + step1b[23][i];
+    ;step2[24] =  step1b[24][i] + step1b[25][i];
+    ;step2[25] =  step1b[24][i] - step1b[25][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+    ;step3[25] = dct_const_round_shift(temp1);
+    ;step3[22] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[22] = step1b[22][i] + step1b[21][i];
+    ;step1[23] = step1b[23][i] + step1b[20][i];
+    vadd.s16  q10, q7, q1
+    vadd.s16  q11, q5, q0
+    ;step1[24] = step1b[24][i] + step1b[27][i];
+    ;step1[25] = step1b[25][i] + step1b[26][i];
+    vadd.s16  q12, q6, q2
+    vadd.s16  q15, q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[16] = step1b[16][i] + step1b[23][i];
+    ;step3[17] = step1b[17][i] + step1b[22][i];
+    ;step3[22] = step1b[17][i] - step1b[22][i];
+    ;step3[23] = step1b[16][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+    vadd.s16  q8,  q14, q11
+    vadd.s16  q9,  q13, q10
+    vsub.s16  q13, q13, q10
+    vsub.s16  q11, q14, q11
+    STORE_IN_OUTPUT 17, 17, 16, q9, q8
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[24] = step1b[31][i] - step1b[24][i];
+    ;step3[25] = step1b[30][i] - step1b[25][i];
+    ;step3[30] = step1b[30][i] + step1b[25][i];
+    ;step3[31] = step1b[31][i] + step1b[24][i];
+    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+    vsub.s16  q8,  q9,  q12
+    vadd.s16  q10, q14, q15
+    vsub.s16  q14, q14, q15
+    vadd.s16  q12, q9,  q12
+    STORE_IN_OUTPUT 31, 30, 31, q10, q12
+    ; --------------------------------------------------------------------------
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpush {q8}  ; [24]
+    vpush {q11} ; [23]
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+    ;step1[22] = dct_const_round_shift(temp1);
+    ;step1[25] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 31, 25, 22, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+    ;step1[23] = dct_const_round_shift(temp1);
+    ;step1[24] = dct_const_round_shift(temp2);
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpop  {q13} ; [23]
+    vpop  {q14} ; [24]
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 22, 24, 23, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[20] = step1b[23][i] - step1b[20][i];
+    ;step1[27] = step1b[24][i] - step1b[27][i];
+    vsub.s16  q14, q5, q0
+    vsub.s16  q13, q6, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
+    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+    ;step2[27] = dct_const_round_shift(temp1);
+    ;step2[20] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[21] = step1b[22][i] - step1b[21][i];
+    ;step1[26] = step1b[25][i] - step1b[26][i];
+    vsub.s16  q14,  q7, q1
+    vsub.s16  q13,  q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
+    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+    ;step2[26] = dct_const_round_shift(temp1);
+    ;step2[21] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[18] = step1b[18][i] + step1b[21][i];
+    ;step3[19] = step1b[19][i] + step1b[20][i];
+    ;step3[20] = step1b[19][i] - step1b[20][i];
+    ;step3[21] = step1b[18][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+    vadd.s16  q8,  q14, q1
+    vadd.s16  q9,  q13, q6
+    vsub.s16  q13, q13, q6
+    vsub.s16  q1,  q14, q1
+    STORE_IN_OUTPUT 19, 18, 19, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[27] = step1b[28][i] - step1b[27][i];
+    ;step3[28] = step1b[28][i] + step1b[27][i];
+    ;step3[29] = step1b[29][i] + step1b[26][i];
+    ;step3[26] = step1b[29][i] - step1b[26][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+    vsub.s16  q14, q8, q5
+    vadd.s16  q10, q8, q5
+    vadd.s16  q11, q9, q0
+    vsub.s16  q0, q9, q0
+    STORE_IN_OUTPUT 29, 28, 29, q10, q11
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+    ;step1[20] = dct_const_round_shift(temp1);
+    ;step1[27] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 29, 20, 27, q13, q14
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+    ;step1[21] = dct_const_round_shift(temp1);
+    ;step1[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+    STORE_IN_OUTPUT 27, 21, 26, q1, q0
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK C: 8-10,11-15
+    ; --------------------------------------------------------------------------
+    ; generate 8,9,14,15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+    ;step2[8] = dct_const_round_shift(temp1);
+    ;step2[15] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 3, 2, 30
+    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+    ;step2[9] = dct_const_round_shift(temp1);
+    ;step2[14] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 30, 18, 14
+    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[8] = step1b[8][i] + step1b[9][i];
+    ;step3[9] = step1b[8][i] - step1b[9][i];
+    ;step3[14] = step1b[15][i] - step1b[14][i];
+    ;step3[15] = step1b[15][i] + step1b[14][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
+    ;step1[9]  = dct_const_round_shift(temp1);
+    ;step1[14] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 10,11,12,13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+    ;step2[10] = dct_const_round_shift(temp1);
+    ;step2[13] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 14, 10, 22
+    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+    ;step2[11] = dct_const_round_shift(temp1);
+    ;step2[12] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 22, 26, 6
+    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[10] = step1b[11][i] - step1b[10][i];
+    ;step3[11] = step1b[11][i] + step1b[10][i];
+    ;step3[12] = step1b[12][i] + step1b[13][i];
+    ;step3[13] = step1b[12][i] - step1b[13][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
+    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+    ;step1[13] = dct_const_round_shift(temp1);
+    ;step1[10] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 8-10,11-15
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[8]  = step1b[8][i] + step1b[11][i];
+    ;step2[9]  = step1b[9][i] + step1b[10][i];
+    ;step2[10] = step1b[9][i] - step1b[10][i];
+    vadd.s16  q8,  q0, q5
+    vadd.s16  q9,  q1, q7
+    vsub.s16  q13, q1, q7
+    ;step2[13] = step1b[14][i] - step1b[13][i];
+    ;step2[14] = step1b[14][i] + step1b[13][i];
+    ;step2[15] = step1b[15][i] + step1b[12][i];
+    vsub.s16  q14, q3, q4
+    vadd.s16  q10, q3, q4
+    vadd.s16  q15, q2, q6
+    STORE_IN_OUTPUT 26, 8, 15, q8, q15
+    STORE_IN_OUTPUT 15, 9, 14, q9, q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+    ;step3[10] = dct_const_round_shift(temp1);
+    ;step3[13] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 14, 13, 10, q3, q1
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[11] = step1b[8][i] - step1b[11][i];
+    ;step2[12] = step1b[15][i] - step1b[12][i];
+    vsub.s16  q13, q0, q5
+    vsub.s16  q14,  q2, q6
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+    ;step3[11] = dct_const_round_shift(temp1);
+    ;step3[12] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 10, 11, 12, q1, q3
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK D: 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; generate 4,5,6,7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+    ;step3[4] = dct_const_round_shift(temp1);
+    ;step3[7] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 6, 4, 28
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+    ;step3[5] = dct_const_round_shift(temp1);
+    ;step3[6] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 28, 20, 12
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[4] = step1b[4][i] + step1b[5][i];
+    ;step1[5] = step1b[4][i] - step1b[5][i];
+    ;step1[6] = step1b[7][i] - step1b[6][i];
+    ;step1[7] = step1b[7][i] + step1b[6][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+    ;step2[5] = dct_const_round_shift(temp1);
+    ;step2[6] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 0,1,2,3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+    ;step1[1] = dct_const_round_shift(temp1);
+    ;step1[0] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 12, 0, 16
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+    ;step1[2] = dct_const_round_shift(temp1);
+    ;step1[3] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 16, 8, 24
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[0] = step1b[0][i] + step1b[3][i];
+    ;step2[1] = step1b[1][i] + step1b[2][i];
+    ;step2[2] = step1b[1][i] - step1b[2][i];
+    ;step2[3] = step1b[0][i] - step1b[3][i];
+    vadd.s16  q4, q7, q6
+    vsub.s16  q7, q7, q6
+    vsub.s16  q6, q5, q14
+    vadd.s16  q5, q5, q14
+    ; --------------------------------------------------------------------------
+    ; combine 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[0] = step1b[0][i] + step1b[7][i];
+    ;step3[1] = step1b[1][i] + step1b[6][i];
+    ;step3[2] = step1b[2][i] + step1b[5][i];
+    ;step3[3] = step1b[3][i] + step1b[4][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q3
+    vadd.s16  q10, q6, q1
+    vadd.s16  q11, q7, q0
+    ;step3[4] = step1b[3][i] - step1b[4][i];
+    ;step3[5] = step1b[2][i] - step1b[5][i];
+    ;step3[6] = step1b[1][i] - step1b[6][i];
+    ;step3[7] = step1b[0][i] - step1b[7][i];
+    vsub.s16  q12, q7, q0
+    vsub.s16  q13, q6, q1
+    vsub.s16  q14, q5, q3
+    vsub.s16  q15, q4, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[0] = step1b[0][i] + step1b[15][i];
+    ;step1[1] = step1b[1][i] + step1b[14][i];
+    ;step1[14] = step1b[1][i] - step1b[14][i];
+    ;step1[15] = step1b[0][i] - step1b[15][i];
+    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+    vadd.s16  q2, q8, q1
+    vadd.s16  q3, q9, q0
+    vsub.s16  q4, q9, q0
+    vsub.s16  q5, q8, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+
+    cmp r5, #0
+    bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+    STORE_IN_OUTPUT 17, 16, 17, q6, q7
+    STORE_IN_OUTPUT 17, 14, 15, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 31, 30, 31, q6, q7
+    STORE_IN_OUTPUT 31,  0,  1, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 19, 18, 19, q6, q7
+    STORE_IN_OUTPUT 19, 12, 13, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 29, 28, 29, q6, q7
+    STORE_IN_OUTPUT 29,  2,  3, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 21, 20, 21, q6, q7
+    STORE_IN_OUTPUT 21, 10, 11, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 27, 26, 27, q6, q7
+    STORE_IN_OUTPUT 27,  4,  5, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 23, 22, 23, q6, q7
+    STORE_IN_OUTPUT 23, 8, 9, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 25, 24, 25, q6, q7
+    STORE_IN_OUTPUT 25,  6,  7, q4, q5
+
+    ; restore r0 by removing the last offset from the last
+    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+    sub r0, r0, #24*8*2
+    ; restore r1 by removing the last offset from the last
+    ;     operation (STORE_IN_OUTPUT 24,  6,  7) => 7*32*2
+    ; advance by 8 columns => 8*2
+    sub r1, r1, #7*32*2 - 8*2
+    ;   advance by 8 lines (8*32*2)
+    ;   go back by the two pairs from the loop (32*2)
+    add r3, r3, #8*32*2 - 32*2
+
+    ; bands loop processing
+    subs r4, r4, #1
+    bne idct32_bands_loop
+
+    ; parameters for second pass
+    ; the input of pass2 is the result of pass1. we have to remove the offset
+    ;   of 32 columns induced by the above idct32_bands_loop
+    sub r3, r1, #32*2
+      ; r1 = pass2[32 * 32]
+    add r1, sp, #2048
+
+    ; pass loop processing
+    add r5, r5, #1
+    b idct32_pass_loop
+
+idct32_bands_end_2nd_pass
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS_LAST
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS_LAST
+    ; --------------------------------------------------------------------------
+    ; restore pointers to their initial indices for next band pass by
+    ;     removing/adding dest_stride * 8. The actual increment by eight
+    ;     is taken care of within the _LAST macros.
+    add r6,  r6,  r2, lsl #3
+    add r9,  r9,  r2, lsl #3
+    sub r7,  r7,  r2, lsl #3
+    sub r10, r10, r2, lsl #3
+
+    ; restore r0 by removing the last offset from the last
+    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+    sub r0, r0, #24*8*2
+    ; restore r1 by removing the last offset from the last
+    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+    ; advance by 8 columns => 8*2
+    sub r1, r1, #25*32*2 - 8*2
+    ;   advance by 8 lines (8*32*2)
+    ;   go back by the two pairs from the loop (32*2)
+    add r3, r3, #8*32*2 - 32*2
+
+    ; bands loop processing
+    subs r4, r4, #1
+    bne idct32_bands_loop
+
+    ; stack operation
+    add sp, sp, #512+2048+2048
+    vpop {d8-d15}
+    pop  {r4-r11}
+    bx              lr
+    ENDP  ; |vpx_idct32x32_1024_add_neon|
+    END
diff --git a/aom_dsp/arm/idct32x32_add_neon.c b/aom_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 000000000..644155c96
--- /dev/null
+++ b/aom_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,685 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "aom_dsp/txfm_common.h"
+
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+  q14s16 = vld1q_s16(trans_buf + first * 8);      \
+  q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+  qA = vld1q_s16(out + first * 32);                   \
+  qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+  vst1q_s16(out + first * 32, qA);                   \
+  vst1q_s16(out + second * 32, qB);
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                  int stride, int16x8_t q6s16,
+                                                  int16x8_t q7s16,
+                                                  int16x8_t q8s16,
+                                                  int16x8_t q9s16) {
+  int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+  d8s16 = vld1_s16((int16_t *)p1);
+  p1 += stride;
+  d11s16 = vld1_s16((int16_t *)p2);
+  p2 -= stride;
+  d9s16 = vld1_s16((int16_t *)p1);
+  d10s16 = vld1_s16((int16_t *)p2);
+
+  q7s16 = vrshrq_n_s16(q7s16, 6);
+  q8s16 = vrshrq_n_s16(q8s16, 6);
+  q9s16 = vrshrq_n_s16(q9s16, 6);
+  q6s16 = vrshrq_n_s16(q6s16, 6);
+
+  q7s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
+  q8s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
+  q9s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
+  q6s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
+
+  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+  vst1_s16((int16_t *)p1, d9s16);
+  p1 -= stride;
+  vst1_s16((int16_t *)p2, d10s16);
+  p2 += stride;
+  vst1_s16((int16_t *)p1, d8s16);
+  vst1_s16((int16_t *)p2, d11s16);
+  return;
+}
+
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+  ;                                           \
+  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                   int stride, int16x8_t q4s16,
+                                                   int16x8_t q5s16,
+                                                   int16x8_t q6s16,
+                                                   int16x8_t q7s16) {
+  int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+  d4s16 = vld1_s16((int16_t *)p1);
+  p1 += stride;
+  d7s16 = vld1_s16((int16_t *)p2);
+  p2 -= stride;
+  d5s16 = vld1_s16((int16_t *)p1);
+  d6s16 = vld1_s16((int16_t *)p2);
+
+  q5s16 = vrshrq_n_s16(q5s16, 6);
+  q6s16 = vrshrq_n_s16(q6s16, 6);
+  q7s16 = vrshrq_n_s16(q7s16, 6);
+  q4s16 = vrshrq_n_s16(q4s16, 6);
+
+  q5s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
+  q6s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
+  q7s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
+  q4s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
+
+  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+  vst1_s16((int16_t *)p1, d5s16);
+  p1 -= stride;
+  vst1_s16((int16_t *)p2, d6s16);
+  p2 += stride;
+  vst1_s16((int16_t *)p2, d7s16);
+  vst1_s16((int16_t *)p1, d4s16);
+  return;
+}
+
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
+                                int16_t first_const, int16_t second_const,
+                                int16x8_t *qAs16, int16x8_t *qBs16) {
+  int16x4_t d30s16, d31s16;
+  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+  int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+  dCs16 = vget_low_s16(q14s16);
+  dDs16 = vget_high_s16(q14s16);
+  dAs16 = vget_low_s16(q13s16);
+  dBs16 = vget_high_s16(q13s16);
+
+  d30s16 = vdup_n_s16(first_const);
+  d31s16 = vdup_n_s16(second_const);
+
+  q8s32 = vmull_s16(dCs16, d30s16);
+  q10s32 = vmull_s16(dAs16, d31s16);
+  q9s32 = vmull_s16(dDs16, d30s16);
+  q11s32 = vmull_s16(dBs16, d31s16);
+  q12s32 = vmull_s16(dCs16, d31s16);
+
+  q8s32 = vsubq_s32(q8s32, q10s32);
+  q9s32 = vsubq_s32(q9s32, q11s32);
+
+  q10s32 = vmull_s16(dDs16, d31s16);
+  q11s32 = vmull_s16(dAs16, d30s16);
+  q15s32 = vmull_s16(dBs16, d30s16);
+
+  q11s32 = vaddq_s32(q12s32, q11s32);
+  q10s32 = vaddq_s32(q10s32, q15s32);
+
+  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
+  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
+  return;
+}
+
+static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
+  int16_t *in;
+  int i;
+  const int stride = 32;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+  for (i = 0; i < 4; i++, input += 8) {
+    in = input;
+    q8s16 = vld1q_s16(in);
+    in += stride;
+    q9s16 = vld1q_s16(in);
+    in += stride;
+    q10s16 = vld1q_s16(in);
+    in += stride;
+    q11s16 = vld1q_s16(in);
+    in += stride;
+    q12s16 = vld1q_s16(in);
+    in += stride;
+    q13s16 = vld1q_s16(in);
+    in += stride;
+    q14s16 = vld1q_s16(in);
+    in += stride;
+    q15s16 = vld1q_s16(in);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
+    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
+    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    q12s16 = vcombine_s16(d17s16, d25s16);
+    q13s16 = vcombine_s16(d19s16, d27s16);
+    q14s16 = vcombine_s16(d21s16, d29s16);
+    q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
+    q1x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
+    q2x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
+    q3x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    vst1q_s16(t_buf, q0x2s16.val[0]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q0x2s16.val[1]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q1x2s16.val[0]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q1x2s16.val[1]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q2x2s16.val[0]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q2x2s16.val[1]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q3x2s16.val[0]);
+    t_buf += 8;
+    vst1q_s16(t_buf, q3x2s16.val[1]);
+    t_buf += 8;
+  }
+  return;
+}
+
+static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
+                                             int16x8_t q3s16, int16x8_t q6s16,
+                                             int16x8_t q7s16, int16x8_t q8s16,
+                                             int16x8_t q9s16, int16x8_t q10s16,
+                                             int16x8_t q11s16, int16x8_t q12s16,
+                                             int16x8_t q13s16, int16x8_t q14s16,
+                                             int16x8_t q15s16) {
+  int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+  q2s16 = vaddq_s16(q10s16, q1s16);
+  q3s16 = vaddq_s16(q11s16, q0s16);
+  q4s16 = vsubq_s16(q11s16, q0s16);
+  q5s16 = vsubq_s16(q10s16, q1s16);
+
+  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+  q2s16 = vaddq_s16(q12s16, q1s16);
+  q3s16 = vaddq_s16(q13s16, q0s16);
+  q4s16 = vsubq_s16(q13s16, q0s16);
+  q5s16 = vsubq_s16(q12s16, q1s16);
+
+  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+  q2s16 = vaddq_s16(q14s16, q1s16);
+  q3s16 = vaddq_s16(q15s16, q0s16);
+  q4s16 = vsubq_s16(q15s16, q0s16);
+  q5s16 = vsubq_s16(q14s16, q1s16);
+
+  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+  return;
+}
+
+static INLINE void idct32_bands_end_2nd_pass(
+    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
+    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
+    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
+    int16x8_t q14s16, int16x8_t q15s16) {
+  uint8_t *r6 = dest + 31 * stride;
+  uint8_t *r7 = dest /* +  0 * stride*/;
+  uint8_t *r9 = dest + 15 * stride;
+  uint8_t *r10 = dest + 16 * stride;
+  int str2 = stride << 1;
+  int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  r10 += str2;
+  r9 -= str2;
+
+  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  r7 += str2;
+  r6 -= str2;
+
+  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+  q2s16 = vaddq_s16(q10s16, q1s16);
+  q3s16 = vaddq_s16(q11s16, q0s16);
+  q4s16 = vsubq_s16(q11s16, q0s16);
+  q5s16 = vsubq_s16(q10s16, q1s16);
+
+  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  r10 += str2;
+  r9 -= str2;
+
+  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  r7 += str2;
+  r6 -= str2;
+
+  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+  q2s16 = vaddq_s16(q12s16, q1s16);
+  q3s16 = vaddq_s16(q13s16, q0s16);
+  q4s16 = vsubq_s16(q13s16, q0s16);
+  q5s16 = vsubq_s16(q12s16, q1s16);
+
+  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  r10 += str2;
+  r9 -= str2;
+
+  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  r7 += str2;
+  r6 -= str2;
+
+  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+  q2s16 = vaddq_s16(q14s16, q1s16);
+  q3s16 = vaddq_s16(q15s16, q0s16);
+  q4s16 = vsubq_s16(q15s16, q0s16);
+  q5s16 = vsubq_s16(q14s16, q1s16);
+
+  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  return;
+}
+
+void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
+  int i, idct32_pass_loop;
+  int16_t trans_buf[32 * 8];
+  int16_t pass1[32 * 32];
+  int16_t pass2[32 * 32];
+  int16_t *out;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+       idct32_pass_loop++,
+      input = pass1,  // the input of pass2 is the result of pass1
+       out = pass2) {
+    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
+      idct32_transpose_pair(input, trans_buf);
+
+      // -----------------------------------------
+      // BLOCK A: 16-19,28-31
+      // -----------------------------------------
+      // generate 16,17,30,31
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(0, 1, 31)
+      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(31, 17, 15)
+      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+      // part of stage 2
+      q4s16 = vaddq_s16(q0s16, q1s16);
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q6s16 = vaddq_s16(q2s16, q3s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+      // generate 18,19,28,29
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(15, 9, 23)
+      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(23, 25, 7)
+      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+      // part of stage 2
+      q13s16 = vsubq_s16(q3s16, q2s16);
+      q3s16 = vaddq_s16(q3s16, q2s16);
+      q14s16 = vsubq_s16(q1s16, q0s16);
+      q2s16 = vaddq_s16(q1s16, q0s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+      // part of stage 4
+      q8s16 = vaddq_s16(q4s16, q2s16);
+      q9s16 = vaddq_s16(q5s16, q0s16);
+      q10s16 = vaddq_s16(q7s16, q1s16);
+      q15s16 = vaddq_s16(q6s16, q3s16);
+      q13s16 = vsubq_s16(q5s16, q0s16);
+      q14s16 = vsubq_s16(q7s16, q1s16);
+      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+      // part of stage 5
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+      // part of stage 4
+      q13s16 = vsubq_s16(q4s16, q2s16);
+      q14s16 = vsubq_s16(q6s16, q3s16);
+      // part of stage 5
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+      // -----------------------------------------
+      // BLOCK B: 20-23,24-27
+      // -----------------------------------------
+      // generate 20,21,26,27
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(7, 5, 27)
+      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(27, 21, 11)
+      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+      // part of stage 2
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q0s16 = vaddq_s16(q0s16, q1s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      q2s16 = vaddq_s16(q2s16, q3s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+      // generate 22,23,24,25
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(11, 13, 19)
+      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+      LOAD_FROM_TRANSPOSED(19, 29, 3)
+      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+      // part of stage 2
+      q14s16 = vsubq_s16(q4s16, q5s16);
+      q5s16 = vaddq_s16(q4s16, q5s16);
+      q13s16 = vsubq_s16(q6s16, q7s16);
+      q6s16 = vaddq_s16(q6s16, q7s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+      // part of stage 4
+      q10s16 = vaddq_s16(q7s16, q1s16);
+      q11s16 = vaddq_s16(q5s16, q0s16);
+      q12s16 = vaddq_s16(q6s16, q2s16);
+      q15s16 = vaddq_s16(q4s16, q3s16);
+      // part of stage 6
+      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+      q8s16 = vaddq_s16(q14s16, q11s16);
+      q9s16 = vaddq_s16(q13s16, q10s16);
+      q13s16 = vsubq_s16(q13s16, q10s16);
+      q11s16 = vsubq_s16(q14s16, q11s16);
+      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+      q8s16 = vsubq_s16(q9s16, q12s16);
+      q10s16 = vaddq_s16(q14s16, q15s16);
+      q14s16 = vsubq_s16(q14s16, q15s16);
+      q12s16 = vaddq_s16(q9s16, q12s16);
+      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+      // part of stage 7
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+      q13s16 = q11s16;
+      q14s16 = q8s16;
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+      // part of stage 4
+      q14s16 = vsubq_s16(q5s16, q0s16);
+      q13s16 = vsubq_s16(q6s16, q2s16);
+      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+      q14s16 = vsubq_s16(q7s16, q1s16);
+      q13s16 = vsubq_s16(q4s16, q3s16);
+      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+      // part of stage 6
+      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+      q8s16 = vaddq_s16(q14s16, q1s16);
+      q9s16 = vaddq_s16(q13s16, q6s16);
+      q13s16 = vsubq_s16(q13s16, q6s16);
+      q1s16 = vsubq_s16(q14s16, q1s16);
+      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+      q14s16 = vsubq_s16(q8s16, q5s16);
+      q10s16 = vaddq_s16(q8s16, q5s16);
+      q11s16 = vaddq_s16(q9s16, q0s16);
+      q0s16 = vsubq_s16(q9s16, q0s16);
+      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+      // part of stage 7
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
+      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+      // -----------------------------------------
+      // BLOCK C: 8-10,11-15
+      // -----------------------------------------
+      // generate 8,9,14,15
+      // part of stage 2
+      LOAD_FROM_TRANSPOSED(3, 2, 30)
+      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(30, 18, 14)
+      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+      // part of stage 3
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q0s16 = vaddq_s16(q0s16, q1s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      q2s16 = vaddq_s16(q2s16, q3s16);
+      // part of stage 4
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+      // generate 10,11,12,13
+      // part of stage 2
+      LOAD_FROM_TRANSPOSED(14, 10, 22)
+      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+      LOAD_FROM_TRANSPOSED(22, 26, 6)
+      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+      // part of stage 3
+      q14s16 = vsubq_s16(q4s16, q5s16);
+      q5s16 = vaddq_s16(q4s16, q5s16);
+      q13s16 = vsubq_s16(q6s16, q7s16);
+      q6s16 = vaddq_s16(q6s16, q7s16);
+      // part of stage 4
+      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+      // part of stage 5
+      q8s16 = vaddq_s16(q0s16, q5s16);
+      q9s16 = vaddq_s16(q1s16, q7s16);
+      q13s16 = vsubq_s16(q1s16, q7s16);
+      q14s16 = vsubq_s16(q3s16, q4s16);
+      q10s16 = vaddq_s16(q3s16, q4s16);
+      q15s16 = vaddq_s16(q2s16, q6s16);
+      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+      // part of stage 6
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+      q13s16 = vsubq_s16(q0s16, q5s16);
+      q14s16 = vsubq_s16(q2s16, q6s16);
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+      // -----------------------------------------
+      // BLOCK D: 0-3,4-7
+      // -----------------------------------------
+      // generate 4,5,6,7
+      // part of stage 3
+      LOAD_FROM_TRANSPOSED(6, 4, 28)
+      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(28, 20, 12)
+      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+      // part of stage 4
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q0s16 = vaddq_s16(q0s16, q1s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      q2s16 = vaddq_s16(q2s16, q3s16);
+      // part of stage 5
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+      // generate 0,1,2,3
+      // part of stage 4
+      LOAD_FROM_TRANSPOSED(12, 0, 16)
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+      LOAD_FROM_TRANSPOSED(16, 8, 24)
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+      // part of stage 5
+      q4s16 = vaddq_s16(q7s16, q6s16);
+      q7s16 = vsubq_s16(q7s16, q6s16);
+      q6s16 = vsubq_s16(q5s16, q14s16);
+      q5s16 = vaddq_s16(q5s16, q14s16);
+      // part of stage 6
+      q8s16 = vaddq_s16(q4s16, q2s16);
+      q9s16 = vaddq_s16(q5s16, q3s16);
+      q10s16 = vaddq_s16(q6s16, q1s16);
+      q11s16 = vaddq_s16(q7s16, q0s16);
+      q12s16 = vsubq_s16(q7s16, q0s16);
+      q13s16 = vsubq_s16(q6s16, q1s16);
+      q14s16 = vsubq_s16(q5s16, q3s16);
+      q15s16 = vsubq_s16(q4s16, q2s16);
+      // part of stage 7
+      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+      q2s16 = vaddq_s16(q8s16, q1s16);
+      q3s16 = vaddq_s16(q9s16, q0s16);
+      q4s16 = vsubq_s16(q9s16, q0s16);
+      q5s16 = vsubq_s16(q8s16, q1s16);
+      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+      q8s16 = vaddq_s16(q4s16, q1s16);
+      q9s16 = vaddq_s16(q5s16, q0s16);
+      q6s16 = vsubq_s16(q5s16, q0s16);
+      q7s16 = vsubq_s16(q4s16, q1s16);
+
+      if (idct32_pass_loop == 0) {
+        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                                  q10s16, q11s16, q12s16, q13s16, q14s16,
+                                  q15s16);
+      } else {
+        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
+                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
+                                  q14s16, q15s16);
+        dest += 8;
+      }
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/idct4x4_1_add_neon.asm b/aom_dsp/arm/idct4x4_1_add_neon.asm
new file mode 100644
index 000000000..adab715dd
--- /dev/null
+++ b/aom_dsp/arm/idct4x4_1_add_neon.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vpx_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 <<((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    vld1.32          {d2[0]}, [r1], r2
+    vld1.32          {d2[1]}, [r1], r2
+    vld1.32          {d4[0]}, [r1], r2
+    vld1.32          {d4[1]}, [r1]
+
+    vaddw.u8         q8, q0, d2                ; dest[x] + a1
+    vaddw.u8         q9, q0, d4
+
+    vqmovun.s16      d6, q8                    ; clip_pixel
+    vqmovun.s16      d7, q9
+
+    vst1.32          {d6[0]}, [r12], r2
+    vst1.32          {d6[1]}, [r12], r2
+    vst1.32          {d7[0]}, [r12], r2
+    vst1.32          {d7[1]}, [r12]
+
+    bx               lr
+    ENDP             ; |vpx_idct4x4_1_add_neon|
+
+    END
diff --git a/aom_dsp/arm/idct4x4_1_add_neon.c b/aom_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 000000000..0a2e827b9
--- /dev/null
+++ b/aom_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d6u8;
+  uint32x2_t d2u32 = vdup_n_u32(0);
+  uint16x8_t q8u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  q0s16 = vdupq_n_s16(a1);
+
+  // dc_only_idct_add
+  d1 = d2 = dest;
+  for (i = 0; i < 2; i++) {
+    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+    d1 += dest_stride;
+    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
+    d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+    d2 += dest_stride;
+    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+    d2 += dest_stride;
+  }
+  return;
+}
diff --git a/aom_dsp/arm/idct4x4_add_neon.asm b/aom_dsp/arm/idct4x4_add_neon.asm
new file mode 100644
index 000000000..877fbd634
--- /dev/null
+++ b/aom_dsp/arm/idct4x4_add_neon.asm
@@ -0,0 +1,190 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct4x4_16_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vpx_idct4x4_16_add_neon| PROC
+
+    ; The 2D transform is done with two passes which are actually pretty
+    ; similar. We first transform the rows. This is done by transposing
+    ; the inputs, doing an SIMD column transform (the columns are the
+    ; transposed rows) and then transpose the results (so that it goes back
+    ; in normal/row positions). Then, we transform the columns by doing
+    ; another SIMD column transform.
+    ; So, two passes of a transpose followed by a column transform.
+
+    ; load the inputs into q8-q9, d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+
+    ; generate scalar constants
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x 187e
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; transpose the input data
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+
+    ; generate constant vectors
+    vdup.16         d20, r0         ; replicate cospi_8_64
+    vdup.16         d21, r3         ; replicate cospi_16_64
+
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    vdup.16         d22, r12        ; replicate cospi_24_64
+
+    ; do the transform on transposed rows
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+    vswp     d18, d19
+
+    ; transpose the results
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    ; do the transform on columns
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+
+    ; The results are in two registers, one of them being swapped. This will
+    ; be taken care of by loading the 'dest' value in a swapped fashion and
+    ; also storing them in the same swapped fashion.
+    ; temp_out[0, 1] = d16, d17 = q8
+    ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16 q8, q8, #4
+    vrshr.s16 q9, q9, #4
+
+    vld1.32 {d26[0]}, [r1], r2
+    vld1.32 {d26[1]}, [r1], r2
+    vld1.32 {d27[1]}, [r1], r2
+    vld1.32 {d27[0]}, [r1]  ; no post-increment
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8 q8, q8, d26
+    vaddw.u8 q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb r2, r2, #0
+    vst1.32 {d27[0]}, [r1], r2
+    vst1.32 {d27[1]}, [r1], r2
+    vst1.32 {d26[1]}, [r1], r2
+    vst1.32 {d26[0]}, [r1]  ; no post-increment
+    bx              lr
+    ENDP  ; |vpx_idct4x4_16_add_neon|
+
+    END
diff --git a/aom_dsp/arm/idct4x4_add_neon.c b/aom_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 000000000..382626928
--- /dev/null
+++ b/aom_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,146 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d26u8, d27u8;
+  uint32x2_t d26u32, d27u32;
+  uint16x8_t q8u16, q9u16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+  int16x8_t q8s16, q9s16, q13s16, q14s16;
+  int32x4_t q1s32, q13s32, q14s32, q15s32;
+  int16x4x2_t d0x2s16, d1x2s16;
+  int32x4x2_t q0x2s32;
+  uint8_t *d;
+  int16_t cospi_8_64 = 15137;
+  int16_t cospi_16_64 = 11585;
+  int16_t cospi_24_64 = 6270;
+
+  d26u32 = d27u32 = vdup_n_u32(0);
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+
+  d0x2s16 = vtrn_s16(d16s16, d17s16);
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  d20s16 = vdup_n_s16(cospi_8_64);
+  d21s16 = vdup_n_s16(cospi_16_64);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  d22s16 = vdup_n_s16(cospi_24_64);
+
+  // stage 1
+  d23s16 = vadd_s16(d16s16, d18s16);
+  d24s16 = vsub_s16(d16s16, d18s16);
+
+  q15s32 = vmull_s16(d17s16, d22s16);
+  q1s32 = vmull_s16(d17s16, d20s16);
+  q13s32 = vmull_s16(d23s16, d21s16);
+  q14s32 = vmull_s16(d24s16, d21s16);
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+  d26s16 = vqrshrn_n_s32(q13s32, 14);
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
+  d29s16 = vqrshrn_n_s32(q15s32, 14);
+  d28s16 = vqrshrn_n_s32(q1s32, 14);
+  q13s16 = vcombine_s16(d26s16, d27s16);
+  q14s16 = vcombine_s16(d28s16, d29s16);
+
+  // stage 2
+  q8s16 = vaddq_s16(q13s16, q14s16);
+  q9s16 = vsubq_s16(q13s16, q14s16);
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
+  d19s16 = vget_low_s16(q9s16);
+
+  d0x2s16 = vtrn_s16(d16s16, d17s16);
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  // do the transform on columns
+  // stage 1
+  d23s16 = vadd_s16(d16s16, d18s16);
+  d24s16 = vsub_s16(d16s16, d18s16);
+
+  q15s32 = vmull_s16(d17s16, d22s16);
+  q1s32 = vmull_s16(d17s16, d20s16);
+  q13s32 = vmull_s16(d23s16, d21s16);
+  q14s32 = vmull_s16(d24s16, d21s16);
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+  d26s16 = vqrshrn_n_s32(q13s32, 14);
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
+  d29s16 = vqrshrn_n_s32(q15s32, 14);
+  d28s16 = vqrshrn_n_s32(q1s32, 14);
+  q13s16 = vcombine_s16(d26s16, d27s16);
+  q14s16 = vcombine_s16(d28s16, d29s16);
+
+  // stage 2
+  q8s16 = vaddq_s16(q13s16, q14s16);
+  q9s16 = vsubq_s16(q13s16, q14s16);
+
+  q8s16 = vrshrq_n_s16(q8s16, 4);
+  q9s16 = vrshrq_n_s16(q9s16, 4);
+
+  d = dest;
+  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+  d += dest_stride;
+  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+  d += dest_stride;
+  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+  d += dest_stride;
+  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+  d = dest;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+  return;
+}
diff --git a/aom_dsp/arm/idct8x8_1_add_neon.asm b/aom_dsp/arm/idct8x8_1_add_neon.asm
new file mode 100644
index 000000000..dbbff364f
--- /dev/null
+++ b/aom_dsp/arm/idct8x8_1_add_neon.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vpx_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vpx_idct8x8_1_add_neon|
+
+    END
diff --git a/aom_dsp/arm/idct8x8_1_add_neon.c b/aom_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 000000000..bda59986f
--- /dev/null
+++ b/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d2u8, d3u8, d30u8, d31u8;
+  uint64x1_t d2u64, d3u64, d4u64, d5u64;
+  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+
+  q0s16 = vdupq_n_s16(a1);
+  q0u16 = vreinterpretq_u16_s16(q0s16);
+
+  d1 = d2 = dest;
+  for (i = 0; i < 2; i++) {
+    d2u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d4u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d5u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+
+    q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+    q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+    q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+    q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+    d2 += dest_stride;
+  }
+  return;
+}
diff --git a/aom_dsp/arm/idct8x8_add_neon.asm b/aom_dsp/arm/idct8x8_add_neon.asm
new file mode 100644
index 000000000..6ab59b41b
--- /dev/null
+++ b/aom_dsp/arm/idct8x8_add_neon.asm
@@ -0,0 +1,519 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct8x8_64_add_neon|
+    EXPORT  |vpx_idct8x8_12_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
+    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
+    ; This macro will touch q0-q7 registers and use them as buffer during
+    ; calculation.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[2]) * cospi_16_64
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    ; input[1] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vpx_idct8x8_64_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    IDCT8x8_1D
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vpx_idct8x8_64_add_neon|
+
+;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vpx_idct8x8_12_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    ; stage 1
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
+    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
+    ; to double the constants before multiplying to compensate this.
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_24_64)
+    vqrdmulh.s16    q13, q10, q1
+
+    ; dct_const_round_shift(input[1] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vpx_idct8x8_12_add_neon|
+
+    END
diff --git a/aom_dsp/arm/idct8x8_add_neon.c b/aom_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 000000000..124c317c1
--- /dev/null
+++ b/aom_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,508 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "aom_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+                                int16x8_t *q10s16, int16x8_t *q11s16,
+                                int16x8_t *q12s16, int16x8_t *q13s16,
+                                int16x8_t *q14s16, int16x8_t *q15s16) {
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
+  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
+  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+  *q12s16 = vcombine_s16(d17s16, d25s16);
+  *q13s16 = vcombine_s16(d19s16, d27s16);
+  *q14s16 = vcombine_s16(d21s16, d29s16);
+  *q15s16 = vcombine_s16(d23s16, d31s16);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+  q1x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+  q2x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+  q3x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+  *q8s16 = q0x2s16.val[0];
+  *q9s16 = q0x2s16.val[1];
+  *q10s16 = q1x2s16.val[0];
+  *q11s16 = q1x2s16.val[1];
+  *q12s16 = q2x2s16.val[0];
+  *q13s16 = q2x2s16.val[1];
+  *q14s16 = q3x2s16.val[0];
+  *q15s16 = q3x2s16.val[1];
+  return;
+}
+
+static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+                              int16x8_t *q10s16, int16x8_t *q11s16,
+                              int16x8_t *q12s16, int16x8_t *q13s16,
+                              int16x8_t *q14s16, int16x8_t *q15s16) {
+  int16x4_t d0s16, d1s16, d2s16, d3s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+  d0s16 = vdup_n_s16(cospi_28_64);
+  d1s16 = vdup_n_s16(cospi_4_64);
+  d2s16 = vdup_n_s16(cospi_12_64);
+  d3s16 = vdup_n_s16(cospi_20_64);
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  q2s32 = vmull_s16(d18s16, d0s16);
+  q3s32 = vmull_s16(d19s16, d0s16);
+  q5s32 = vmull_s16(d26s16, d2s16);
+  q6s32 = vmull_s16(d27s16, d2s16);
+
+  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+  d8s16 = vqrshrn_n_s32(q2s32, 14);
+  d9s16 = vqrshrn_n_s32(q3s32, 14);
+  d10s16 = vqrshrn_n_s32(q5s32, 14);
+  d11s16 = vqrshrn_n_s32(q6s32, 14);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  q2s32 = vmull_s16(d18s16, d1s16);
+  q3s32 = vmull_s16(d19s16, d1s16);
+  q9s32 = vmull_s16(d26s16, d3s16);
+  q13s32 = vmull_s16(d27s16, d3s16);
+
+  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+  d14s16 = vqrshrn_n_s32(q2s32, 14);
+  d15s16 = vqrshrn_n_s32(q3s32, 14);
+  d12s16 = vqrshrn_n_s32(q9s32, 14);
+  d13s16 = vqrshrn_n_s32(q13s32, 14);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+  q7s16 = vcombine_s16(d14s16, d15s16);
+
+  d0s16 = vdup_n_s16(cospi_16_64);
+
+  q2s32 = vmull_s16(d16s16, d0s16);
+  q3s32 = vmull_s16(d17s16, d0s16);
+  q13s32 = vmull_s16(d16s16, d0s16);
+  q15s32 = vmull_s16(d17s16, d0s16);
+
+  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+  d0s16 = vdup_n_s16(cospi_24_64);
+  d1s16 = vdup_n_s16(cospi_8_64);
+
+  d18s16 = vqrshrn_n_s32(q2s32, 14);
+  d19s16 = vqrshrn_n_s32(q3s32, 14);
+  d22s16 = vqrshrn_n_s32(q13s32, 14);
+  d23s16 = vqrshrn_n_s32(q15s32, 14);
+  *q9s16 = vcombine_s16(d18s16, d19s16);
+  *q11s16 = vcombine_s16(d22s16, d23s16);
+
+  q2s32 = vmull_s16(d20s16, d0s16);
+  q3s32 = vmull_s16(d21s16, d0s16);
+  q8s32 = vmull_s16(d20s16, d1s16);
+  q12s32 = vmull_s16(d21s16, d1s16);
+
+  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+  d26s16 = vqrshrn_n_s32(q2s32, 14);
+  d27s16 = vqrshrn_n_s32(q3s32, 14);
+  d30s16 = vqrshrn_n_s32(q8s32, 14);
+  d31s16 = vqrshrn_n_s32(q12s32, 14);
+  *q13s16 = vcombine_s16(d26s16, d27s16);
+  *q15s16 = vcombine_s16(d30s16, d31s16);
+
+  q0s16 = vaddq_s16(*q9s16, *q15s16);
+  q1s16 = vaddq_s16(*q11s16, *q13s16);
+  q2s16 = vsubq_s16(*q11s16, *q13s16);
+  q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+  *q13s16 = vsubq_s16(q4s16, q5s16);
+  q4s16 = vaddq_s16(q4s16, q5s16);
+  *q14s16 = vsubq_s16(q7s16, q6s16);
+  q7s16 = vaddq_s16(q7s16, q6s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+
+  d16s16 = vdup_n_s16(cospi_16_64);
+
+  q9s32 = vmull_s16(d28s16, d16s16);
+  q10s32 = vmull_s16(d29s16, d16s16);
+  q11s32 = vmull_s16(d28s16, d16s16);
+  q12s32 = vmull_s16(d29s16, d16s16);
+
+  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+  d10s16 = vqrshrn_n_s32(q9s32, 14);
+  d11s16 = vqrshrn_n_s32(q10s32, 14);
+  d12s16 = vqrshrn_n_s32(q11s32, 14);
+  d13s16 = vqrshrn_n_s32(q12s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  *q8s16 = vaddq_s16(q0s16, q7s16);
+  *q9s16 = vaddq_s16(q1s16, q6s16);
+  *q10s16 = vaddq_s16(q2s16, q5s16);
+  *q11s16 = vaddq_s16(q3s16, q4s16);
+  *q12s16 = vsubq_s16(q3s16, q4s16);
+  *q13s16 = vsubq_s16(q2s16, q5s16);
+  *q14s16 = vsubq_s16(q1s16, q6s16);
+  *q15s16 = vsubq_s16(q0s16, q7s16);
+  return;
+}
+
+void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8_t *d1, *d2;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  uint64x1_t d0u64, d1u64, d2u64, d3u64;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+  q10s16 = vld1q_s16(input + 16);
+  q11s16 = vld1q_s16(input + 24);
+  q12s16 = vld1q_s16(input + 32);
+  q13s16 = vld1q_s16(input + 40);
+  q14s16 = vld1q_s16(input + 48);
+  q15s16 = vld1q_s16(input + 56);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  q8s16 = vrshrq_n_s16(q8s16, 5);
+  q9s16 = vrshrq_n_s16(q9s16, 5);
+  q10s16 = vrshrq_n_s16(q10s16, 5);
+  q11s16 = vrshrq_n_s16(q11s16, 5);
+  q12s16 = vrshrq_n_s16(q12s16, 5);
+  q13s16 = vrshrq_n_s16(q13s16, 5);
+  q14s16 = vrshrq_n_s16(q14s16, 5);
+  q15s16 = vrshrq_n_s16(q15s16, 5);
+
+  d1 = d2 = dest;
+
+  d0u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d1u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d2u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d3u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+  d2 += dest_stride;
+
+  q8s16 = q12s16;
+  q9s16 = q13s16;
+  q10s16 = q14s16;
+  q11s16 = q15s16;
+
+  d0u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d1u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d2u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d3u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+  d2 += dest_stride;
+  return;
+}
+
+void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8_t *d1, *d2;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+  int16x4_t d26s16, d27s16, d28s16, d29s16;
+  uint64x1_t d0u64, d1u64, d2u64, d3u64;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  uint16x8_t q8u16, q9u16, q10u16, q11u16;
+  int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+  q10s16 = vld1q_s16(input + 16);
+  q11s16 = vld1q_s16(input + 24);
+  q12s16 = vld1q_s16(input + 32);
+  q13s16 = vld1q_s16(input + 40);
+  q14s16 = vld1q_s16(input + 48);
+  q15s16 = vld1q_s16(input + 56);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  // First transform rows
+  // stage 1
+  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+  q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+  q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+  q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+  q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+  q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+  q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+  // stage 2 & stage 3 - even half
+  q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+  q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+  q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+  q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+  q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+  // stage 3 -odd half
+  q0s16 = vaddq_s16(q9s16, q15s16);
+  q1s16 = vaddq_s16(q9s16, q13s16);
+  q2s16 = vsubq_s16(q9s16, q13s16);
+  q3s16 = vsubq_s16(q9s16, q15s16);
+
+  // stage 2 - odd half
+  q13s16 = vsubq_s16(q4s16, q5s16);
+  q4s16 = vaddq_s16(q4s16, q5s16);
+  q14s16 = vsubq_s16(q7s16, q6s16);
+  q7s16 = vaddq_s16(q7s16, q6s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+  d28s16 = vget_low_s16(q14s16);
+  d29s16 = vget_high_s16(q14s16);
+
+  d16s16 = vdup_n_s16(cospi_16_64);
+  q9s32 = vmull_s16(d28s16, d16s16);
+  q10s32 = vmull_s16(d29s16, d16s16);
+  q11s32 = vmull_s16(d28s16, d16s16);
+  q12s32 = vmull_s16(d29s16, d16s16);
+
+  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+  d10s16 = vqrshrn_n_s32(q9s32, 14);
+  d11s16 = vqrshrn_n_s32(q10s32, 14);
+  d12s16 = vqrshrn_n_s32(q11s32, 14);
+  d13s16 = vqrshrn_n_s32(q12s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  // stage 4
+  q8s16 = vaddq_s16(q0s16, q7s16);
+  q9s16 = vaddq_s16(q1s16, q6s16);
+  q10s16 = vaddq_s16(q2s16, q5s16);
+  q11s16 = vaddq_s16(q3s16, q4s16);
+  q12s16 = vsubq_s16(q3s16, q4s16);
+  q13s16 = vsubq_s16(q2s16, q5s16);
+  q14s16 = vsubq_s16(q1s16, q6s16);
+  q15s16 = vsubq_s16(q0s16, q7s16);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  q8s16 = vrshrq_n_s16(q8s16, 5);
+  q9s16 = vrshrq_n_s16(q9s16, 5);
+  q10s16 = vrshrq_n_s16(q10s16, 5);
+  q11s16 = vrshrq_n_s16(q11s16, 5);
+  q12s16 = vrshrq_n_s16(q12s16, 5);
+  q13s16 = vrshrq_n_s16(q13s16, 5);
+  q14s16 = vrshrq_n_s16(q14s16, 5);
+  q15s16 = vrshrq_n_s16(q15s16, 5);
+
+  d1 = d2 = dest;
+
+  d0u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d1u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d2u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d3u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+  d2 += dest_stride;
+
+  q8s16 = q12s16;
+  q9s16 = q13s16;
+  q10s16 = q14s16;
+  q11s16 = q15s16;
+
+  d0u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d1u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d2u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d3u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+  d2 += dest_stride;
+  return;
+}
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
new file mode 100644
index 000000000..3166a4e83
--- /dev/null
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,818 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left, int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_top = vcombine_u16(p1, p1);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);   // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_left = vcombine_u16(p1, p1);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 3);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 2);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 2);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 4; ++i) {
+      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+    }
+  }
+}
+
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left, int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_top = vcombine_u16(p2, p2);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);   // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_left = vcombine_u16(p2, p2);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 8; ++i) {
+      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
+    }
+  }
+}
+
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A = vld1q_u8(above);  // top row
+    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_top = vcombine_u16(p3, p3);
+  }
+
+  if (do_left) {
+    const uint8x16_t L = vld1q_u8(left);  // left row
+    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_left = vcombine_u16(p3, p3);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 5);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 4);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 4);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+    }
+  }
+}
+
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A0 = vld1q_u8(above);  // top row
+    const uint8x16_t A1 = vld1q_u8(above + 16);
+    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
+    const uint16x8_t p1 = vpaddlq_u8(A1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_top = vcombine_u16(p5, p5);
+  }
+
+  if (do_left) {
+    const uint8x16_t L0 = vld1q_u8(left);  // left row
+    const uint8x16_t L1 = vld1q_u8(left + 16);
+    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
+    const uint16x8_t p1 = vpaddlq_u8(L1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_left = vcombine_u16(p5, p5);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 6);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 5);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 5);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 32; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+      vst1q_u8(dst + i * stride + 16, dc);
+    }
+  }
+}
+
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
+  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
+  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
+  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
+  const uint8x8_t A0 = vld1_u8(above);  // top row
+  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
+  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
+  const uint8x8_t avg1 = vhadd_u8(A0, A2);
+  uint8x8_t row = vrhadd_u8(avg1, A1);
+  int i;
+  (void)left;
+  for (i = 0; i < 7; ++i) {
+    vst1_u8(dst + i * stride, row);
+    row = vtbl1_u8(row, sh_12345677);
+  }
+  vst1_u8(dst + i * stride, row);
+}
+
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t A0 = vld1q_u8(above);  // top row
+  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
+  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
+  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
+  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
+  uint8x16_t row = vrhaddq_u8(avg1, A1);
+  int i;
+  (void)left;
+  for (i = 0; i < 15; ++i) {
+    vst1q_u8(dst + i * stride, row);
+    row = vextq_u8(row, above_right, 1);
+  }
+  vst1q_u8(dst + i * stride, row);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32x2_t zero = vdup_n_u32(0);
+  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
+#if !HAVE_NEON_ASM
+
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint32x2_t d0u32 = vdup_n_u32(0);
+  (void)left;
+
+  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+  for (i = 0; i < 4; i++, dst += stride)
+    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+}
+
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  (void)left;
+
+  d0u8 = vld1_u8(above);
+  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+}
+
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  (void)left;
+
+  q0u8 = vld1q_u8(above);
+  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+}
+
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)left;
+
+  q0u8 = vld1q_u8(above);
+  q1u8 = vld1q_u8(above + 16);
+  for (i = 0; i < 32; i++, dst += stride) {
+    vst1q_u8(dst, q0u8);
+    vst1q_u8(dst + 16, q1u8);
+  }
+}
+
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d1u32 = vdup_n_u32(0);
+  (void)above;
+
+  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+}
+
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint64x1_t d1u64 = vdup_n_u64(0);
+  (void)above;
+
+  d1u64 = vld1_u64((const uint64_t *)left);
+
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+  vst1_u8(dst, d0u8);
+}
+
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  q1u8 = vld1q_u8(left);
+  d2u8 = vget_low_u8(q1u8);
+  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+    q0u8 = vdupq_lane_u8(d2u8, 0);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 1);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 2);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 3);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 4);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 5);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 6);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 7);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+  }
+}
+
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  for (k = 0; k < 2; k++, left += 16) {
+    q1u8 = vld1q_u8(left);
+    d2u8 = vget_low_u8(q1u8);
+    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+      q0u8 = vdupq_lane_u8(d2u8, 0);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 1);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 2);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 3);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 4);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 5);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 6);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 7);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+    }
+  }
+}
+
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint16x8_t q1u16, q3u16;
+  int16x8_t q1s16;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d2u32 = vdup_n_u32(0);
+
+  d0u8 = vld1_dup_u8(above - 1);
+  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+  for (i = 0; i < 4; i++, dst += stride) {
+    q1u16 = vdupq_n_u16((uint16_t)left[i]);
+    q1s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
+    d0u8 = vqmovun_s16(q1s16);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  }
+}
+
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint16x8_t q0u16, q3u16, q10u16;
+  int16x8_t q0s16;
+  uint16x4_t d20u16;
+  uint8x8_t d0u8, d2u8, d30u8;
+
+  d0u8 = vld1_dup_u8(above - 1);
+  d30u8 = vld1_u8(left);
+  d2u8 = vld1_u8(above);
+  q10u16 = vmovl_u8(d30u8);
+  q3u16 = vsubl_u8(d2u8, d0u8);
+  d20u16 = vget_low_u16(q10u16);
+  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+    q0u16 = vdupq_lane_u16(d20u16, 0);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 1);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 2);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 3);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+  }
+}
+
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+  uint8x16_t q0u8, q1u8;
+  int16x8_t q0s16, q1s16, q8s16, q11s16;
+  uint16x4_t d20u16;
+  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+  q0u8 = vld1q_dup_u8(above - 1);
+  q1u8 = vld1q_u8(above);
+  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  for (k = 0; k < 2; k++, left += 8) {
+    d18u8 = vld1_u8(left);
+    q10u16 = vmovl_u8(d18u8);
+    d20u16 = vget_low_u16(q10u16);
+    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+      q0u16 = vdupq_lane_u16(d20u16, 0);
+      q8u16 = vdupq_lane_u16(d20u16, 1);
+      q1s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+      q0s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+      q11s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+      q8s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d20u16, 2);
+      q8u16 = vdupq_lane_u16(d20u16, 3);
+      q1s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+      q0s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+      q11s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+      q8s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+    }
+  }
+}
+
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+  uint8x16_t q0u8, q1u8, q2u8;
+  int16x8_t q12s16, q13s16, q14s16, q15s16;
+  uint16x4_t d6u16;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+  q0u8 = vld1q_dup_u8(above - 1);
+  q1u8 = vld1q_u8(above);
+  q2u8 = vld1q_u8(above + 16);
+  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+  for (k = 0; k < 4; k++, left += 8) {
+    d26u8 = vld1_u8(left);
+    q3u16 = vmovl_u8(d26u8);
+    d6u16 = vget_low_u16(q3u16);
+    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+      q0u16 = vdupq_lane_u16(d6u16, 0);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 1);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 2);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 3);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+    }
+  }
+}
+#endif  // !HAVE_NEON_ASM
diff --git a/aom_dsp/arm/intrapred_neon_asm.asm b/aom_dsp/arm/intrapred_neon_asm.asm
new file mode 100644
index 000000000..115790d48
--- /dev/null
+++ b/aom_dsp/arm/intrapred_neon_asm.asm
@@ -0,0 +1,630 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_v_predictor_4x4_neon|
+    EXPORT  |vpx_v_predictor_8x8_neon|
+    EXPORT  |vpx_v_predictor_16x16_neon|
+    EXPORT  |vpx_v_predictor_32x32_neon|
+    EXPORT  |vpx_h_predictor_4x4_neon|
+    EXPORT  |vpx_h_predictor_8x8_neon|
+    EXPORT  |vpx_h_predictor_16x16_neon|
+    EXPORT  |vpx_h_predictor_32x32_neon|
+    EXPORT  |vpx_tm_predictor_4x4_neon|
+    EXPORT  |vpx_tm_predictor_8x8_neon|
+    EXPORT  |vpx_tm_predictor_16x16_neon|
+    EXPORT  |vpx_tm_predictor_32x32_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_v_predictor_4x4_neon| PROC
+    vld1.32             {d0[0]}, [r2]
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d0[0]}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_4x4_neon|
+
+;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_v_predictor_8x8_neon| PROC
+    vld1.8              {d0}, [r2]
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    vst1.8              {d0}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_8x8_neon|
+
+;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_v_predictor_16x16_neon| PROC
+    vld1.8              {q0}, [r2]
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    vst1.8              {q0}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_16x16_neon|
+
+;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_v_predictor_32x32_neon| PROC
+    vld1.8              {q0, q1}, [r2]
+    mov                 r2, #2
+loop_v
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    subs                r2, r2, #1
+    bgt                 loop_v
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_32x32_neon|
+
+;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_h_predictor_4x4_neon| PROC
+    vld1.32             {d1[0]}, [r3]
+    vdup.8              d0, d1[0]
+    vst1.32             {d0[0]}, [r0], r1
+    vdup.8              d0, d1[1]
+    vst1.32             {d0[0]}, [r0], r1
+    vdup.8              d0, d1[2]
+    vst1.32             {d0[0]}, [r0], r1
+    vdup.8              d0, d1[3]
+    vst1.32             {d0[0]}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_4x4_neon|
+
+;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_h_predictor_8x8_neon| PROC
+    vld1.64             {d1}, [r3]
+    vdup.8              d0, d1[0]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[1]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[2]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[3]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[4]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[5]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[6]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[7]
+    vst1.64             {d0}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_8x8_neon|
+
+;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_h_predictor_16x16_neon| PROC
+    vld1.8              {q1}, [r3]
+    vdup.8              q0, d2[0]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[1]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[2]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[3]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[4]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[5]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[6]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[7]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[0]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[1]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[2]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[3]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[4]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[5]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[6]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[7]
+    vst1.8              {q0}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_16x16_neon|
+
+;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_h_predictor_32x32_neon| PROC
+    sub                 r1, r1, #16
+    mov                 r2, #2
+loop_h
+    vld1.8              {q1}, [r3]!
+    vdup.8              q0, d2[0]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[1]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[2]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[3]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[4]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[5]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[6]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[7]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[0]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[1]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[2]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[3]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[4]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[5]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[6]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[7]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    subs                r2, r2, #1
+    bgt                 loop_h
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_32x32_neon|
+
+;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_tm_predictor_4x4_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.u8             {d0[]}, [r12]
+
+    ; Load above 4 pixels
+    vld1.32             {d2[0]}, [r2]
+
+    ; Compute above - ytop_left
+    vsubl.u8            q3, d2, d0
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; 1st row and 2nd row
+    vld1.u8             {d2[]}, [r3]!
+    vld1.u8             {d4[]}, [r3]!
+    vmovl.u8            q1, d2
+    vmovl.u8            q2, d4
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d1[0]}, [r0], r1
+
+    ; 3rd row and 4th row
+    vld1.u8             {d2[]}, [r3]!
+    vld1.u8             {d4[]}, [r3]
+    vmovl.u8            q1, d2
+    vmovl.u8            q2, d4
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d1[0]}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_4x4_neon|
+
+;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_tm_predictor_8x8_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.8              {d0[]}, [r12]
+
+    ; preload 8 left
+    vld1.8              {d30}, [r3]
+
+    ; Load above 8 pixels
+    vld1.64             {d2}, [r2]
+
+    vmovl.u8            q10, d30
+
+    ; Compute above - ytop_left
+    vsubl.u8            q3, d2, d0
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; 1st row and 2nd row
+    vdup.16             q0, d20[0]
+    vdup.16             q1, d20[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
+
+    ; 3rd row and 4th row
+    vdup.16             q8, d20[2]
+    vdup.16             q9, d20[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
+
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
+
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+    vst1.64             {d2}, [r0], r1
+    vst1.64             {d3}, [r0], r1
+
+    ; 5th row and 6th row
+    vdup.16             q0, d21[0]
+    vdup.16             q1, d21[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
+
+    ; 7th row and 8th row
+    vdup.16             q8, d21[2]
+    vdup.16             q9, d21[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
+
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
+
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+    vst1.64             {d2}, [r0], r1
+    vst1.64             {d3}, [r0], r1
+
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_8x8_neon|
+
+;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_tm_predictor_16x16_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.8              {d0[]}, [r12]
+
+    ; Load above 8 pixels
+    vld1.8              {q1}, [r2]
+
+    ; preload 8 left into r12
+    vld1.8              {d18}, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q2, d2, d0
+    vsubl.u8            q3, d3, d0
+
+    vmovl.u8            q10, d18
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+    mov                 r2, #2
+
+loop_16x16_neon
+    ; Process two rows.
+    vdup.16             q0, d20[0]
+    vdup.16             q8, d20[1]
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vdup.16             q0, d20[2]                  ; proload next 2 rows data
+    vdup.16             q8, d20[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vdup.16             q0, d21[0]                  ; proload next 2 rows data
+    vdup.16             q8, d21[1]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vdup.16             q0, d21[2]                  ; proload next 2 rows data
+    vdup.16             q8, d21[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vld1.8              {d18}, [r3]!                  ; preload 8 left into r12
+    vmovl.u8            q10, d18
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    subs                r2, r2, #1
+    bgt                 loop_16x16_neon
+
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_16x16_neon|
+
+;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                  const uint8_t *above,
+;                                  const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vpx_tm_predictor_32x32_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.8              {d0[]}, [r12]
+
+    ; Load above 32 pixels
+    vld1.8              {q1}, [r2]!
+    vld1.8              {q2}, [r2]
+
+    ; preload 8 left pixels
+    vld1.8              {d26}, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q8, d2, d0
+    vsubl.u8            q9, d3, d0
+    vsubl.u8            q10, d4, d0
+    vsubl.u8            q11, d5, d0
+
+    vmovl.u8            q3, d26
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+    mov                 r2, #4
+
+loop_32x32_neon
+    ; Process two rows.
+    vdup.16             q0, d6[0]
+    vdup.16             q2, d6[1]
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vdup.16             q1, d6[2]
+    vdup.16             q2, d6[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q1, q8
+    vadd.s16            q13, q1, q9
+    vadd.s16            q14, q1, q10
+    vadd.s16            q15, q1, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vdup.16             q0, d7[0]
+    vdup.16             q2, d7[1]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vdup.16             q0, d7[2]
+    vdup.16             q2, d7[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vmovl.u8            q3, d0
+    vst1.64             {d24-d27}, [r0], r1
+
+    subs                r2, r2, #1
+    bgt                 loop_32x32_neon
+
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_32x32_neon|
+
+    END
diff --git a/aom_dsp/arm/loopfilter_16_neon.asm b/aom_dsp/arm/loopfilter_16_neon.asm
new file mode 100644
index 000000000..5a8fdd6af
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_16_neon.asm
@@ -0,0 +1,199 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0    uint8_t *s,
+; r1    int p,
+; r2    const uint8_t *blimit0,
+; r3    const uint8_t *limit0,
+; sp    const uint8_t *thresh0,
+; sp+4  const uint8_t *blimit1,
+; sp+8  const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vpx_lpf_horizontal_4_dual_neon| PROC
+    push        {lr}
+
+    ldr         r12, [sp, #4]              ; load thresh0
+    vld1.8      {d0}, [r2]                 ; load blimit0 to first half q
+    vld1.8      {d2}, [r3]                 ; load limit0 to first half q
+
+    add         r1, r1, r1                 ; double pitch
+    ldr         r2, [sp, #8]               ; load blimit1
+
+    vld1.8      {d4}, [r12]                ; load thresh0 to first half q
+
+    ldr         r3, [sp, #12]              ; load limit1
+    ldr         r12, [sp, #16]             ; load thresh1
+    vld1.8      {d1}, [r2]                 ; load blimit1 to 2nd half q
+
+    sub         r2, r0, r1, lsl #1         ; s[-4 * p]
+
+    vld1.8      {d3}, [r3]                 ; load limit1 to 2nd half q
+    vld1.8      {d5}, [r12]                ; load thresh1 to 2nd half q
+
+    vpush       {d8-d15}                   ; save neon registers
+
+    add         r3, r2, r1, lsr #1         ; s[-3 * p]
+
+    vld1.u8     {q3}, [r2@64], r1          ; p3
+    vld1.u8     {q4}, [r3@64], r1          ; p2
+    vld1.u8     {q5}, [r2@64], r1          ; p1
+    vld1.u8     {q6}, [r3@64], r1          ; p0
+    vld1.u8     {q7}, [r2@64], r1          ; q0
+    vld1.u8     {q8}, [r3@64], r1          ; q1
+    vld1.u8     {q9}, [r2@64]              ; q2
+    vld1.u8     {q10}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r3, r3, r1, lsl #1
+
+    bl          vpx_loop_filter_neon_16
+
+    vst1.u8     {q5}, [r2@64], r1          ; store op1
+    vst1.u8     {q6}, [r3@64], r1          ; store op0
+    vst1.u8     {q7}, [r2@64], r1          ; store oq0
+    vst1.u8     {q8}, [r3@64], r1          ; store oq1
+
+    vpop        {d8-d15}                   ; restore neon registers
+
+    pop         {pc}
+    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
+
+; void vpx_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0    blimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
+;
+; Outputs:
+; q5    op1
+; q6    op0
+; q7    oq0
+; q8    oq1
+|vpx_loop_filter_neon_16| PROC
+
+    ; filter_mask
+    vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; m2 = abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; m3 = abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; m4 = abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; m5 = abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
+    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
+
+    vmov.u8     q10, #0x80
+
+    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
+
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    veor        q7, q7, q10                 ; qs0
+
+    vcge.u8     q15, q1, q15                ; abs(m11) > limit
+
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u16    q4, #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; a > blimit
+
+    vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; hev
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; filter &= hev
+    vand        q15, q15, q9                ; mask
+
+    vmov.u8     q4, #3
+
+    vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; filter &= mask
+
+    vqadd.s8    q2, q1, q4                  ; filter2 = clamp(filter+3)
+    vqadd.s8    q1, q1, q9                  ; filter1 = clamp(filter+4)
+    vshr.s8     q2, q2, #3                  ; filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; filter1 >>= 3
+
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + filter2)
+    vqsub.s8    q0, q7, q1                  ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
+
+    veor        q7, q0,  q10                ; *oq0 = u^0x80
+
+    vbic        q1, q1, q14                 ; filter &= ~hev
+
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
+
+    veor        q6, q11, q10                ; *op0 = u^0x80
+    veor        q5, q13, q10                ; *op1 = u^0x80
+    veor        q8, q12, q10                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vpx_loop_filter_neon_16|
+
+    END
diff --git a/aom_dsp/arm/loopfilter_16_neon.c b/aom_dsp/arm/loopfilter_16_neon.c
new file mode 100644
index 000000000..70087f988
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,173 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "aom/vpx_integer.h"
+
+static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
+                                       uint8x16_t qlimit,   // limit
+                                       uint8x16_t qthresh,  // thresh
+                                       uint8x16_t q3,       // p3
+                                       uint8x16_t q4,       // p2
+                                       uint8x16_t q5,       // p1
+                                       uint8x16_t q6,       // p0
+                                       uint8x16_t q7,       // q0
+                                       uint8x16_t q8,       // q1
+                                       uint8x16_t q9,       // q2
+                                       uint8x16_t q10,      // q3
+                                       uint8x16_t *q5r,     // p1
+                                       uint8x16_t *q6r,     // p0
+                                       uint8x16_t *q7r,     // q0
+                                       uint8x16_t *q8r) {   // q1
+  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+  int16x8_t q2s16, q11s16;
+  uint16x8_t q4u16;
+  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+  int8x8_t d2s8, d3s8;
+
+  q11u8 = vabdq_u8(q3, q4);
+  q12u8 = vabdq_u8(q4, q5);
+  q13u8 = vabdq_u8(q5, q6);
+  q14u8 = vabdq_u8(q8, q7);
+  q3 = vabdq_u8(q9, q8);
+  q4 = vabdq_u8(q10, q9);
+
+  q11u8 = vmaxq_u8(q11u8, q12u8);
+  q12u8 = vmaxq_u8(q13u8, q14u8);
+  q3 = vmaxq_u8(q3, q4);
+  q15u8 = vmaxq_u8(q11u8, q12u8);
+
+  q9 = vabdq_u8(q6, q7);
+
+  // vp8_hevmask
+  q13u8 = vcgtq_u8(q13u8, qthresh);
+  q14u8 = vcgtq_u8(q14u8, qthresh);
+  q15u8 = vmaxq_u8(q15u8, q3);
+
+  q2u8 = vabdq_u8(q5, q8);
+  q9 = vqaddq_u8(q9, q9);
+
+  q15u8 = vcgeq_u8(qlimit, q15u8);
+
+  // vp8_filter() function
+  // convert to signed
+  q10 = vdupq_n_u8(0x80);
+  q8 = veorq_u8(q8, q10);
+  q7 = veorq_u8(q7, q10);
+  q6 = veorq_u8(q6, q10);
+  q5 = veorq_u8(q5, q10);
+
+  q2u8 = vshrq_n_u8(q2u8, 1);
+  q9 = vqaddq_u8(q9, q2u8);
+
+  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                   vget_low_s8(vreinterpretq_s8_u8(q6)));
+  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                    vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+  q9 = vcgeq_u8(qblimit, q9);
+
+  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
+
+  q14u8 = vorrq_u8(q13u8, q14u8);
+
+  q4u16 = vdupq_n_u16(3);
+  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+  q15u8 = vandq_u8(q15u8, q9);
+
+  q1s8 = vreinterpretq_s8_u8(q1u8);
+  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+  q4 = vdupq_n_u8(3);
+  q9 = vdupq_n_u8(4);
+  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+  d2s8 = vqmovn_s16(q2s16);
+  d3s8 = vqmovn_s16(q11s16);
+  q1s8 = vcombine_s8(d2s8, d3s8);
+  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+  q1s8 = vreinterpretq_s8_u8(q1u8);
+
+  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+  q2s8 = vshrq_n_s8(q2s8, 3);
+  q1s8 = vshrq_n_s8(q1s8, 3);
+
+  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+  q1s8 = vrshrq_n_s8(q1s8, 1);
+  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+  return;
+}
+
+void vpx_lpf_horizontal_4_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+  uint8x16_t qblimit, qlimit, qthresh;
+  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+  dblimit0 = vld1_u8(blimit0);
+  dlimit0 = vld1_u8(limit0);
+  dthresh0 = vld1_u8(thresh0);
+  dblimit1 = vld1_u8(blimit1);
+  dlimit1 = vld1_u8(limit1);
+  dthresh1 = vld1_u8(thresh1);
+  qblimit = vcombine_u8(dblimit0, dblimit1);
+  qlimit = vcombine_u8(dlimit0, dlimit1);
+  qthresh = vcombine_u8(dthresh0, dthresh1);
+
+  s -= (p << 2);
+
+  q3u8 = vld1q_u8(s);
+  s += p;
+  q4u8 = vld1q_u8(s);
+  s += p;
+  q5u8 = vld1q_u8(s);
+  s += p;
+  q6u8 = vld1q_u8(s);
+  s += p;
+  q7u8 = vld1q_u8(s);
+  s += p;
+  q8u8 = vld1q_u8(s);
+  s += p;
+  q9u8 = vld1q_u8(s);
+  s += p;
+  q10u8 = vld1q_u8(s);
+
+  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
+                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
+
+  s -= (p * 5);
+  vst1q_u8(s, q5u8);
+  s += p;
+  vst1q_u8(s, q6u8);
+  s += p;
+  vst1q_u8(s, q7u8);
+  s += p;
+  vst1q_u8(s, q8u8);
+  return;
+}
diff --git a/aom_dsp/arm/loopfilter_4_neon.asm b/aom_dsp/arm/loopfilter_4_neon.asm
new file mode 100644
index 000000000..937115898
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,249 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_4_neon|
+    EXPORT  |vpx_lpf_vertical_4_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
+;                                int p /* pitch */,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vpx_lpf_horizontal_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r2, [sp, #4]               ; load thresh
+    add         r1, r1, r1                 ; double pitch
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
+    add         r3, r2, r1, lsr #1         ; set to 3 lines down
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r3@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r3@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r3@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r3, r3, r1, lsl #1
+
+    bl          vpx_loop_filter_neon
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r3@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r3@64], r1          ; store oq1
+
+    pop         {pc}
+    ENDP        ; |vpx_lpf_horizontal_4_neon|
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
+;                              int p /* pitch */,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vpx_lpf_vertical_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+    vld1.u8     {d3}, [r2], r1             ; load s data
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vpx_loop_filter_neon
+
+    sub         r0, r0, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+    pop         {pc}
+    ENDP        ; |vpx_lpf_vertical_4_neon|
+
+; void vpx_loop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+|vpx_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
+
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; abs(m1) > limit
+
+    ; filter() function
+    ; convert to signed
+
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17                ; a > blimit
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vpx_loop_filter_neon|
+
+    END
diff --git a/aom_dsp/arm/loopfilter_4_neon.c b/aom_dsp/arm/loopfilter_4_neon.c
new file mode 100644
index 000000000..1c1e80e00
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,249 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(uint8x8_t dblimit,   // flimit
+                                    uint8x8_t dlimit,    // limit
+                                    uint8x8_t dthresh,   // thresh
+                                    uint8x8_t d3u8,      // p3
+                                    uint8x8_t d4u8,      // p2
+                                    uint8x8_t d5u8,      // p1
+                                    uint8x8_t d6u8,      // p0
+                                    uint8x8_t d7u8,      // q0
+                                    uint8x8_t d16u8,     // q1
+                                    uint8x8_t d17u8,     // q2
+                                    uint8x8_t d18u8,     // q3
+                                    uint8x8_t *d4ru8,    // p1
+                                    uint8x8_t *d5ru8,    // p0
+                                    uint8x8_t *d6ru8,    // q0
+                                    uint8x8_t *d7ru8) {  // q1
+  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+  int16x8_t q12s16;
+  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+  d19u8 = vabd_u8(d3u8, d4u8);
+  d20u8 = vabd_u8(d4u8, d5u8);
+  d21u8 = vabd_u8(d5u8, d6u8);
+  d22u8 = vabd_u8(d16u8, d7u8);
+  d3u8 = vabd_u8(d17u8, d16u8);
+  d4u8 = vabd_u8(d18u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);
+  d3u8 = vmax_u8(d3u8, d4u8);
+  d23u8 = vmax_u8(d19u8, d20u8);
+
+  d17u8 = vabd_u8(d6u8, d7u8);
+
+  d21u8 = vcgt_u8(d21u8, dthresh);
+  d22u8 = vcgt_u8(d22u8, dthresh);
+  d23u8 = vmax_u8(d23u8, d3u8);
+
+  d28u8 = vabd_u8(d5u8, d16u8);
+  d17u8 = vqadd_u8(d17u8, d17u8);
+
+  d23u8 = vcge_u8(dlimit, d23u8);
+
+  d18u8 = vdup_n_u8(0x80);
+  d5u8 = veor_u8(d5u8, d18u8);
+  d6u8 = veor_u8(d6u8, d18u8);
+  d7u8 = veor_u8(d7u8, d18u8);
+  d16u8 = veor_u8(d16u8, d18u8);
+
+  d28u8 = vshr_n_u8(d28u8, 1);
+  d17u8 = vqadd_u8(d17u8, d28u8);
+
+  d19u8 = vdup_n_u8(3);
+
+  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
+
+  d17u8 = vcge_u8(dblimit, d17u8);
+
+  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
+
+  d22u8 = vorr_u8(d21u8, d22u8);
+
+  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+  d23u8 = vand_u8(d23u8, d17u8);
+
+  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+  d17u8 = vdup_n_u8(4);
+
+  d27s8 = vqmovn_s16(q12s16);
+  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+  d27s8 = vreinterpret_s8_u8(d27u8);
+
+  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+  d28s8 = vshr_n_s8(d28s8, 3);
+  d27s8 = vshr_n_s8(d27s8, 3);
+
+  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+  d27s8 = vrshr_n_s8(d27s8, 1);
+  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+  return;
+}
+
+void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  int i;
+  uint8_t *s, *psrc;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  psrc = src - (pitch << 2);
+  for (i = 0; i < 1; i++) {
+    s = psrc + i * 8;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+    s -= (pitch * 5);
+    vst1_u8(s, d4u8);
+    s += pitch;
+    vst1_u8(s, d5u8);
+    s += pitch;
+    vst1_u8(s, d6u8);
+    s += pitch;
+    vst1_u8(s, d7u8);
+  }
+  return;
+}
+
+void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  int i, pitch8;
+  uint8_t *s;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  pitch8 = pitch * 8;
+  for (i = 0; i < 1; i++, src += pitch8) {
+    s = src - (i + 1) * 4;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+    d4Result.val[0] = d4u8;
+    d4Result.val[1] = d5u8;
+    d4Result.val[2] = d6u8;
+    d4Result.val[3] = d7u8;
+
+    src -= 2;
+    vst4_lane_u8(src, d4Result, 0);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 1);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 2);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 3);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 4);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 5);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 6);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 7);
+  }
+  return;
+}
diff --git a/aom_dsp/arm/loopfilter_8_neon.asm b/aom_dsp/arm/loopfilter_8_neon.asm
new file mode 100644
index 000000000..a2f20e15f
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_8_neon.asm
@@ -0,0 +1,425 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vpx_lpf_horizontal_8_neon| PROC
+    push        {r4-r5, lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r2, [sp, #12]              ; load thresh
+    add         r1, r1, r1                 ; double pitch
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+    sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines
+    add         r2, r3, r1, lsr #1         ; set to 3 lines down
+
+    vld1.u8     {d3}, [r3@64], r1          ; p3
+    vld1.u8     {d4}, [r2@64], r1          ; p2
+    vld1.u8     {d5}, [r3@64], r1          ; p1
+    vld1.u8     {d6}, [r2@64], r1          ; p0
+    vld1.u8     {d7}, [r3@64], r1          ; q0
+    vld1.u8     {d16}, [r2@64], r1         ; q1
+    vld1.u8     {d17}, [r3@64]             ; q2
+    vld1.u8     {d18}, [r2@64], r1         ; q3
+
+    sub         r3, r3, r1, lsl #1
+    sub         r2, r2, r1, lsl #2
+
+    bl          vpx_mbloop_filter_neon
+
+    vst1.u8     {d0}, [r2@64], r1          ; store op2
+    vst1.u8     {d1}, [r3@64], r1          ; store op1
+    vst1.u8     {d2}, [r2@64], r1          ; store op0
+    vst1.u8     {d3}, [r3@64], r1          ; store oq0
+    vst1.u8     {d4}, [r2@64], r1          ; store oq1
+    vst1.u8     {d5}, [r3@64], r1          ; store oq2
+
+    pop         {r4-r5, pc}
+
+    ENDP        ; |vpx_lpf_horizontal_8_neon|
+
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
+;                              int pitch,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh)
+;
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vpx_lpf_vertical_8_neon| PROC
+    push        {r4-r5, lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #12]             ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+    vld1.u8     {d3}, [r2], r1             ; load s data
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    sub         r2, r0, #3
+    add         r3, r0, #1
+
+    bl          vpx_mbloop_filter_neon
+
+    ;store op2, op1, op0, oq0
+    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+    ;store oq1, oq2
+    vst2.8      {d4[0], d5[0]}, [r3], r1
+    vst2.8      {d4[1], d5[1]}, [r3], r1
+    vst2.8      {d4[2], d5[2]}, [r3], r1
+    vst2.8      {d4[3], d5[3]}, [r3], r1
+    vst2.8      {d4[4], d5[4]}, [r3], r1
+    vst2.8      {d4[5], d5[5]}, [r3], r1
+    vst2.8      {d4[6], d5[6]}, [r3], r1
+    vst2.8      {d4[7], d5[7]}, [r3]
+
+    pop         {r4-r5, pc}
+    ENDP        ; |vpx_lpf_vertical_8_neon|
+
+; void vpx_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d0    op2
+; d1    op1
+; d2    op0
+; d3    oq0
+; d4    oq1
+; d5    oq2
+|vpx_mbloop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
+    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
+    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
+
+    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
+
+    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
+
+    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
+
+    vmax.u8     d19, d19, d20
+
+    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
+    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
+    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
+
+    vmax.u8     d19, d19, d23
+
+    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; abs () > limit
+    vcge.u8     d19, d1, d19
+
+    ; only compare the largest value to thresh
+    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
+    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+
+    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
+
+    vqadd.u8    d24, d24, d23              ; a = b + a
+
+    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
+
+    vmov.u8     d23, #1
+    vcge.u8     d24, d0, d24               ; a > blimit
+
+    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
+
+    vcge.u8     d20, d23, d20              ; flat
+
+    vand        d19, d19, d24              ; mask
+
+    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
+
+    vand        d20, d20, d19              ; flat & mask
+
+    vmov.u8     d22, #0x80
+
+    vorr        d23, d21, d23              ; hev
+
+    ; This instruction will truncate the "flat & mask" masks down to 4 bits
+    ; each to fit into one 32 bit arm register. The values are stored in
+    ; q10.64[0].
+    vshrn.u16   d30, q10, #4
+    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
+
+    adds        r5, r4, #1                 ; Check for all 1's
+
+    ; If mask and flat are 1's for all vectors, then we only need to execute
+    ; the power branch for all vectors.
+    beq         power_branch_only
+
+    cmp         r4, #0                     ; Check for 0, set flag for later
+
+    ; mbfilter() function
+    ; filter() function
+    ; convert to signed
+    veor        d21, d7, d22               ; qs0
+    veor        d24, d6, d22               ; ps0
+    veor        d25, d5, d22               ; ps1
+    veor        d26, d16, d22              ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
+
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+
+    vand        d29, d29, d23              ; filter &= hev
+
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter1 >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d23              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    ; If mask and flat are 0's for all vectors, then we only need to execute
+    ; the filter branch for all vectors.
+    beq         filter_branch_only
+
+    ; If mask and flat are mixed then we must perform both branches and
+    ; combine the data.
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d21, d21, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    ; At this point we have already executed the filter branch. The filter
+    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+    ; branch and combine the data.
+    vmov.u8     d23, #2
+    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
+    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
+
+    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
+
+    vaddw.u8    q14, d5                    ; r_op2 += p1
+
+    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
+
+    vqrshrn.u16 d30, q14, #3               ; r_op2
+
+    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
+    vsubw.u8    q14, d4                    ; r_op1 -= p2
+    vaddw.u8    q14, d5                    ; r_op1 += p1
+    vaddw.u8    q14, d16                   ; r_op1 += q1
+
+    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
+
+    vqrshrn.u16 d31, q14, #3               ; r_op1
+
+    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
+    vsubw.u8    q14, d5                    ; r_op0 -= p1
+    vaddw.u8    q14, d6                    ; r_op0 += p0
+    vaddw.u8    q14, d17                   ; r_op0 += q2
+
+    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
+
+    vqrshrn.u16 d23, q14, #3               ; r_op0
+
+    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
+    vsubw.u8    q14, d6                    ; r_oq0 -= p0
+    vaddw.u8    q14, d7                    ; r_oq0 += q0
+
+    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
+
+    vaddw.u8    q14, d18                   ; oq0 += q3
+
+    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
+
+    vqrshrn.u16 d22, q14, #3               ; r_oq0
+
+    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
+    vsubw.u8    q14, d7                    ; r_oq1 -= q0
+    vaddw.u8    q14, d16                   ; r_oq1 += q1
+
+    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
+
+    vaddw.u8    q14, d18                   ; r_oq1 += q3
+
+    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
+
+    vqrshrn.u16 d6, q14, #3                ; r_oq1
+
+    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
+    vsubw.u8    q14, d16                   ; r_oq2 -= q1
+    vaddw.u8    q14, d17                   ; r_oq2 += q2
+    vaddw.u8    q14, d18                   ; r_oq2 += q3
+
+    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
+
+    vqrshrn.u16 d7, q14, #3                ; r_oq2
+
+    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
+    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
+    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
+
+    bx          lr
+
+power_branch_only
+    vmov.u8     d27, #3
+    vmov.u8     d21, #2
+    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
+    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
+    vaddw.u8    q14, d5                    ; op2 += p1
+    vqrshrn.u16 d0, q14, #3                ; op2
+
+    vsubw.u8    q14, d3                    ; op1 = op2 - p3
+    vsubw.u8    q14, d4                    ; op1 -= p2
+    vaddw.u8    q14, d5                    ; op1 += p1
+    vaddw.u8    q14, d16                   ; op1 += q1
+    vqrshrn.u16 d1, q14, #3                ; op1
+
+    vsubw.u8    q14, d3                    ; op0 = op1 - p3
+    vsubw.u8    q14, d5                    ; op0 -= p1
+    vaddw.u8    q14, d6                    ; op0 += p0
+    vaddw.u8    q14, d17                   ; op0 += q2
+    vqrshrn.u16 d2, q14, #3                ; op0
+
+    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
+    vsubw.u8    q14, d6                    ; oq0 -= p0
+    vaddw.u8    q14, d7                    ; oq0 += q0
+    vaddw.u8    q14, d18                   ; oq0 += q3
+    vqrshrn.u16 d3, q14, #3                ; oq0
+
+    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
+    vsubw.u8    q14, d7                    ; oq1 -= q0
+    vaddw.u8    q14, d16                   ; oq1 += q1
+    vaddw.u8    q14, d18                   ; oq1 += q3
+    vqrshrn.u16 d4, q14, #3                ; oq1
+
+    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
+    vsubw.u8    q14, d16                   ; oq2 -= q1
+    vaddw.u8    q14, d17                   ; oq2 += q2
+    vaddw.u8    q14, d18                   ; oq2 += q3
+    vqrshrn.u16 d5, q14, #3                ; oq2
+
+    bx          lr
+
+filter_branch_only
+    ; TODO(fgalligan): See if we can rearange registers so we do not need to
+    ; do the 2 vswp.
+    vswp        d0, d4                      ; op2
+    vswp        d5, d17                     ; oq2
+    veor        d2, d24, d22                ; *op0 = u^0x80
+    veor        d3, d21, d22                ; *oq0 = u^0x80
+    veor        d1, d25, d22                ; *op1 = u^0x80
+    veor        d4, d26, d22                ; *oq1 = u^0x80
+
+    bx          lr
+
+    ENDP        ; |vpx_mbloop_filter_neon|
+
+    END
diff --git a/aom_dsp/arm/loopfilter_8_neon.c b/aom_dsp/arm/loopfilter_8_neon.c
new file mode 100644
index 000000000..854196f42
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,429 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
+                                      uint8x8_t dlimit,    // limit
+                                      uint8x8_t dthresh,   // thresh
+                                      uint8x8_t d3u8,      // p2
+                                      uint8x8_t d4u8,      // p2
+                                      uint8x8_t d5u8,      // p1
+                                      uint8x8_t d6u8,      // p0
+                                      uint8x8_t d7u8,      // q0
+                                      uint8x8_t d16u8,     // q1
+                                      uint8x8_t d17u8,     // q2
+                                      uint8x8_t d18u8,     // q3
+                                      uint8x8_t *d0ru8,    // p1
+                                      uint8x8_t *d1ru8,    // p1
+                                      uint8x8_t *d2ru8,    // p0
+                                      uint8x8_t *d3ru8,    // q0
+                                      uint8x8_t *d4ru8,    // q1
+                                      uint8x8_t *d5ru8) {  // q1
+  uint32_t flat;
+  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+  int16x8_t q15s16;
+  uint16x8_t q10u16, q14u16;
+  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+  d19u8 = vabd_u8(d3u8, d4u8);
+  d20u8 = vabd_u8(d4u8, d5u8);
+  d21u8 = vabd_u8(d5u8, d6u8);
+  d22u8 = vabd_u8(d16u8, d7u8);
+  d23u8 = vabd_u8(d17u8, d16u8);
+  d24u8 = vabd_u8(d18u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);
+
+  d25u8 = vabd_u8(d6u8, d4u8);
+
+  d23u8 = vmax_u8(d23u8, d24u8);
+
+  d26u8 = vabd_u8(d7u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+
+  d24u8 = vabd_u8(d6u8, d7u8);
+  d27u8 = vabd_u8(d3u8, d6u8);
+  d28u8 = vabd_u8(d18u8, d7u8);
+
+  d19u8 = vmax_u8(d19u8, d23u8);
+
+  d23u8 = vabd_u8(d5u8, d16u8);
+  d24u8 = vqadd_u8(d24u8, d24u8);
+
+  d19u8 = vcge_u8(dlimit, d19u8);
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+  d26u8 = vmax_u8(d27u8, d28u8);
+
+  d23u8 = vshr_n_u8(d23u8, 1);
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+
+  d24u8 = vqadd_u8(d24u8, d23u8);
+
+  d20u8 = vmax_u8(d20u8, d25u8);
+
+  d23u8 = vdup_n_u8(1);
+  d24u8 = vcge_u8(dblimit, d24u8);
+
+  d21u8 = vcgt_u8(d21u8, dthresh);
+
+  d20u8 = vcge_u8(d23u8, d20u8);
+
+  d19u8 = vand_u8(d19u8, d24u8);
+
+  d23u8 = vcgt_u8(d22u8, dthresh);
+
+  d20u8 = vand_u8(d20u8, d19u8);
+
+  d22u8 = vdup_n_u8(0x80);
+
+  d23u8 = vorr_u8(d21u8, d23u8);
+
+  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
+
+  d30u8 = vshrn_n_u16(q10u16, 4);
+  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+    d27u8 = vdup_n_u8(3);
+    d21u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+    *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vsubw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vsubw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+  } else {
+    d21u8 = veor_u8(d7u8, d22u8);
+    d24u8 = veor_u8(d6u8, d22u8);
+    d25u8 = veor_u8(d5u8, d22u8);
+    d26u8 = veor_u8(d16u8, d22u8);
+
+    d27u8 = vdup_n_u8(3);
+
+    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+    q15s16 = vaddw_s8(q15s16, d29s8);
+
+    d29u8 = vdup_n_u8(4);
+
+    d28s8 = vqmovn_s16(q15s16);
+
+    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+    d30s8 = vshr_n_s8(d30s8, 3);
+    d29s8 = vshr_n_s8(d29s8, 3);
+
+    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+    d29s8 = vrshr_n_s8(d29s8, 1);
+    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+    if (flat == 0) {  // filter_branch_only
+      *d0ru8 = d4u8;
+      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+      *d5ru8 = d17u8;
+      return;
+    }
+
+    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+    d23u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+    q14u16 = vaddw_u8(q14u16, d5u8);
+
+    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+    d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+
+    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+    d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+
+    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+    d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d7u8);
+
+    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+    d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vsubw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+
+    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+    d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vsubw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+    d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+  }
+  return;
+}
+
+void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  int i;
+  uint8_t *s, *psrc;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  psrc = src - (pitch << 2);
+  for (i = 0; i < 1; i++) {
+    s = psrc + i * 8;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+                       &d5u8);
+
+    s -= (pitch * 6);
+    vst1_u8(s, d0u8);
+    s += pitch;
+    vst1_u8(s, d1u8);
+    s += pitch;
+    vst1_u8(s, d2u8);
+    s += pitch;
+    vst1_u8(s, d3u8);
+    s += pitch;
+    vst1_u8(s, d4u8);
+    s += pitch;
+    vst1_u8(s, d5u8);
+  }
+  return;
+}
+
+void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  int i;
+  uint8_t *s;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+  uint8x8x2_t d2Result;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  for (i = 0; i < 1; i++) {
+    s = src + (i * (pitch << 3)) - 4;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+                       &d5u8);
+
+    d4Result.val[0] = d0u8;
+    d4Result.val[1] = d1u8;
+    d4Result.val[2] = d2u8;
+    d4Result.val[3] = d3u8;
+
+    d2Result.val[0] = d4u8;
+    d2Result.val[1] = d5u8;
+
+    s = src - 3;
+    vst4_lane_u8(s, d4Result, 0);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 1);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 2);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 3);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 4);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 5);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 6);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 7);
+
+    s = src + 1;
+    vst2_lane_u8(s, d2Result, 0);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 1);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 2);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 3);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 4);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 5);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 6);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 7);
+  }
+  return;
+}
diff --git a/aom_dsp/arm/loopfilter_mb_neon.asm b/aom_dsp/arm/loopfilter_mb_neon.asm
new file mode 100644
index 000000000..d5da7a840
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_mb_neon.asm
@@ -0,0 +1,635 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_edge_8_neon|
+    EXPORT  |vpx_lpf_horizontal_edge_16_neon|
+    EXPORT  |vpx_lpf_vertical_16_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+;                             const uint8_t *blimit,
+;                             const uint8_t *limit,
+;                             const uint8_t *thresh,
+;                             int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; r12   int count
+|mb_lpf_horizontal_edge| PROC
+    push        {r4-r8, lr}
+    vpush       {d8-d15}
+    ldr         r4, [sp, #88]              ; load thresh
+
+h_count
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+
+    sub         r8, r0, r1, lsl #3         ; move src pointer down by 8 lines
+
+    vld1.u8     {d0}, [r8@64], r1          ; p7
+    vld1.u8     {d1}, [r8@64], r1          ; p6
+    vld1.u8     {d2}, [r8@64], r1          ; p5
+    vld1.u8     {d3}, [r8@64], r1          ; p4
+    vld1.u8     {d4}, [r8@64], r1          ; p3
+    vld1.u8     {d5}, [r8@64], r1          ; p2
+    vld1.u8     {d6}, [r8@64], r1          ; p1
+    vld1.u8     {d7}, [r8@64], r1          ; p0
+    vld1.u8     {d8}, [r8@64], r1          ; q0
+    vld1.u8     {d9}, [r8@64], r1          ; q1
+    vld1.u8     {d10}, [r8@64], r1         ; q2
+    vld1.u8     {d11}, [r8@64], r1         ; q3
+    vld1.u8     {d12}, [r8@64], r1         ; q4
+    vld1.u8     {d13}, [r8@64], r1         ; q5
+    vld1.u8     {d14}, [r8@64], r1         ; q6
+    vld1.u8     {d15}, [r8@64], r1         ; q7
+
+    bl          vpx_wide_mbfilter_neon
+
+    tst         r7, #1
+    beq         h_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, r1, lsl #1
+
+    vst1.u8     {d25}, [r8@64], r1         ; store op1
+    vst1.u8     {d24}, [r8@64], r1         ; store op0
+    vst1.u8     {d23}, [r8@64], r1         ; store oq0
+    vst1.u8     {d26}, [r8@64], r1         ; store oq1
+
+    b           h_next
+
+h_mbfilter
+    tst         r7, #2
+    beq         h_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, r1, lsl #1
+    sub         r8, r8, r1
+
+    vst1.u8     {d18}, [r8@64], r1         ; store op2
+    vst1.u8     {d19}, [r8@64], r1         ; store op1
+    vst1.u8     {d20}, [r8@64], r1         ; store op0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq0
+    vst1.u8     {d22}, [r8@64], r1         ; store oq1
+    vst1.u8     {d23}, [r8@64], r1         ; store oq2
+
+    b           h_next
+
+h_wide_mbfilter
+    sub         r8, r0, r1, lsl #3
+    add         r8, r8, r1
+
+    vst1.u8     {d16}, [r8@64], r1         ; store op6
+    vst1.u8     {d24}, [r8@64], r1         ; store op5
+    vst1.u8     {d25}, [r8@64], r1         ; store op4
+    vst1.u8     {d26}, [r8@64], r1         ; store op3
+    vst1.u8     {d27}, [r8@64], r1         ; store op2
+    vst1.u8     {d18}, [r8@64], r1         ; store op1
+    vst1.u8     {d19}, [r8@64], r1         ; store op0
+    vst1.u8     {d20}, [r8@64], r1         ; store oq0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq1
+    vst1.u8     {d22}, [r8@64], r1         ; store oq2
+    vst1.u8     {d23}, [r8@64], r1         ; store oq3
+    vst1.u8     {d1}, [r8@64], r1          ; store oq4
+    vst1.u8     {d2}, [r8@64], r1          ; store oq5
+    vst1.u8     {d3}, [r8@64], r1          ; store oq6
+
+h_next
+    add         r0, r0, #8
+    subs        r12, r12, #1
+    bne         h_count
+
+    vpop        {d8-d15}
+    pop         {r4-r8, pc}
+
+    ENDP        ; |mb_lpf_horizontal_edge|
+
+; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
+;                                     const uint8_t *blimit,
+;                                     const uint8_t *limit,
+;                                     const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh
+|vpx_lpf_horizontal_edge_8_neon| PROC
+    mov r12, #1
+    b mb_lpf_horizontal_edge
+    ENDP        ; |vpx_lpf_horizontal_edge_8_neon|
+
+; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
+;                                      const uint8_t *blimit,
+;                                      const uint8_t *limit,
+;                                      const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh
+|vpx_lpf_horizontal_edge_16_neon| PROC
+    mov r12, #2
+    b mb_lpf_horizontal_edge
+    ENDP        ; |vpx_lpf_horizontal_edge_16_neon|
+
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
+;                               const uint8_t *blimit,
+;                               const uint8_t *limit,
+;                               const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vpx_lpf_vertical_16_neon| PROC
+    push        {r4-r8, lr}
+    vpush       {d8-d15}
+    ldr         r4, [sp, #88]              ; load thresh
+
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+
+    sub         r8, r0, #8
+
+    vld1.8      {d0}, [r8@64], r1
+    vld1.8      {d8}, [r0@64], r1
+    vld1.8      {d1}, [r8@64], r1
+    vld1.8      {d9}, [r0@64], r1
+    vld1.8      {d2}, [r8@64], r1
+    vld1.8      {d10}, [r0@64], r1
+    vld1.8      {d3}, [r8@64], r1
+    vld1.8      {d11}, [r0@64], r1
+    vld1.8      {d4}, [r8@64], r1
+    vld1.8      {d12}, [r0@64], r1
+    vld1.8      {d5}, [r8@64], r1
+    vld1.8      {d13}, [r0@64], r1
+    vld1.8      {d6}, [r8@64], r1
+    vld1.8      {d14}, [r0@64], r1
+    vld1.8      {d7}, [r8@64], r1
+    vld1.8      {d15}, [r0@64], r1
+
+    sub         r0, r0, r1, lsl #3
+
+    vtrn.32     q0, q2
+    vtrn.32     q1, q3
+    vtrn.32     q4, q6
+    vtrn.32     q5, q7
+
+    vtrn.16     q0, q1
+    vtrn.16     q2, q3
+    vtrn.16     q4, q5
+    vtrn.16     q6, q7
+
+    vtrn.8      d0, d1
+    vtrn.8      d2, d3
+    vtrn.8      d4, d5
+    vtrn.8      d6, d7
+
+    vtrn.8      d8, d9
+    vtrn.8      d10, d11
+    vtrn.8      d12, d13
+    vtrn.8      d14, d15
+
+    bl          vpx_wide_mbfilter_neon
+
+    tst         r7, #1
+    beq         v_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, #2
+
+    vswp        d23, d25
+
+    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+    b           v_end
+
+v_mbfilter
+    tst         r7, #2
+    beq         v_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, #3
+
+    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1
+    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1
+    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
+    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
+    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
+    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
+    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
+    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
+    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
+    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
+    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
+    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
+    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
+    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
+    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
+    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
+
+    b           v_end
+
+v_wide_mbfilter
+    sub         r8, r0, #8
+
+    vtrn.32     d0,  d26
+    vtrn.32     d16, d27
+    vtrn.32     d24, d18
+    vtrn.32     d25, d19
+
+    vtrn.16     d0,  d24
+    vtrn.16     d16, d25
+    vtrn.16     d26, d18
+    vtrn.16     d27, d19
+
+    vtrn.8      d0,  d16
+    vtrn.8      d24, d25
+    vtrn.8      d26, d27
+    vtrn.8      d18, d19
+
+    vtrn.32     d20, d1
+    vtrn.32     d21, d2
+    vtrn.32     d22, d3
+    vtrn.32     d23, d15
+
+    vtrn.16     d20, d22
+    vtrn.16     d21, d23
+    vtrn.16     d1,  d3
+    vtrn.16     d2,  d15
+
+    vtrn.8      d20, d21
+    vtrn.8      d22, d23
+    vtrn.8      d1,  d2
+    vtrn.8      d3,  d15
+
+    vst1.8      {d0}, [r8@64], r1
+    vst1.8      {d20}, [r0@64], r1
+    vst1.8      {d16}, [r8@64], r1
+    vst1.8      {d21}, [r0@64], r1
+    vst1.8      {d24}, [r8@64], r1
+    vst1.8      {d22}, [r0@64], r1
+    vst1.8      {d25}, [r8@64], r1
+    vst1.8      {d23}, [r0@64], r1
+    vst1.8      {d26}, [r8@64], r1
+    vst1.8      {d1}, [r0@64], r1
+    vst1.8      {d27}, [r8@64], r1
+    vst1.8      {d2}, [r0@64], r1
+    vst1.8      {d18}, [r8@64], r1
+    vst1.8      {d3}, [r0@64], r1
+    vst1.8      {d19}, [r8@64], r1
+    vst1.8      {d15}, [r0@64], r1
+
+v_end
+    vpop        {d8-d15}
+    pop         {r4-r8, pc}
+
+    ENDP        ; |vpx_lpf_vertical_16_neon|
+
+; void vpx_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16    blimit
+; d17    limit
+; d18    thresh
+; d0    p7
+; d1    p6
+; d2    p5
+; d3    p4
+; d4    p3
+; d5    p2
+; d6    p1
+; d7    p0
+; d8    q0
+; d9    q1
+; d10   q2
+; d11   q3
+; d12   q4
+; d13   q5
+; d14   q6
+; d15   q7
+|vpx_wide_mbfilter_neon| PROC
+    mov         r7, #0
+
+    ; filter_mask
+    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
+    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
+    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
+    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
+    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
+    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
+    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
+    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
+    vmax.u8     d19, d19, d20
+
+    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
+
+    vmax.u8     d19, d19, d23
+
+    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; abs () > limit
+    vcge.u8     d19, d17, d19
+
+    ; flatmask4
+    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
+    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
+    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
+    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
+    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
+    vmax.u8     d25, d25, d26
+    vmax.u8     d20, d20, d25
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+    vqadd.u8    d24, d24, d23              ; a = b + a
+
+    vmov.u8     d30, #1
+    vcge.u8     d24, d16, d24              ; (a > blimit * 2 + limit) * -1
+
+    vcge.u8     d20, d30, d20              ; flat
+
+    vand        d19, d19, d24              ; mask
+
+    ; hevmask
+    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
+    vorr        d21, d21, d22              ; hev
+
+    vand        d16, d20, d19              ; flat && mask
+    vmov        r5, r6, d16
+
+    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
+    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
+    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
+    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
+    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
+    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
+    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
+    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
+    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
+    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
+    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
+
+    vmax.u8     d26, d22, d23
+    vmax.u8     d27, d24, d25
+    vmax.u8     d23, d26, d27
+
+    vcge.u8     d18, d30, d23              ; flat2
+
+    vmov.u8     d22, #0x80
+
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #1                 ; Only do filter branch
+
+    vand        d17, d18, d16              ; flat2 && flat && mask
+    vmov        r5, r6, d17
+
+    ; mbfilter() function
+
+    ; filter() function
+    ; convert to signed
+    veor        d23, d8, d22               ; qs0
+    veor        d24, d7, d22               ; ps0
+    veor        d25, d6, d22               ; ps1
+    veor        d26, d9, d22               ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+    vand        d29, d29, d21              ; filter &= hev
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter1 >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d21              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d23, d23, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    tst         r7, #1
+    bxne        lr
+
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #2                 ; Only do mbfilter branch
+
+    ; mbfilter flat && mask branch
+    ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
+    ; and using vibt on the q's?
+    vmov.u8     d29, #2
+    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
+    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
+    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+    vaddl.u8    q10, d4, d5
+    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+    vaddl.u8    q14, d6, d9
+    vqrshrn.u16 d18, q15, #3               ; r_op2
+
+    vsub.i16    q15, q10
+    vaddl.u8    q10, d4, d6
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d7, d10
+    vqrshrn.u16 d19, q15, #3               ; r_op1
+
+    vsub.i16    q15, q10
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d8, d11
+    vqrshrn.u16 d20, q15, #3               ; r_op0
+
+    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
+    vsubw.u8    q15, d7                    ; oq0 -= p0
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d9, d11
+    vqrshrn.u16 d21, q15, #3               ; r_oq0
+
+    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
+    vsubw.u8    q15, d8                    ; oq1 -= q0
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d10, d11
+    vqrshrn.u16 d22, q15, #3               ; r_oq1
+
+    vsubw.u8    q15, d6                    ; oq2 = oq0 - p1
+    vsubw.u8    q15, d9                    ; oq2 -= q1
+    vadd.i16    q15, q14
+    vqrshrn.u16 d27, q15, #3               ; r_oq2
+
+    ; Filter does not set op2 or oq2, so use p2 and q2.
+    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
+    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
+    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
+    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
+    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
+    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
+
+    tst         r7, #2
+    bxne        lr
+
+    ; wide_mbfilter flat2 && flat && mask branch
+    vmov.u8     d16, #7
+    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
+    vaddl.u8    q12, d2, d3
+    vaddl.u8    q13, d4, d5
+    vaddl.u8    q14, d1, d6
+    vmlal.u8    q15, d0, d16               ; op6 += p7 * 3
+    vadd.i16    q12, q13
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vadd.i16    q15, q12
+    vaddl.u8    q12, d0, d1
+    vaddw.u8    q15, d1
+    vaddl.u8    q13, d0, d2
+    vadd.i16    q14, q15, q14
+    vqrshrn.u16 d16, q15, #4               ; w_op6
+
+    vsub.i16    q15, q14, q12
+    vaddl.u8    q14, d3, d10
+    vqrshrn.u16 d24, q15, #4               ; w_op5
+
+    vsub.i16    q15, q13
+    vaddl.u8    q13, d0, d3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vqrshrn.u16 d25, q15, #4               ; w_op4
+
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d0, d4
+    vsub.i16    q15, q13
+    vsub.i16    q14, q15, q14
+    vqrshrn.u16 d26, q15, #4               ; w_op3
+
+    vaddw.u8    q15, q14, d5               ; op2 += p2
+    vaddl.u8    q14, d0, d5
+    vaddw.u8    q15, d12                   ; op2 += q4
+    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
+    vqrshrn.u16 d27, q15, #4               ; w_op2
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d6
+    vaddw.u8    q15, d6                    ; op1 += p1
+    vaddw.u8    q15, d13                   ; op1 += q5
+    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
+    vqrshrn.u16 d18, q15, #4               ; w_op1
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d7
+    vaddw.u8    q15, d7                    ; op0 += p0
+    vaddw.u8    q15, d14                   ; op0 += q6
+    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
+    vqrshrn.u16 d19, q15, #4               ; w_op0
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d1, d8
+    vaddw.u8    q15, d8                    ; oq0 += q0
+    vaddw.u8    q15, d15                   ; oq0 += q7
+    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
+    vqrshrn.u16 d20, q15, #4               ; w_oq0
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vaddw.u8    q15, d9                    ; oq1 += q1
+    vaddl.u8    q4, d10, d15
+    vaddw.u8    q15, d15                   ; oq1 += q7
+    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
+    vqrshrn.u16 d21, q15, #4               ; w_oq1
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d3, d10
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d11, d15
+    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
+    vqrshrn.u16 d22, q15, #4               ; w_oq2
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d12, d15
+    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
+    vqrshrn.u16 d23, q15, #4               ; w_oq3
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d5, d12
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d13, d15
+    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
+    vqrshrn.u16 d1, q15, #4                ; w_oq4
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d6, d13
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d14, d15
+    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
+    vqrshrn.u16 d2, q15, #4                ; w_oq5
+
+    vsub.i16    q15, q14
+    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
+    vadd.i16    q15, q4
+    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
+    vqrshrn.u16 d3, q15, #4                ; w_oq6
+    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
+    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
+    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
+
+    bx          lr
+    ENDP        ; |vpx_wide_mbfilter_neon|
+
+    END
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
new file mode 100644
index 000000000..04c163ab0
--- /dev/null
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "aom/vpx_integer.h"
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+#if HAVE_NEON_ASM
+void vpx_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif  // HAVE_NEON_ASM
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
new file mode 100644
index 000000000..11f13befb
--- /dev/null
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -0,0 +1,224 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8x16_t vec_src_32,
+                        const uint8x16_t vec_src_48, const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+                             vget_low_u8(vec_ref_32));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+                             vget_high_u8(vec_ref_32));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+                             vget_low_u8(vec_ref_48));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+                             vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16, const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+}
+
+void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t *const ref[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 64; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t *const ref[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 32; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
+                &vec_sum_ref0_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
+                &vec_sum_ref1_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
+                &vec_sum_ref2_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
+                &vec_sum_ref3_hi);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t *const ref[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 16; ++i) {
+    const uint8x16_t vec_src = vld1q_u8(src);
+    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+    vec_sum_ref0_lo =
+        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
+    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref0));
+    vec_sum_ref1_lo =
+        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
+    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref1));
+    vec_sum_ref2_lo =
+        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
+    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref2));
+    vec_sum_ref3_lo =
+        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
+    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref3));
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
diff --git a/aom_dsp/arm/sad_media.asm b/aom_dsp/arm/sad_media.asm
new file mode 100644
index 000000000..aed1d3a22
--- /dev/null
+++ b/aom_dsp/arm/sad_media.asm
@@ -0,0 +1,95 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_sad16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+|vpx_sad16x16_media| PROC
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set dst pointer to next row
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set dst pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
new file mode 100644
index 000000000..19fa1093c
--- /dev/null
+++ b/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "aom/vpx_integer.h"
+
+unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
+                              unsigned char *ref_ptr, int ref_stride) {
+  uint8x8_t d0, d8;
+  uint16x8_t q12;
+  uint32x4_t q1;
+  uint64x2_t q3;
+  uint32x2_t d5;
+  int i;
+
+  d0 = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  d8 = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  q12 = vabdl_u8(d0, d8);
+
+  for (i = 0; i < 15; i++) {
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, d0, d8);
+  }
+
+  q1 = vpaddlq_u16(q12);
+  q3 = vpaddlq_u32(q1);
+  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                vreinterpret_u32_u64(vget_high_u64(q3)));
+
+  return vget_lane_u32(d5, 0);
+}
+
+unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
+                             unsigned char *ref_ptr, int ref_stride) {
+  uint8x8_t d0, d8;
+  uint16x8_t q12;
+  uint32x2_t d1;
+  uint64x1_t d3;
+  int i;
+
+  d0 = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  d8 = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  q12 = vabdl_u8(d0, d8);
+
+  for (i = 0; i < 3; i++) {
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, d0, d8);
+  }
+
+  d1 = vpaddl_u16(vget_low_u16(q12));
+  d3 = vpaddl_u32(d1);
+
+  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
+                              unsigned char *ref_ptr, int ref_stride) {
+  uint8x16_t q0, q4;
+  uint16x8_t q12, q13;
+  uint32x4_t q1;
+  uint64x2_t q3;
+  uint32x2_t d5;
+  int i;
+
+  q0 = vld1q_u8(src_ptr);
+  src_ptr += src_stride;
+  q4 = vld1q_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+  for (i = 0; i < 7; i++) {
+    q0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    q4 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+  }
+
+  q12 = vaddq_u16(q12, q13);
+  q1 = vpaddlq_u16(q12);
+  q3 = vpaddlq_u32(q1);
+  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                vreinterpret_u32_u64(vget_high_u64(q3)));
+
+  return vget_lane_u32(d5, 0);
+}
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+  const uint32x4_t a = vpaddlq_u16(vec_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+  for (i = 0; i < 64; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+                            vget_low_u8(vec_ref_32));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+                            vget_high_u8(vec_ref_32));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+                            vget_low_u8(vec_ref_48));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+                            vget_high_u8(vec_ref_48));
+  }
+  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+}
+
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+
+  for (i = 0; i < 32; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+  }
+  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+
+  for (i = 0; i < 16; ++i) {
+    const uint8x16_t vec_src = vld1q_u8(src);
+    const uint8x16_t vec_ref = vld1q_u8(ref);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum_lo =
+        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
+    vec_accum_hi =
+        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
+  }
+  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum = vdupq_n_u16(0);
+
+  for (i = 0; i < 8; ++i) {
+    const uint8x8_t vec_src = vld1_u8(src);
+    const uint8x8_t vec_ref = vld1_u8(ref);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
+  }
+  return horizontal_add_16x8(vec_accum);
+}
diff --git a/aom_dsp/arm/save_reg_neon.asm b/aom_dsp/arm/save_reg_neon.asm
new file mode 100644
index 000000000..c9ca10801
--- /dev/null
+++ b/aom_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_push_neon|
+    EXPORT  |vpx_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!
+    vst1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+|vpx_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!
+    vld1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+    END
+
diff --git a/aom_dsp/arm/subpel_variance_media.c b/aom_dsp/arm/subpel_variance_media.c
new file mode 100644
index 000000000..69b1b331a
--- /dev/null
+++ b/aom_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
+                                                      { 96, 32 }, { 80, 48 },
+                                                      { 64, 64 }, { 48, 80 },
+                                                      { 32, 96 }, { 16, 112 } };
+
+extern void vpx_filter_block2d_bil_first_pass_media(
+    const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
+    uint32_t height, uint32_t width, const int16_t *filter);
+
+extern void vpx_filter_block2d_bil_second_pass_media(
+    const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
+    uint32_t height, uint32_t width, const int16_t *filter);
+
+unsigned int vpx_sub_pixel_variance8x8_media(
+    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+  uint16_t first_pass[10 * 8];
+  uint8_t second_pass[8 * 8];
+  const int16_t *HFilter, *VFilter;
+
+  HFilter = bilinear_filters_media[xoffset];
+  VFilter = bilinear_filters_media[yoffset];
+
+  vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                          src_pixels_per_line, 9, 8, HFilter);
+  vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
+                                           VFilter);
+
+  return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
+                               sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_media(
+    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+  uint16_t first_pass[36 * 16];
+  uint8_t second_pass[20 * 16];
+  const int16_t *HFilter, *VFilter;
+  unsigned int var;
+
+  if (xoffset == 4 && yoffset == 0) {
+    var = vpx_variance_halfpixvar16x16_h_media(
+        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  } else if (xoffset == 0 && yoffset == 4) {
+    var = vpx_variance_halfpixvar16x16_v_media(
+        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  } else if (xoffset == 4 && yoffset == 4) {
+    var = vpx_variance_halfpixvar16x16_hv_media(
+        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  } else {
+    HFilter = bilinear_filters_media[xoffset];
+    VFilter = bilinear_filters_media[yoffset];
+
+    vpx_filter_block2d_bil_first_pass_media(
+        src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
+    vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
+                                             16, VFilter);
+
+    var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
+                                  sse);
+  }
+  return var;
+}
+#endif  // HAVE_MEDIA
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 000000000..caa3f4a74
--- /dev/null
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/vpx_integer.h"
+
+#include "aom_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      unsigned int output_width,
+                                      const uint8_t *filter) {
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; ++i) {
+    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+    const uint16x8_t a = vmull_u8(src_0, f0);
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+    vst1_u8(&output_ptr[0], out);
+    // Next row...
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                       uint8_t *output_ptr,
+                                       unsigned int src_pixels_per_line,
+                                       int pixel_step,
+                                       unsigned int output_height,
+                                       unsigned int output_width,
+                                       const uint8_t *filter) {
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 16) {
+      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+    }
+    // Next row...
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *dst, int dst_stride,
+                                            unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+
+  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
+                            bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
+                            bilinear_filters[yoffset]);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
+                                              int src_stride, int xoffset,
+                                              int yoffset, const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
+                             bilinear_filters[yoffset]);
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
+                                              int src_stride, int xoffset,
+                                              int yoffset, const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
+                             bilinear_filters[yoffset]);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
+                                              int src_stride, int xoffset,
+                                              int yoffset, const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
+                             bilinear_filters[yoffset]);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
diff --git a/aom_dsp/arm/subtract_neon.c b/aom_dsp/arm/subtract_neon.c
new file mode 100644
index 000000000..ab7157cab
--- /dev/null
+++ b/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "aom/vpx_integer.h"
+
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
+                             ptrdiff_t diff_stride, const uint8_t *src,
+                             ptrdiff_t src_stride, const uint8_t *pred,
+                             ptrdiff_t pred_stride) {
+  int r, c;
+
+  if (cols > 16) {
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; c += 32) {
+        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+        const uint16x8_t v_diff_lo_00 =
+            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
+        const uint16x8_t v_diff_hi_00 =
+            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
+        const uint16x8_t v_diff_lo_16 =
+            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
+        const uint16x8_t v_diff_hi_16 =
+            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
+        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+      }
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    }
+  } else if (cols > 8) {
+    for (r = 0; r < rows; ++r) {
+      const uint8x16_t v_src = vld1q_u8(&src[0]);
+      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+      const uint16x8_t v_diff_lo =
+          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
+      const uint16x8_t v_diff_hi =
+          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    }
+  } else if (cols > 4) {
+    for (r = 0; r < rows; ++r) {
+      const uint8x8_t v_src = vld1_u8(&src[0]);
+      const uint8x8_t v_pred = vld1_u8(&pred[0]);
+      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    }
+  } else {
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
+
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    }
+  }
+}
diff --git a/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm b/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
new file mode 100644
index 000000000..dab845a20
--- /dev/null
+++ b/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -0,0 +1,182 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance_halfpixvar16x16_h_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_h_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
new file mode 100644
index 000000000..01953b709
--- /dev/null
+++ b/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -0,0 +1,222 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance_halfpixvar16x16_hv_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_hv_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
diff --git a/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm b/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
new file mode 100644
index 000000000..0d17acb38
--- /dev/null
+++ b/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -0,0 +1,184 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance_halfpixvar16x16_v_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_v_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/aom_dsp/arm/variance_media.asm b/aom_dsp/arm/variance_media.asm
new file mode 100644
index 000000000..f7f9e14b0
--- /dev/null
+++ b/aom_dsp/arm/variance_media.asm
@@ -0,0 +1,358 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance16x16_media|
+    EXPORT  |vpx_variance8x8_media|
+    EXPORT  |vpx_mse16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance16x16_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop16x16
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop16x16
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance8x8_media| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop8x8
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop8x8
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vpx_variance16x16_media. In this function, sum is never used.
+;      So, we can remove this part of calculation.
+
+|vpx_mse16x16_media| PROC
+
+    push    {r4-r9, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loopmse
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loopmse
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
new file mode 100644
index 000000000..fcf6e4549
--- /dev/null
+++ b/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,399 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "aom/vpx_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+// w * h must be less than 2048 or local variable v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int w, int h, uint32_t *sse,
+                             int *sum) {
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo =
+          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+      v_sse_hi =
+          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                        int b_stride, unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+}
+
+void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+}
+
+unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  //  >> 6 = / 8 * 8
+}
+
+unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  //  >> 8 = / 16 * 16
+}
+
+unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
+}
+
+unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
+                   32, 32, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+                   b_stride, 64, 16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+                   b_stride, 64, 16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
+}
+
+unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int recon_stride, unsigned int *sse) {
+  int i;
+  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+  uint32x2_t d0u32, d10u32;
+  int64x1_t d0s64, d1s64;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  uint16x8_t q11u16, q12u16, q13u16, q14u16;
+  int32x4_t q8s32, q9s32, q10s32;
+  int64x2_t q0s64, q1s64, q5s64;
+
+  q8s32 = vdupq_n_s32(0);
+  q9s32 = vdupq_n_s32(0);
+  q10s32 = vdupq_n_s32(0);
+
+  for (i = 0; i < 4; i++) {
+    q0u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    q1u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    __builtin_prefetch(src_ptr);
+
+    q2u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    q3u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    __builtin_prefetch(ref_ptr);
+
+    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+  }
+
+  q10s32 = vaddq_s32(q10s32, q9s32);
+  q0s64 = vpaddlq_s32(q8s32);
+  q1s64 = vpaddlq_s32(q10s32);
+
+  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+  return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int recon_stride, unsigned int *sse) {
+  int i;
+  uint8x8_t d0u8, d2u8, d4u8, d6u8;
+  int16x4_t d22s16, d23s16, d24s16, d25s16;
+  uint32x2_t d0u32, d10u32;
+  int64x1_t d0s64, d1s64;
+  uint16x8_t q11u16, q12u16;
+  int32x4_t q8s32, q9s32, q10s32;
+  int64x2_t q0s64, q1s64, q5s64;
+
+  q8s32 = vdupq_n_s32(0);
+  q9s32 = vdupq_n_s32(0);
+  q10s32 = vdupq_n_s32(0);
+
+  for (i = 0; i < 8; i++) {
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    __builtin_prefetch(src_ptr);
+
+    d4u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d6u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    __builtin_prefetch(ref_ptr);
+
+    q11u16 = vsubl_u8(d0u8, d4u8);
+    q12u16 = vsubl_u8(d2u8, d6u8);
+
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+  }
+
+  q10s32 = vaddq_s32(q10s32, q9s32);
+  q0s64 = vpaddlq_s32(q8s32);
+  q1s64 = vpaddlq_s32(q10s32);
+
+  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+  return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
+                               const unsigned char *ref_ptr, int recon_stride,
+                               unsigned int *sse) {
+  int i;
+  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+  int64x1_t d0s64;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  int32x4_t q7s32, q8s32, q9s32, q10s32;
+  uint16x8_t q11u16, q12u16, q13u16, q14u16;
+  int64x2_t q1s64;
+
+  q7s32 = vdupq_n_s32(0);
+  q8s32 = vdupq_n_s32(0);
+  q9s32 = vdupq_n_s32(0);
+  q10s32 = vdupq_n_s32(0);
+
+  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
+    q0u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    q1u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    q2u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    q3u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+
+    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+  }
+
+  q7s32 = vaddq_s32(q7s32, q8s32);
+  q9s32 = vaddq_s32(q9s32, q10s32);
+  q10s32 = vaddq_s32(q7s32, q9s32);
+
+  q1s64 = vpaddlq_s32(q10s32);
+  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int recon_stride) {
+  int16x4_t d22s16, d24s16, d26s16, d28s16;
+  int64x1_t d0s64;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  int32x4_t q7s32, q8s32, q9s32, q10s32;
+  uint16x8_t q11u16, q12u16, q13u16, q14u16;
+  int64x2_t q1s64;
+
+  d0u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;
+  d4u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+  d1u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;
+  d5u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+  d2u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;
+  d6u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+  d3u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;
+  d7u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+
+  q11u16 = vsubl_u8(d0u8, d4u8);
+  q12u16 = vsubl_u8(d1u8, d5u8);
+  q13u16 = vsubl_u8(d2u8, d6u8);
+  q14u16 = vsubl_u8(d3u8, d7u8);
+
+  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+  q7s32 = vmull_s16(d22s16, d22s16);
+  q8s32 = vmull_s16(d24s16, d24s16);
+  q9s32 = vmull_s16(d26s16, d26s16);
+  q10s32 = vmull_s16(d28s16, d28s16);
+
+  q7s32 = vaddq_s32(q7s32, q8s32);
+  q9s32 = vaddq_s32(q9s32, q10s32);
+  q9s32 = vaddq_s32(q7s32, q9s32);
+
+  q1s64 = vpaddlq_s32(q9s32);
+  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
diff --git a/aom_dsp/arm/vpx_convolve8_avg_neon.c b/aom_dsp/arm/vpx_convolve8_avg_neon.c
new file mode 100644
index 000000000..c6b183100
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve8_avg_neon.c
@@ -0,0 +1,355 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+                                       int16x4_t dsrc2, int16x4_t dsrc3,
+                                       int16x4_t dsrc4, int16x4_t dsrc5,
+                                       int16x4_t dsrc6, int16x4_t dsrc7,
+                                       int16x8_t q0s16) {
+  int32x4_t qdst;
+  int16x4_t d0s16, d1s16;
+
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+
+  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+  return qdst;
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y,  // unused
+                                  int y_step_q4,            // unused
+                                  int w, int h) {
+  int width;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;                // adjust for taps
+  for (; h > 0; h -= 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 =
+        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    src += 7;
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));         // vmov 23 21
+    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
+      s = src;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(src + 64);
+
+      d0x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+      d1x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(src + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 =
+          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(src + 64 + src_stride * 2);
+
+      d = dst;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+                             d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+                             d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(src + 64 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      d = dst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+    src += src_stride * 4 - w - 7;
+    dst += dst_stride * 4 - w;
+  }
+  return;
+}
+
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x,  // unused
+                                 int x_step_q4,            // unused
+                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  uint8x16_t q1u8, q3u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+      d -= dst_stride * 3;
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+                             d22s16, d24s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+                             d24s16, d26s16, q0s16);
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/aom_dsp/arm/vpx_convolve8_avg_neon_asm.asm
new file mode 100644
index 000000000..e279d570f
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -0,0 +1,292 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vpx_convolve8_avg_horiz_neon|
+    EXPORT  |vpx_convolve8_avg_vert_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Multiply and accumulate by q0
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]
+    vmlal.s16 $dst, $src1, d0[1]
+    vmlal.s16 $dst, $src2, d0[2]
+    vmlal.s16 $dst, $src3, d0[3]
+    vmlal.s16 $dst, $src4, d1[0]
+    vmlal.s16 $dst, $src5, d1[1]
+    vmlal.s16 $dst, $src6, d1[2]
+    vmlal.s16 $dst, $src7, d1[3]
+    MEND
+
+; r0    const uint8_t *src
+; r1    int src_stride
+; r2    uint8_t *dst
+; r3    int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4           ; unused
+; sp[]int w
+; sp[]int h
+
+|vpx_convolve8_avg_horiz_neon| PROC
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; w loop counter
+
+vpx_convolve8_avg_loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+vpx_convolve8_avg_loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    ; slightly out of order load to match the existing data
+    vld1.u32        {d6[0]}, [r2], r3
+    vld1.u32        {d7[0]}, [r2], r3
+    vld1.u32        {d6[1]}, [r2], r3
+    vld1.u32        {d7[1]}, [r2], r3
+
+    sub             r2, r2, r3, lsl #2      ; reset for store
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    ; average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_avg_loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt vpx_convolve8_avg_loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vpx_convolve8_avg_vert_neon| PROC
+    push            {r4-r8, lr}
+
+    ; adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+vpx_convolve8_avg_loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+vpx_convolve8_avg_loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    vld1.u32        {d6[0]}, [r5@32], r3
+    vld1.u32        {d6[1]}, [r8@32], r3
+    vld1.u32        {d7[0]}, [r5@32], r3
+    vld1.u32        {d7[1]}, [r8@32], r3
+
+    pld             [r7]
+    pld             [r4]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    sub             r5, r5, r3, lsl #1      ; reset for store
+    sub             r8, r8, r3, lsl #1
+
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             vpx_convolve8_avg_loop_vert
+
+    ; outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_avg_loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
diff --git a/aom_dsp/arm/vpx_convolve8_neon.c b/aom_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 000000000..b84be935b
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,322 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+                                       int16x4_t dsrc2, int16x4_t dsrc3,
+                                       int16x4_t dsrc4, int16x4_t dsrc5,
+                                       int16x4_t dsrc6, int16x4_t dsrc7,
+                                       int16x8_t q0s16) {
+  int32x4_t qdst;
+  int16x4_t d0s16, d1s16;
+
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+
+  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+  return qdst;
+}
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y,  // unused
+                              int y_step_q4,            // unused
+                              int w, int h) {
+  int width;
+  const uint8_t *s, *psrc;
+  uint8_t *d, *pdst;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4, src += src_stride * 4,
+                dst += dst_stride * 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 =
+        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+    __builtin_prefetch(src + src_stride * 6);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w, psrc = src + 7, pdst = dst; width > 0;
+         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
+      s = psrc;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(psrc + 64);
+
+      d0x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+      d1x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(psrc + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 =
+          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+                             d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+                             d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+      d = pdst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+  }
+  return;
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,  // unused
+                             int x_step_q4,            // unused
+                             const int16_t *filter_y, int y_step_q4, int w,
+                             int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint32x2_t d2u32, d3u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+                             d22s16, d24s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+                             d24s16, d26s16, q0s16);
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/vpx_convolve8_neon_asm.asm b/aom_dsp/arm/vpx_convolve8_neon_asm.asm
new file mode 100644
index 000000000..2d0f2ae06
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -0,0 +1,270 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vpx_convolve8_horiz_neon|
+    EXPORT  |vpx_convolve8_vert_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Multiply and accumulate by q0
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]
+    vmlal.s16 $dst, $src1, d0[1]
+    vmlal.s16 $dst, $src2, d0[2]
+    vmlal.s16 $dst, $src3, d0[3]
+    vmlal.s16 $dst, $src4, d1[0]
+    vmlal.s16 $dst, $src5, d1[1]
+    vmlal.s16 $dst, $src6, d1[2]
+    vmlal.s16 $dst, $src7, d1[3]
+    MEND
+
+; r0    const uint8_t *src
+; r1    int src_stride
+; r2    uint8_t *dst
+; r3    int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4           ; unused
+; sp[]int w
+; sp[]int h
+
+|vpx_convolve8_horiz_neon| PROC
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; w loop counter
+
+vpx_convolve8_loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+vpx_convolve8_loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt vpx_convolve8_loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vpx_convolve8_vert_neon| PROC
+    push            {r4-r8, lr}
+
+    ; adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+vpx_convolve8_loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+vpx_convolve8_loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r7]
+    pld             [r4]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             vpx_convolve8_loop_vert
+
+    ; outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
diff --git a/aom_dsp/arm/vpx_convolve_avg_neon.c b/aom_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 000000000..a04d38467
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,144 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+
+void vpx_convolve_avg_neon(const uint8_t *src,    // r0
+                           ptrdiff_t src_stride,  // r1
+                           uint8_t *dst,          // r2
+                           ptrdiff_t dst_stride,  // r3
+                           const int16_t *filter_x, int filter_x_stride,
+                           const int16_t *filter_y, int filter_y_stride, int w,
+                           int h) {
+  uint8_t *d;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  uint32x2_t d0u32, d2u32;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
+  d = dst;
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
+
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/vpx_convolve_avg_neon_asm.asm b/aom_dsp/arm/vpx_convolve_avg_neon_asm.asm
new file mode 100644
index 000000000..97e6189fd
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_convolve_avg_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_avg_neon| PROC
+    push                {r4-r6, lr}
+    ldrd                r4, r5, [sp, #32]
+    mov                 r6, r2
+
+    cmp                 r4, #32
+    bgt                 avg64
+    beq                 avg32
+    cmp                 r4, #8
+    bgt                 avg16
+    beq                 avg8
+    b                   avg4
+
+avg64
+    sub                 lr, r1, #32
+    sub                 r4, r3, #32
+avg64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    pld                 [r2, r3]
+    vld1.8              {q8-q9},   [r6@128]!
+    vld1.8              {q10-q11}, [r6@128], r4
+    vrhadd.u8           q0, q0, q8
+    vrhadd.u8           q1, q1, q9
+    vrhadd.u8           q2, q2, q10
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r4
+    subs                r5, r5, #1
+    bgt                 avg64_h
+    pop                 {r4-r6, pc}
+
+avg32
+    vld1.8              {q0-q1}, [r0], r1
+    vld1.8              {q2-q3}, [r0], r1
+    vld1.8              {q8-q9},   [r6@128], r3
+    vld1.8              {q10-q11}, [r6@128], r3
+    pld                 [r0]
+    vrhadd.u8           q0, q0, q8
+    pld                 [r0, r1]
+    vrhadd.u8           q1, q1, q9
+    pld                 [r6]
+    vrhadd.u8           q2, q2, q10
+    pld                 [r6, r3]
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg32
+    pop                 {r4-r6, pc}
+
+avg16
+    vld1.8              {q0}, [r0], r1
+    vld1.8              {q1}, [r0], r1
+    vld1.8              {q2}, [r6@128], r3
+    vld1.8              {q3}, [r6@128], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q2
+    pld                 [r6]
+    pld                 [r6, r3]
+    vrhadd.u8           q1, q1, q3
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg16
+    pop                 {r4-r6, pc}
+
+avg8
+    vld1.8              {d0}, [r0], r1
+    vld1.8              {d1}, [r0], r1
+    vld1.8              {d2}, [r6@64], r3
+    vld1.8              {d3}, [r6@64], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q1
+    pld                 [r6]
+    pld                 [r6, r3]
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d1}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 avg8
+    pop                 {r4-r6, pc}
+
+avg4
+    vld1.32             {d0[0]}, [r0], r1
+    vld1.32             {d0[1]}, [r0], r1
+    vld1.32             {d2[0]}, [r6@32], r3
+    vld1.32             {d2[1]}, [r6@32], r3
+    vrhadd.u8           d0, d0, d2
+    vst1.32             {d0[0]}, [r2@32], r3
+    vst1.32             {d0[1]}, [r2@32], r3
+    subs                r5, r5, #2
+    bgt                 avg4
+    pop                 {r4-r6, pc}
+    ENDP
+
+    END
diff --git a/aom_dsp/arm/vpx_convolve_copy_neon.c b/aom_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 000000000..8000eb7e5
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,92 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "aom/vpx_integer.h"
+
+void vpx_convolve_copy_neon(const uint8_t *src,    // r0
+                            ptrdiff_t src_stride,  // r1
+                            uint8_t *dst,          // r2
+                            ptrdiff_t dst_stride,  // r3
+                            const int16_t *filter_x, int filter_x_stride,
+                            const int16_t *filter_y, int filter_y_stride, int w,
+                            int h) {
+  uint8x8_t d0u8, d2u8;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
+  if (w > 32) {  // copy64
+    for (; h > 0; h--) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // copy32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // copy16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // copy8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(src);
+      src += src_stride;
+
+      vst1_u8(dst, d0u8);
+      dst += dst_stride;
+      vst1_u8(dst, d2u8);
+      dst += dst_stride;
+    }
+  } else {  // copy4
+    for (; h > 0; h--) {
+      *(uint32_t *)dst = *(const uint32_t *)src;
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/aom_dsp/arm/vpx_convolve_copy_neon_asm.asm b/aom_dsp/arm/vpx_convolve_copy_neon_asm.asm
new file mode 100644
index 000000000..89164ad48
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_convolve_copy_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_copy_neon| PROC
+    push                {r4-r5, lr}
+    ldrd                r4, r5, [sp, #28]
+
+    cmp                 r4, #32
+    bgt                 copy64
+    beq                 copy32
+    cmp                 r4, #8
+    bgt                 copy16
+    beq                 copy8
+    b                   copy4
+
+copy64
+    sub                 lr, r1, #32
+    sub                 r3, r3, #32
+copy64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #1
+    bgt                 copy64_h
+    pop                 {r4-r5, pc}
+
+copy32
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q2-q3}, [r0], r1
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy32
+    pop                 {r4-r5, pc}
+
+copy16
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q1}, [r0], r1
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy16
+    pop                 {r4-r5, pc}
+
+copy8
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d2}, [r0], r1
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d2}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 copy8
+    pop                 {r4-r5, pc}
+
+copy4
+    ldr                 r12, [r0], r1
+    str                 r12, [r2], r3
+    subs                r5, r5, #1
+    bgt                 copy4
+    pop                 {r4-r5, pc}
+    ENDP
+
+    END
diff --git a/aom_dsp/arm/vpx_convolve_neon.c b/aom_dsp/arm/vpx_convolve_neon.c
new file mode 100644
index 000000000..297b64b21
--- /dev/null
+++ b/aom_dsp/arm/vpx_convolve_neon.c
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "aom_dsp/vpx_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const int16_t *filter_x,
+                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
+   */
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes in to
+   * the temp buffer which has lots of extra room and is subsequently discarded
+   * this is safe if somewhat less than ideal.
+   */
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+                           x_step_q4, filter_y, y_step_q4, w,
+                           intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+                          x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
+                            int h) {
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+  int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* This implementation has the same issues as above. In addition, we only want
+   * to average the values after both passes.
+   */
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+                           x_step_q4, filter_y, y_step_q4, w,
+                           intermediate_height);
+  vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+                              x_step_q4, filter_y, y_step_q4, w, h);
+}
author	Yaowu Xu <yaowu@google.com>	2016-08-22 16:08:15 -0700
committer	Yaowu Xu <yaowu@google.com>	2016-08-31 17:26:24 -0700
commit	c27fc14b02dd41f677290022b0de2d296f5e6dca (patch)
tree	4efe2eb5549de86dc09449e694be310236f867cf /aom_dsp/arm
parent	b1fb998c460e22d9b119d5b66b6609c4a544876f (diff)