File 0011-add-some-neon.patch of Package gst-plugins-base

From b6b043f71f89b77de441726b1dabed961291ddf1 Mon Sep 17 00:00:00 2001
From: Rob Clark <rob@ti.com>
Date: Thu, 8 Apr 2010 00:30:25 -0500
Subject: [PATCH 11/38] add some neon

---
 configure.ac           |    1 +
 gst/stride/Makefile.am |    1 +
 gst/stride/armv7.s     |  119 ++++++++++++++++++++++++++++++++++++++++++++++++
 gst/stride/convert.c   |   76 ++++++++++++++++--------------
 4 files changed, 162 insertions(+), 35 deletions(-)
 create mode 100644 gst/stride/armv7.s

diff --git a/configure.ac b/configure.ac
index 4a467d9..4c65068 100644
--- a/configure.ac
+++ b/configure.ac
@@ -58,6 +58,7 @@ dnl AS_LIBTOOL_TAGS
 
 AC_LIBTOOL_WIN32_DLL
 AM_PROG_LIBTOOL
+AM_PROG_AS
 
 dnl *** required versions of GStreamer stuff ***
 GST_REQ=0.10.34
diff --git a/gst/stride/Makefile.am b/gst/stride/Makefile.am
index 0b61d55..3b466de 100644
--- a/gst/stride/Makefile.am
+++ b/gst/stride/Makefile.am
@@ -3,6 +3,7 @@ plugin_LTLIBRARIES = libgststridetransform.la
 libgststridetransform_la_SOURCES = \
 	gststridetransform.c \
 	convert.c \
+	armv7.s \
 	plugin.c
 
 libgststridetransform_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS)
diff --git a/gst/stride/armv7.s b/gst/stride/armv7.s
new file mode 100644
index 0000000..ed636f7
--- /dev/null
+++ b/gst/stride/armv7.s
@@ -0,0 +1,119 @@
+@ GStreamer
+@
+@ Copyright (C) 2009 Texas Instruments, Inc - http://www.ti.com/
+@
+@ Description: NEON/VFP accelerated functions for armv7 architecture
+@  Created on: Nov 27, 2009
+@      Author: Rob Clark <rob@ti.com>
+@
+@ This library is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Library General Public
+@ License as published by the Free Software Foundation; either
+@ version 2 of the License, or (at your option) any later version.
+@
+@ This library is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+@ Library General Public License for more details.
+@
+@ You should have received a copy of the GNU Library General Public
+@ License along with this library; if not, write to the
+@ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+@ Boston, MA 02111-1307, USA.
+
+       .fpu neon
+       .text
+
+       .align
+       .global stride_copy_zip2
+       .type   stride_copy_zip2, %function
+@void
+@stride_copy_zip2 (guchar *new_buf, guchar *orig_buf1, guchar *orig_buf2, gint sz)
+@{
+@@@@ note: r0-r3, q0-3, and q8-q15 do not need to be preserved
+stride_copy_zip2:
+@ interleave remaining >= 16 bytes:
+       pld [r1, #64]
+       pld [r2, #64]
+       cmp r3, #16
+       blt stride_copy_zip2_2
+stride_copy_zip2_1:
+       vld1.8 {q8}, [r1]!
+       vld1.8 {q9}, [r2]!
+
+       vzip.8 q8, q9
+
+       pld [r1, #64]
+       vst1.8 {q8,q9}, [r0]!
+       pld [r2, #64]
+       sub r3, r3, #16
+
+       cmp r3, #16
+       bge stride_copy_zip2_1
+@ interleave remaining >= 8 bytes:
+stride_copy_zip2_2:
+       cmp r3, #8
+       blt stride_copy_zip2_3
+
+       vld1.8 {d16}, [r1]!
+       vld1.8 {d17}, [r2]!
+
+       vzip.8 d16, d17
+
+       vst1.8 {d16,d17}, [r0]!
+       sub r3, r3, #8
+
+@ interleave remaining < 8 bytes:
+stride_copy_zip2_3:
+@XXX
+       bx lr
+@}
+
+       .align
+       .global stride_copy
+       .type   stride_copy, %function
+@void
+@stride_copy (guchar *new_buf, guchar *orig_buf, gint sz)
+@{
+@@@@ note: r0-r3, q0-3, and q8-q15 do not need to be preserved
+stride_copy:
+@ copy remaining >= 64 bytes:
+       pld [r1, #64]
+       cmp r2, #64
+       blt stride_copy_2
+stride_copy_1:
+       vld1.8 {q8-q9},  [r1]!
+       sub r2, r2, #64
+       vld1.8 {q10-q11},[r1]!
+       vst1.8 {q8-q9},  [r0]!
+       pld [r1, #64]
+       cmp r2, #64
+       vst1.8 {q10-q11},[r0]!
+       bge stride_copy_1
+@ copy remaining >= 32 bytes:
+stride_copy_2:
+       cmp r2, #32
+       blt stride_copy_3
+       vld1.8 {q8-q9}, [r1]!
+       sub r2, r2, #32
+       vst1.8 {q8-q9}, [r0]!
+@ copy remaining >= 16 bytes:
+stride_copy_3:
+       cmp r2, #16
+       blt stride_copy_4
+       vld1.8 {q8}, [r1]!
+       sub r2, r2, #16
+       vst1.8 {q8}, [r0]!
+@ copy remaining >= 8 bytes:
+stride_copy_4:
+       cmp r2, #8
+       blt stride_copy_5
+       vld1.8 {d16}, [r1]!
+       sub r2, r2, #8
+       vst1.8 {d16}, [r0]!
+@ copy remaining < 8 bytes:
+stride_copy_5:
+@XXX
+       bx lr
+@}
+
diff --git a/gst/stride/convert.c b/gst/stride/convert.c
index 860f16c..a15063b 100644
--- a/gst/stride/convert.c
+++ b/gst/stride/convert.c
@@ -37,38 +37,43 @@ GST_DEBUG_CATEGORY_EXTERN (stridetransform_debug);
 #define GST_CAT_DEFAULT stridetransform_debug
 
 
+/* note: some parts of code support in-place transform.. some do not..  I'm
+ * not sure if zip/interleave functions could really support in-place copy..
+ * I need to think about this after having some sleep ;-)
+ */
+
+#define WEAK __attribute__((weak))
+
 /*
  * Conversion utilities:
  */
 
-static void
-memmove_demux (guchar *new_buf, guchar *orig_buf, gint sz, gint pxstride)
+WEAK void
+stride_copy_zip2 (guchar *new_buf, guchar *orig_buf1, guchar *orig_buf2, gint sz)
 {
-  if (new_buf > orig_buf) {
-    /* copy backwards */
-    new_buf += ((sz - 1) * pxstride);
-    orig_buf += sz - 1;
-    while(sz--) {
-      *new_buf = *orig_buf;
-      new_buf -= pxstride;
-      orig_buf--;
-    }
-  } else {
-    while(sz--) {
-      *new_buf = *orig_buf;
-      new_buf += pxstride;
-      orig_buf++;
-    }
+  while (sz--) {
+    *new_buf++ = *orig_buf1++;
+    *new_buf++ = *orig_buf2++;
   }
 }
 
+WEAK void
+stride_copy (guchar *new_buf, guchar *orig_buf, gint sz)
+{
+  memcpy (new_buf, orig_buf, sz);
+}
+
+
+/**
+ * move to strided buffer, interleaving two planes of identical dimensions
+ */
 static void
-stridemove_demux (guchar *new_buf, guchar *orig_buf, gint new_width, gint orig_width, gint height, gint pxstride)
+stridemove_zip2 (guchar *new_buf, guchar *orig_buf1, guchar *orig_buf2, gint new_width, gint orig_width, gint height)
 {
   int row;
 
-  GST_DEBUG ("new_buf=%p, orig_buf=%p, new_width=%d, orig_width=%d, height=%d",
-      new_buf, orig_buf, new_width, orig_width, height);
+  GST_DEBUG ("new_buf=%p, orig_buf1=%p, orig_buf2=%p, new_width=%d, orig_width=%d, height=%d",
+      new_buf, orig_buf1, orig_buf2, new_width, orig_width, height);
 
   /* if increasing the stride, work from bottom-up to avoid overwriting data
    * that has not been moved yet.. otherwise, work in the opposite order,
@@ -76,11 +81,19 @@ stridemove_demux (guchar *new_buf, guchar *orig_buf, gint new_width, gint orig_w
    */
   if (new_width > orig_width) {
     for (row=height-1; row>=0; row--) {
-      memmove_demux (new_buf+(new_width*row), orig_buf+(orig_width*row), orig_width, pxstride);
+      stride_copy_zip2 (
+          new_buf+(new_width*row),
+          orig_buf1+(orig_width*row),
+          orig_buf2+(orig_width*row),
+          orig_width);
     }
   } else {
     for (row=0; row<height; row++) {
-      memmove_demux (new_buf+(new_width*row), orig_buf+(orig_width*row), new_width, pxstride);
+      stride_copy_zip2 (
+          new_buf+(new_width*row),
+          orig_buf1+(orig_width*row),
+          orig_buf2+(orig_width*row),
+          new_width);
     }
   }
 }
@@ -106,11 +119,11 @@ stridemove (guchar *new_buf, guchar *orig_buf, gint new_width, gint orig_width,
    */
   if (new_width > orig_width) {
     for (row=height-1; row>=0; row--) {
-      memmove (new_buf+(new_width*row), orig_buf+(orig_width*row), orig_width);
+      stride_copy (new_buf+(new_width*row), orig_buf+(orig_width*row), orig_width);
     }
   } else {
     for (row=0; row<height; row++) {
-      memmove (new_buf+(new_width*row), orig_buf+(orig_width*row), new_width);
+      stride_copy (new_buf+(new_width*row), orig_buf+(orig_width*row), new_width);
     }
   }
 }
@@ -234,19 +247,12 @@ stridify_i420_nv12 (GstStrideTransform *self, guchar *strided, guchar *unstrided
 
   g_return_val_if_fail (stride >= width, GST_FLOW_ERROR);
 
-  /* note: if not an in-place conversion, then doing the U&V in one pass
-   * would be more efficient... but if it is an in-place conversion, I'd
-   * need to think about whether it is potential for the new UV plane to
-   * corrupt the V plane before it is done copying..
-   */
-  stridemove_demux (
-      strided + (height*stride) + 1,
-      unstrided + (int)(height*width*1.25),
-      stride, width/2, height/2, 2);                        /* move V */
-  stridemove_demux (
+  /* XXX widths/heights/strides that are not multiple of four??: */
+  stridemove_zip2 (
       strided + (height*stride),
       unstrided + (height*width),
-      stride, width/2, height/2, 2);                        /* move U */
+      unstrided + (int)(height*width*1.25),
+      stride, width/2, height/2);                           /* interleave U&V */
   stridemove (strided, unstrided, stride, width, height);   /* move Y */
 
   return GST_FLOW_OK;
-- 
1.7.5.4