Search
SailfishOS Open Build Service
>
Projects
>
home:sledge
:
branches:nemo:devel:hw:ti:omap4:common
>
gst-plugins-base
> 0011-add-some-neon.patch
Log In
Username
Password
Cancel
Overview
Repositories
Revisions
Requests
Users
Advanced
Attributes
Meta
File 0011-add-some-neon.patch of Package gst-plugins-base
From b6b043f71f89b77de441726b1dabed961291ddf1 Mon Sep 17 00:00:00 2001 From: Rob Clark <rob@ti.com> Date: Thu, 8 Apr 2010 00:30:25 -0500 Subject: [PATCH 11/38] add some neon --- configure.ac | 1 + gst/stride/Makefile.am | 1 + gst/stride/armv7.s | 119 ++++++++++++++++++++++++++++++++++++++++++++++++ gst/stride/convert.c | 76 ++++++++++++++++-------------- 4 files changed, 162 insertions(+), 35 deletions(-) create mode 100644 gst/stride/armv7.s diff --git a/configure.ac b/configure.ac index 4a467d9..4c65068 100644 --- a/configure.ac +++ b/configure.ac @@ -58,6 +58,7 @@ dnl AS_LIBTOOL_TAGS AC_LIBTOOL_WIN32_DLL AM_PROG_LIBTOOL +AM_PROG_AS dnl *** required versions of GStreamer stuff *** GST_REQ=0.10.34 diff --git a/gst/stride/Makefile.am b/gst/stride/Makefile.am index 0b61d55..3b466de 100644 --- a/gst/stride/Makefile.am +++ b/gst/stride/Makefile.am @@ -3,6 +3,7 @@ plugin_LTLIBRARIES = libgststridetransform.la libgststridetransform_la_SOURCES = \ gststridetransform.c \ convert.c \ + armv7.s \ plugin.c libgststridetransform_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) diff --git a/gst/stride/armv7.s b/gst/stride/armv7.s new file mode 100644 index 0000000..ed636f7 --- /dev/null +++ b/gst/stride/armv7.s @@ -0,0 +1,119 @@ +@ GStreamer +@ +@ Copyright (C) 2009 Texas Instruments, Inc - http://www.ti.com/ +@ +@ Description: NEON/VFP accelerated functions for armv7 architecture +@ Created on: Nov 27, 2009 +@ Author: Rob Clark <rob@ti.com> +@ +@ This library is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Library General Public +@ License as published by the Free Software Foundation; either +@ version 2 of the License, or (at your option) any later version. +@ +@ This library is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Library General Public License for more details. +@ +@ You should have received a copy of the GNU Library General Public +@ License along with this library; if not, write to the +@ Free Software Foundation, Inc., 59 Temple Place - Suite 330, +@ Boston, MA 02111-1307, USA. + + .fpu neon + .text + + .align + .global stride_copy_zip2 + .type stride_copy_zip2, %function +@void +@stride_copy_zip2 (guchar *new_buf, guchar *orig_buf1, guchar *orig_buf2, gint sz) +@{ +@@@@ note: r0-r3, q0-3, and q8-q15 do not need to be preserved +stride_copy_zip2: +@ interleave remaining >= 16 bytes: + pld [r1, #64] + pld [r2, #64] + cmp r3, #16 + blt stride_copy_zip2_2 +stride_copy_zip2_1: + vld1.8 {q8}, [r1]! + vld1.8 {q9}, [r2]! + + vzip.8 q8, q9 + + pld [r1, #64] + vst1.8 {q8,q9}, [r0]! + pld [r2, #64] + sub r3, r3, #16 + + cmp r3, #16 + bge stride_copy_zip2_1 +@ interleave remaining >= 8 bytes: +stride_copy_zip2_2: + cmp r3, #8 + blt stride_copy_zip2_3 + + vld1.8 {d16}, [r1]! + vld1.8 {d17}, [r2]! + + vzip.8 d16, d17 + + vst1.8 {d16,d17}, [r0]! + sub r3, r3, #8 + +@ interleave remaining < 8 bytes: +stride_copy_zip2_3: +@XXX + bx lr +@} + + .align + .global stride_copy + .type stride_copy, %function +@void +@stride_copy (guchar *new_buf, guchar *orig_buf, gint sz) +@{ +@@@@ note: r0-r3, q0-3, and q8-q15 do not need to be preserved +stride_copy: +@ copy remaining >= 64 bytes: + pld [r1, #64] + cmp r2, #64 + blt stride_copy_2 +stride_copy_1: + vld1.8 {q8-q9}, [r1]! + sub r2, r2, #64 + vld1.8 {q10-q11},[r1]! + vst1.8 {q8-q9}, [r0]! + pld [r1, #64] + cmp r2, #64 + vst1.8 {q10-q11},[r0]! + bge stride_copy_1 +@ copy remaining >= 32 bytes: +stride_copy_2: + cmp r2, #32 + blt stride_copy_3 + vld1.8 {q8-q9}, [r1]! + sub r2, r2, #32 + vst1.8 {q8-q9}, [r0]! +@ copy remaining >= 16 bytes: +stride_copy_3: + cmp r2, #16 + blt stride_copy_4 + vld1.8 {q8}, [r1]! + sub r2, r2, #16 + vst1.8 {q8}, [r0]! +@ copy remaining >= 8 bytes: +stride_copy_4: + cmp r2, #8 + blt stride_copy_5 + vld1.8 {d16}, [r1]! + sub r2, r2, #8 + vst1.8 {d16}, [r0]! +@ copy remaining < 8 bytes: +stride_copy_5: +@XXX + bx lr +@} + diff --git a/gst/stride/convert.c b/gst/stride/convert.c index 860f16c..a15063b 100644 --- a/gst/stride/convert.c +++ b/gst/stride/convert.c @@ -37,38 +37,43 @@ GST_DEBUG_CATEGORY_EXTERN (stridetransform_debug); #define GST_CAT_DEFAULT stridetransform_debug +/* note: some parts of code support in-place transform.. some do not.. I'm + * not sure if zip/interleave functions could really support in-place copy.. + * I need to think about this after having some sleep ;-) + */ + +#define WEAK __attribute__((weak)) + /* * Conversion utilities: */ -static void -memmove_demux (guchar *new_buf, guchar *orig_buf, gint sz, gint pxstride) +WEAK void +stride_copy_zip2 (guchar *new_buf, guchar *orig_buf1, guchar *orig_buf2, gint sz) { - if (new_buf > orig_buf) { - /* copy backwards */ - new_buf += ((sz - 1) * pxstride); - orig_buf += sz - 1; - while(sz--) { - *new_buf = *orig_buf; - new_buf -= pxstride; - orig_buf--; - } - } else { - while(sz--) { - *new_buf = *orig_buf; - new_buf += pxstride; - orig_buf++; - } + while (sz--) { + *new_buf++ = *orig_buf1++; + *new_buf++ = *orig_buf2++; } } +WEAK void +stride_copy (guchar *new_buf, guchar *orig_buf, gint sz) +{ + memcpy (new_buf, orig_buf, sz); +} + + +/** + * move to strided buffer, interleaving two planes of identical dimensions + */ static void -stridemove_demux (guchar *new_buf, guchar *orig_buf, gint new_width, gint orig_width, gint height, gint pxstride) +stridemove_zip2 (guchar *new_buf, guchar *orig_buf1, guchar *orig_buf2, gint new_width, gint orig_width, gint height) { int row; - GST_DEBUG ("new_buf=%p, orig_buf=%p, new_width=%d, orig_width=%d, height=%d", - new_buf, orig_buf, new_width, orig_width, height); + GST_DEBUG ("new_buf=%p, orig_buf1=%p, orig_buf2=%p, new_width=%d, orig_width=%d, height=%d", + new_buf, orig_buf1, orig_buf2, new_width, orig_width, height); /* if increasing the stride, work from bottom-up to avoid overwriting data * that has not been moved yet.. otherwise, work in the opposite order, @@ -76,11 +81,19 @@ stridemove_demux (guchar *new_buf, guchar *orig_buf, gint new_width, gint orig_w */ if (new_width > orig_width) { for (row=height-1; row>=0; row--) { - memmove_demux (new_buf+(new_width*row), orig_buf+(orig_width*row), orig_width, pxstride); + stride_copy_zip2 ( + new_buf+(new_width*row), + orig_buf1+(orig_width*row), + orig_buf2+(orig_width*row), + orig_width); } } else { for (row=0; row<height; row++) { - memmove_demux (new_buf+(new_width*row), orig_buf+(orig_width*row), new_width, pxstride); + stride_copy_zip2 ( + new_buf+(new_width*row), + orig_buf1+(orig_width*row), + orig_buf2+(orig_width*row), + new_width); } } } @@ -106,11 +119,11 @@ stridemove (guchar *new_buf, guchar *orig_buf, gint new_width, gint orig_width, */ if (new_width > orig_width) { for (row=height-1; row>=0; row--) { - memmove (new_buf+(new_width*row), orig_buf+(orig_width*row), orig_width); + stride_copy (new_buf+(new_width*row), orig_buf+(orig_width*row), orig_width); } } else { for (row=0; row<height; row++) { - memmove (new_buf+(new_width*row), orig_buf+(orig_width*row), new_width); + stride_copy (new_buf+(new_width*row), orig_buf+(orig_width*row), new_width); } } } @@ -234,19 +247,12 @@ stridify_i420_nv12 (GstStrideTransform *self, guchar *strided, guchar *unstrided g_return_val_if_fail (stride >= width, GST_FLOW_ERROR); - /* note: if not an in-place conversion, then doing the U&V in one pass - * would be more efficient... but if it is an in-place conversion, I'd - * need to think about whether it is potential for the new UV plane to - * corrupt the V plane before it is done copying.. - */ - stridemove_demux ( - strided + (height*stride) + 1, - unstrided + (int)(height*width*1.25), - stride, width/2, height/2, 2); /* move V */ - stridemove_demux ( + /* XXX widths/heights/strides that are not multiple of four??: */ + stridemove_zip2 ( strided + (height*stride), unstrided + (height*width), - stride, width/2, height/2, 2); /* move U */ + unstrided + (int)(height*width*1.25), + stride, width/2, height/2); /* interleave U&V */ stridemove (strided, unstrided, stride, width, height); /* move Y */ return GST_FLOW_OK; -- 1.7.5.4