@@ -0,0 +1,742 @@
+commit 844025d7ca809e961264f87184d098ac5804f9ce
+Author: Marko Kenttälä <marko.kenttala@jolla.com>
+Date: Thu Feb 28 15:57:43 2019 +0200
+
+ ARM vec optimisations from https://github.com/kaffeemonster/zlib
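+
+ The import adds an arm/ directory with a vectorised adler32
+ implementation and wires it into the generic adler32.c through
+ the MIN_WORK / adler32_vec hooks.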
+
+diff --git a/INDEX b/INDEX
+index 2ba0641..8b1a726 100644
+--- a/INDEX
++++ b/INDEX
+@@ -21,6 +21,7 @@ zlib.pc.cmakein zlib.pc template for cmake
+ zlib2ansi perl script to convert source files for C++ compilation
+
+ amiga/ makefiles for Amiga SAS C
++arm/ files for the ARM platform
+ as400/ makefiles for AS/400
+ doc/ documentation for formats and algorithms
+ msdos/ makefiles for MSDOS
+diff --git a/adler32.c b/adler32.c
+index d0be438..01d0417 100644
+--- a/adler32.c
++++ b/adler32.c
+@@ -59,6 +59,56 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
+ # define MOD63(a) a %= BASE
+ #endif
+
++#ifndef NO_ADLER32_VEC
++# if defined(__arm__)
++# include "arm/adler32.c"
++# endif
++#endif
++
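++/*
++ * MIN_WORK is roughly the input length below which the vector code
++ * is not expected to amortise its setup cost; the arm include above
++ * may define it, otherwise it defaults to 16.
++ */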
++#ifndef MIN_WORK
++# define MIN_WORK 16
++#endif
++
++/* ========================================================================= */
++#if MIN_WORK - 16 > 0
++# ifndef NO_ADLER32_GE16
++local noinline uLong adler32_ge16(adler, buf, len)
++ uLong adler;
++ const Bytef *buf;
++ uInt len;
++{
++ unsigned long sum2;
++ unsigned n;
++
++ /* split Adler-32 into component sums */
++ sum2 = (adler >> 16) & 0xffff;
++ adler &= 0xffff;
++ n = len / 16;
++ len %= 16;
++
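++ /* callers guarantee 16 <= len < MIN_WORK, so n >= 1 here */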
++ do {
++ DO16(buf); /* 16 sums unrolled */
++ buf += 16;
++ } while (--n);
++
++ /* handle trailer */
++ while (len--) {
++ adler += *buf++;
++ sum2 += adler;
++ }
++
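++ /* len < MIN_WORK here, so both sums stay far below 2^28 and
++ the cheaper MOD28 reduction is sufficient */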
++ MOD28(adler);
++ MOD28(sum2);
++
++ /* return recombined sums */
++ return adler | (sum2 << 16);
++}
++# endif
++# define COMMON_WORK 16
++#else
++# define COMMON_WORK MIN_WORK
++#endif
++
+ /* ========================================================================= */
+ uLong ZEXPORT adler32_z(adler, buf, len)
+ uLong adler;
+@@ -136,7 +186,17 @@ uLong ZEXPORT adler32(adler, buf, len)
+ const Bytef *buf;
+ uInt len;
+ {
++#if ! defined(NO_ADLER32_VEC) && defined(__arm__)
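++ /* dispatch on length: tiny buffers take the scalar path, buffers
++ below MIN_WORK the unrolled adler32_ge16, the rest the vector code */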
++ if (len < COMMON_WORK)
++ return adler32_z(adler, buf, len);
++#if MIN_WORK - 16 > 0
++ if (len < MIN_WORK)
++ return adler32_ge16(adler, buf, len);
++#endif
++ return adler32_vec(adler, buf, len);
++#else
+ return adler32_z(adler, buf, len);
++#endif
+ }
+
+ /* ========================================================================= */
+diff --git a/arm/adler32.c b/arm/adler32.c
+new file mode 100644
+index 0000000..e939d7f
+--- /dev/null
++++ b/arm/adler32.c
+@@ -0,0 +1,614 @@
++/*
++ * adler32.c -- compute the Adler-32 checksum of a data stream
++ * arm implementation
++ * Copyright (C) 1995-2007 Mark Adler
++ * Copyright (C) 2009-2011 Jan Seiffert
++ * For conditions of distribution and use, see copyright notice in zlib.h
++ */
++
++/* @(#) $Id$ */
++
++#if defined(__ARM_NEON__) && defined(__ARMEL__)
++/*
++ * Big endian NEON qwords are kind of broken:
++ * they are big endian within each dword, but the lo and hi
++ * dwords are the wrong way round (really??), creating a
++ * kind of PDP-11 middle endian.
++ *
++ * This is madness and unsupportable. For this reason GCC
++ * wants to disable qword endian specific patterns, and this
++ * implementation is little endian only.
++ */
++# include <arm_neon.h>
++
++# define SOVUCQ sizeof(uint8x16_t)
++# define SOVUC sizeof(uint8x8_t)
++/* without a 64 bit psadbw sum we cannot run arbitrarily long, but we can still go a little higher than NMAX between reductions (we are at 0xc) */
++# define VNMAX (8*NMAX)
++# define HAVE_ADLER32_VEC
++# define MIN_WORK 32
++
++/* ========================================================================= */
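++/*
++ * vextq_u8 takes its byte shift as a compile time immediate, so a
++ * runtime amount has to be dispatched through this switch.
++ */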
++local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount)
++{
++ switch(amount % SOVUCQ)
++ {
++ case 0: return a;
++ case 1: return vextq_u8(a, b, 1);
++ case 2: return vextq_u8(a, b, 2);
++ case 3: return vextq_u8(a, b, 3);
++ case 4: return vextq_u8(a, b, 4);
++ case 5: return vextq_u8(a, b, 5);
++ case 6: return vextq_u8(a, b, 6);
++ case 7: return vextq_u8(a, b, 7);
++ case 8: return vextq_u8(a, b, 8);
++ case 9: return vextq_u8(a, b, 9);
++ case 10: return vextq_u8(a, b, 10);
++ case 11: return vextq_u8(a, b, 11);
++ case 12: return vextq_u8(a, b, 12);
++ case 13: return vextq_u8(a, b, 13);
++ case 14: return vextq_u8(a, b, 14);
++ case 15: return vextq_u8(a, b, 15);
++ }
++ return b;
++}
++
++/* ========================================================================= */
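++/*
++ * Fold each 32 bit lane mod-congruent to BASE: 2^16 == 15 (mod 65521),
++ * so x == (x & 0xffff) + 15 * (x >> 16) (mod BASE).
++ * Example: x = 0x12345 gives 0x2345 + 15*1 = 0x2354, and indeed
++ * 0x12345 % 65521 == 0x2354.
++ */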
++local inline uint32x4_t vector_chop(uint32x4_t x)
++{
++ uint32x4_t y;
++
++ y = vshlq_n_u32(x, 16);
++ x = vshrq_n_u32(x, 16);
++ y = vshrq_n_u32(y, 16);
++ y = vsubq_u32(y, x);
++ x = vaddq_u32(y, vshlq_n_u32(x, 4));
++ return x;
++}
++
++/* ========================================================================= */
++local noinline uLong adler32_vec(adler, buf, len)
++ uLong adler;
++ const Bytef *buf;
++ uInt len;
++{
++ uint32x4_t v0_32 = (uint32x4_t){0,0,0,0};
++ uint8x16_t v0 = (uint8x16_t)v0_32;
++ uint8x16_t vord, vord_a;
++ uint32x4_t vs1, vs2;
++ uint32x2_t v_tsum;
++ uint8x16_t in16;
++ uint32_t s1, s2;
++ unsigned k;
++
++ s1 = adler & 0xffff;
++ s2 = (adler >> 16) & 0xffff;
++
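++ /* byte weights 16..1: one 16 byte block adds
++ 16*buf[0] + 15*buf[1] + ... + 1*buf[15] (plus 16*s1) to s2 */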
++ vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
++
++ if (likely(len >= 2*SOVUCQ)) {
++ unsigned f, n;
++
++ /*
++ * Process the misaligned head bytes first, so the main
++ * loop can use aligned 16 byte loads.
++ */
++ /* align buf hard down on a 16 byte boundary */
++ f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ);
++ n = SOVUCQ - f;