File _service:tar_git:0001-zlib-arm-vec.patch of Package zlib
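Below is the patch carried into the zlib package by the tar_git source service. It back-ports ARM vectorised Adler-32 code (a NEON path, an iWMMXt path, and a disabled ARMv6 inline-asm reference) from the kaffeemonster zlib fork named in the commit message. For orientation only, and not part of the patch, here is a minimal scalar sketch of the checksum all of those paths compute (per RFC 1950; the function name is mine, not zlib's):

/*
 * Illustrative scalar reference, NOT part of the patch: the Adler-32
 * recurrence from RFC 1950. zlib itself defers the expensive %= BASE
 * (see NMAX/MOD28); this naive version reduces after every byte for
 * clarity.
 */
#include <stddef.h>
#include <stdint.h>

#define BASE 65521u   /* largest prime smaller than 65536 */

uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len)
{
    uint32_t s1 = adler & 0xffff;          /* running sum of bytes (starts at 1) */
    uint32_t s2 = (adler >> 16) & 0xffff;  /* running sum of the s1 values       */

    while (len--) {
        s1 = (s1 + *buf++) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;                /* s2 in the high half, s1 in the low */
}

The vector paths in the patch keep the same two sums but accumulate them 4, 8 or 16 bytes at a time in SIMD lanes and defer the modulo; the reduction trick they use is sketched after the patch.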
commit 844025d7ca809e961264f87184d098ac5804f9ce
Author: Marko Kenttälä <marko.kenttala@jolla.com>
Date:   Thu Feb 28 15:57:43 2019 +0200

    ARM vec optimisations from https://github.com/kaffeemonster/zlib

diff --git a/INDEX b/INDEX
index 2ba0641..8b1a726 100644
--- a/INDEX
+++ b/INDEX
@@ -21,6 +21,7 @@ zlib.pc.cmakein zlib.pc template for cmake
 zlib2ansi       perl script to convert source files for C++ compilation
 
 amiga/          makefiles for Amiga SAS C
+arm/            files for the arm platform
 as400/          makefiles for AS/400
 doc/            documentation for formats and algorithms
 msdos/          makefiles for MSDOS
diff --git a/adler32.c b/adler32.c
index d0be438..01d0417 100644
--- a/adler32.c
+++ b/adler32.c
@@ -59,6 +59,56 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
 #  define MOD63(a) a %= BASE
 #endif
 
+#ifndef NO_ADLER32_VEC
+# if defined(__arm__)
+#  include "arm/adler32.c"
+# endif
+#endif
+
+#ifndef MIN_WORK
+# define MIN_WORK 16
+#endif
+
+/* ========================================================================= */
+#if MIN_WORK - 16 > 0
+# ifndef NO_ADLER32_GE16
+local noinline uLong adler32_ge16(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned long sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+    n = len / 16;
+    len %= 16;
+
+    do {
+        DO16(buf);          /* 16 sums unrolled */
+        buf += 16;
+    } while (--n);
+
+    /* handle trailer */
+    while (len--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+
+    MOD28(adler);
+    MOD28(sum2);
+
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+# endif
+# define COMMON_WORK 16
+#else
+# define COMMON_WORK MIN_WORK
+#endif
+
 /* ========================================================================= */
 uLong ZEXPORT adler32_z(adler, buf, len)
     uLong adler;
@@ -136,7 +186,17 @@ uLong ZEXPORT adler32(adler, buf, len)
     const Bytef *buf;
     uInt len;
 {
+#if ! defined(NO_ADLER32_VEC) && defined(__arm__)
+    if (len < COMMON_WORK)
+        return adler32_z(adler, buf, len);
+#if MIN_WORK - 16 > 0
+    if (len < MIN_WORK)
+        return adler32_ge16(adler, buf, len);
+#endif
+    return adler32_vec(adler, buf, len);
+#else
     return adler32_z(adler, buf, len);
+#endif
 }
 
 /* ========================================================================= */
diff --git a/arm/adler32.c b/arm/adler32.c
new file mode 100644
index 0000000..e939d7f
--- /dev/null
+++ b/arm/adler32.c
@@ -0,0 +1,614 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *   arm implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#if defined(__ARM_NEON__) && defined(__ARMEL__)
+/*
+ * Big endian NEON qwords are kind of broken.
+ * They are big endian within the dwords, but WRONG
+ * (really??) way round between lo and hi.
+ * Creating some kind of PDP11 middle endian.
+ *
+ * This is madness and unsupportable. For this reason
+ * GCC wants to disable qword endian specific patterns.
+ */
+# include <arm_neon.h>
+
+# define SOVUCQ sizeof(uint8x16_t)
+# define SOVUC sizeof(uint8x8_t)
+/* since we do not have the 64bit psadbw sum, we could still go a little higher (we are at 0xc) */
+# define VNMAX (8*NMAX)
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 32
+
+/* ========================================================================= */
+local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount)
+{
+    switch(amount % SOVUCQ)
+    {
+    case  0: return a;
+    case  1: return vextq_u8(a, b,  1);
+    case  2: return vextq_u8(a, b,  2);
+    case  3: return vextq_u8(a, b,  3);
+    case  4: return vextq_u8(a, b,  4);
+    case  5: return vextq_u8(a, b,  5);
+    case  6: return vextq_u8(a, b,  6);
+    case  7: return vextq_u8(a, b,  7);
+    case  8: return vextq_u8(a, b,  8);
+    case  9: return vextq_u8(a, b,  9);
+    case 10: return vextq_u8(a, b, 10);
+    case 11: return vextq_u8(a, b, 11);
+    case 12: return vextq_u8(a, b, 12);
+    case 13: return vextq_u8(a, b, 13);
+    case 14: return vextq_u8(a, b, 14);
+    case 15: return vextq_u8(a, b, 15);
+    }
+    return b;
+}
+
+/* ========================================================================= */
+local inline uint32x4_t vector_chop(uint32x4_t x)
+{
+    uint32x4_t y;
+
+    y = vshlq_n_u32(x, 16);
+    x = vshrq_n_u32(x, 16);
+    y = vshrq_n_u32(y, 16);
+    y = vsubq_u32(y, x);
+    x = vaddq_u32(y, vshlq_n_u32(x, 4));
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    uint32x4_t v0_32 = (uint32x4_t){0,0,0,0};
+    uint8x16_t v0 = (uint8x16_t)v0_32;
+    uint8x16_t vord, vord_a;
+    uint32x4_t vs1, vs2;
+    uint32x2_t v_tsum;
+    uint8x16_t in16;
+    uint32_t s1, s2;
+    unsigned k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
+
+    if (likely(len >= 2*SOVUCQ)) {
+        unsigned f, n;
+
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ);
+        n = SOVUCQ - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+        /*
+         * the accumulation of s1 for every round grows very fast
+         * (quadratic?), even if we accumulate in 4 dwords, more
+         * rounds means nonlinear growth.
+         * We already split it out of s2, normaly it would be in
+         * s2 times 16... and even grow faster.
+         * Thanks to this split and vector reduction, we can stay
+         * longer in the loops. But we have to prepare for the worst
+         * (all 0xff), only do 6 times the work.
+         * (we could prop. stay a little longer since we have 4 sums,
+         * not 2 like on x86).
+         */
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+        /* insert scalar start somewhere */
+        vs1 = vsetq_lane_u32(s1, vs1, 0);
+        vs2 = vsetq_lane_u32(s2, vs2, 0);
+
+        /* get input data */
+        in16 = *(const uint8x16_t *)buf;
+        /* mask out excess data */
+        in16 = neon_simple_alignq(in16, v0, f);
+        vord_a = neon_simple_alignq(vord, v0, f);
+        /* pairwise add bytes and long, pairwise add word long acc */
+        vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+        /* apply order, add words, pairwise add word long acc */
+        vs2 = vpadalq_u16(vs2,
+            vmlal_u8(
+                vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)),
+                vget_high_u8(in16), vget_high_u8(vord_a)
+            )
+        );
+
+        buf += SOVUCQ;
+        k -= n;
+
+        if (likely(k >= SOVUCQ)) do {
+            uint32x4_t vs1_r = v0_32;
+            do {
+                uint16x8_t vs2_lo = (uint16x8_t)v0_32, vs2_hi = (uint16x8_t)v0_32;
+                unsigned j;
+
+                j = (k/16) > 16 ? 16 : k/16;
+                k -= j * 16;
+                do {
+                    /* GCC does not create the most pretty inner loop,
+                     * with extra moves and stupid scheduling, but
+                     * i am not in the mood for inline ASM, keep it
+                     * compatible.
+                     */
+                    /* get input data */
+                    in16 = *(const uint8x16_t *)buf;
+                    buf += SOVUCQ;
+
+                    /* add vs1 for this round */
+                    vs1_r = vaddq_u32(vs1_r, vs1);
+
+                    /* pairwise add bytes and long, pairwise add word long acc */
+                    vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+                    /* apply order, word long and acc */
+                    vs2_lo = vmlal_u8(vs2_lo, vget_low_u8(in16), vget_low_u8(vord));
+                    vs2_hi = vmlal_u8(vs2_hi, vget_high_u8(in16), vget_high_u8(vord));
+                } while(--j);
+                /* pair wise add long and acc */
+                vs2 = vpadalq_u16(vs2, vs2_lo);
+                vs2 = vpadalq_u16(vs2, vs2_hi);
+            } while (k >= SOVUCQ);
+            /* chop vs1 round sum before multiplying by 16 */
+            vs1_r = vector_chop(vs1_r);
+            /* add vs1 for this round (16 times) */
+            /* they have shift right and accummulate, where is shift left and acc?? */
+            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4));
+            /* chop both vectors to something within 16 bit */
+            vs2 = vector_chop(vs2);
+            vs1 = vector_chop(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned) len : VNMAX;
+            len -= k;
+        } while (likely(k >= SOVUCQ));
+
+        if (likely(k)) {
+            /*
+             * handle trailer
+             */
+            f = SOVUCQ - k;
+            /* add k times vs1 for this trailer */
+            vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k));
+
+            /* get input data */
+            in16 = *(const uint8x16_t *)buf;
+            /* mask out bad data */
+            in16 = neon_simple_alignq(v0, in16, k);
+
+            /* pairwise add bytes and long, pairwise add word long acc */
+            vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+            /* apply order, add words, pairwise add word long acc */
+            vs2 = vpadalq_u16(vs2,
+                vmlal_u8(
+                    vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                    vget_high_u8(in16), vget_high_u8(vord)
+                )
+            );
+
+            buf += k;
+            k -= k;
+        }
+
+        /* add horizontal */
+        v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s1 = vget_lane_u32(v_tsum, 0);
+        v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s2 = vget_lane_u32(v_tsum, 0);
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    MOD28(s1);
+    MOD28(s2);
+
+    return (s2 << 16) | s1;
+}
+
+#elif defined(__IWMMXT__)
+# ifndef __GNUC__
+/* GCC doesn't take it's own intrinsic header and ICEs if forced to */
+#  include <mmintrin.h>
+# else
+typedef unsigned long long __m64;
+
+// TODO: older gcc may need U constrain instead of y?
+static inline __m64 _mm_setzero_si64(void)
+{
+    __m64 r;
+# if 0
+    asm ("wzero %0" : "=y" (r));
+# else
+    r = 0;
+# endif
+    return r;
+}
+/* there is slli/srli and we want to use it, but it's iWMMXt-2 */
+static inline __m64 _mm_sll_pi32(__m64 a, __m64 c)
+{
+    asm ("wsllw %0, %1, %2" : "=y" (a) : "y" (a), "y" (c));
+    return a;
+}
+static inline __m64 _mm_srl_pi32(__m64 a, __m64 c)
+{
+    asm ("wsrlw %0, %1, %2" : "=y" (a) : "y" (a), "y" (c));
+    return a;
+}
+static inline __m64 _mm_sub_pi32(__m64 a, __m64 b)
+{
+    asm ("wsubw %0, %1, %2" : "=y" (a) : "y" (a), "y" (b));
+    return a;
+}
+static inline __m64 _mm_add_pi16(__m64 a, __m64 b)
+{
+    asm ("waddh %0, %1, %2" : "=y" (a) : "y" (a), "y" (b));
+    return a;
+}
+static inline __m64 _mm_add_pi32(__m64 a, __m64 b)
+{
+    asm ("waddw %0, %1, %2" : "=y" (a) : "y" (a), "y" (b));
+    return a;
+}
+static inline __m64 _mm_sada_pu8(__m64 acc, __m64 a, __m64 b)
+{
+    asm ("wsadb %0, %1, %2" : "=y" (acc) : "y" (a), "y" (b), "0" (acc));
+    return acc;
+}
+static inline __m64 _mm_madd_pu16(__m64 a, __m64 b)
+{
+    asm ("wmaddu %0, %1, %2" : "=y" (a) : "y" (a), "y" (b));
+    return a;
+}
+static inline __m64 _mm_mac_pu16(__m64 acc, __m64 a, __m64 b)
+{
+    asm ("wmacu %0, %1, %2" : "=y" (acc) : "y" (a), "y" (b), "0" (acc));
+    return acc;
+}
+static inline __m64 _mm_unpackel_pu8(__m64 a)
+{
+    asm ("wunpckelub %0, %1" : "=y" (a) : "y" (a));
+    return a;
+}
+static inline __m64 _mm_unpackeh_pu8(__m64 a)
+{
+    asm ("wunpckehub %0, %1" : "=y" (a) : "y" (a));
+    return a;
+}
+static inline __m64 _mm_shuffle_pi16(__m64 a, const int m)
+{
+    asm ("wshufh %0, %1, %2" : "=y" (a) : "y" (a), "i" (m));
+    return a;
+}
+static inline unsigned int _mm_extract_pu32(__m64 a, const int m)
+{
+    unsigned int r;
+    asm ("textrmuw %0, %1, %2" : "=r" (r) : "y" (a), "i" (m));
+    return r;
+}
+static inline __m64 _mm_insert_pi32(__m64 a, unsigned int b, const int m)
+{
+    asm ("tinsrw %0, %1, %2" : "=y" (a) : "r" (b), "i" (m), "0" (a));
+    return a;
+}
+static inline __m64 _mm_align_si64(__m64 a, __m64 b, int c)
+{
+    asm ("walignr%U3 %0, %1, %2" : "=y" (a) : "y" (a), "y" (b), "z" (c));
+    return a;
+}
+static inline __m64 _mm_set_pi16(short a, short b, short c, short d)
+{
+    __m64 r = (unsigned long long)d;
+    r |= ((unsigned long long)c) << 16;
+    r |= ((unsigned long long)b) << 32;
+    r |= ((unsigned long long)a) << 48;
+    return r;
+}
+# endif
+
+// TODO: we could go over NMAX, since we have split the vs2 sum
+/* but we shuffle vs1_r only every 2056 byte, so we can not go full */
+# define VNMAX (3*NMAX)
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 32
+# define SOV8 (sizeof(__m64))
+
+/* ========================================================================= */
+local inline __m64 vector_chop(__m64 x)
+{
+    static const __m64 four = 4;
+    static const __m64 sixten = 16;
+    __m64 y = _mm_sll_pi32(x, sixten);
+    x = _mm_srl_pi32(x, sixten);
+    y = _mm_srl_pi32(y, sixten);
+    y = _mm_sub_pi32(y, x);
+    x = _mm_add_pi32(y, _mm_sll_pi32(x, four));
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    if (likely(len >= 4 * SOV8)) {
+        static const __m64 three = 3;
+        __m64 vs1, vs2;
+        __m64 vzero;
+        __m64 vorder_l, vorder_h;
+        unsigned int f, n;
+
+        vzero = _mm_setzero_si64();
+
+        /* align hard down */
+        f = (unsigned int) ALIGN_DOWN_DIFF(buf, SOV8);
+        buf = (const Bytef *)ALIGN_DOWN(buf, SOV8);
+        n = SOV8 - f;
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        k = len < VNMAX ? len : VNMAX;
+        len -= k;
+
+        /* insert scalar start */
+        vs1 = _mm_insert_pi32(vzero, s1, 0);
+        vs2 = _mm_insert_pi32(vzero, s2, 0);
+
+// TODO: byte order?
+        if (host_is_bigendian()) {
+            vorder_l = _mm_set_pi16(4, 3, 2, 1);
+            vorder_h = _mm_set_pi16(8, 7, 6, 5);
+        } else {
+            vorder_l = _mm_set_pi16(5, 6, 7, 8);
+            vorder_h = _mm_set_pi16(1, 2, 3, 4);
+        }
+
+        {
+            __m64 in = *(const __m64 *)buf;
+
+            /* mask excess info out */
+            if (host_is_bigendian()) {
+                in = _mm_align_si64(vzero, in, n);
+                in = _mm_align_si64(in, vzero, f);
+            } else {
+                in = _mm_align_si64(in, vzero, f);
+                in = _mm_align_si64(vzero, in, n);
+            }
+
+            /* add horizontal and acc */
+            vs1 = _mm_sada_pu8(vs1, in, vzero);
+
+            /* widen bytes to words, apply order and acc */
+            vs2 = _mm_mac_pu16(vs2, _mm_unpackel_pu8(in), vorder_l);
+            vs2 = _mm_mac_pu16(vs2, _mm_unpackeh_pu8(in), vorder_h);
+        }
+
+        buf += SOV8;
+        k -= n;
+
+        do {
+            __m64 vs1_r = vzero;
+
+            do {
+                __m64 vs2_l = vzero, vs2_h = vzero;
+                unsigned int j;
+
+                j = k >= (257 * SOV8) ? 257 * SOV8 : k;
+                j /= SOV8;
+                k -= j * SOV8;
+                do {
+                    /* get input data */
+                    __m64 in = *(const __m64 *)buf;
+                    buf += SOV8;
+
+                    /* add vs1 for this round */
+                    vs1_r = _mm_add_pi32(vs1_r, vs1);
+
+                    /* add horizontal and acc */
+// TODO: how does wsad really work?
+                    /*
+                     * the Intel iwmmxt 1 & 2 manual says the wsad instruction
+                     * always zeros the upper word (32 in the arm context),
+                     * and then adds all sad into the lower word (again 32
+                     * bit). If the z version is choosen, the lower word is
+                     * also zeroed before, otherwise we get an acc.
+                     *
+                     * Visual studio only knows the sada intrinsic to reflect
+                     * that, but no description, no prototype.
+                     *
+                     * But there is no sada intrinsic in the Intel manual.
+                     * The Intel iwmmxt-1 manual only knows sad & sadz, two
+                     * operands, instead the acc is done with the lvalue
+                     * (which only really works with spec. compiler builtins).
+                     * GCC follows the intel manual (but does gcc manages to
+                     * use the lvalue?).
+                     * To make matters worse the description for the _mm_sad_pu8
+                     * intrinsic says it clears the upper _3_ fields, and only
+                     * acc in the lowest, so only working in 16 Bit.
+                     * So who is wrong?
+                     *
+                     * If this is different between 1 & 2 we are screwed, esp.
+                     * since i can not find a preprocessor define if 1 or 2.
+                     */
+                    vs1 = _mm_sada_pu8(vs1, in, vzero);
+
+                    /* widen bytes to words and acc */
+                    vs2_l = _mm_add_pi16(vs2_l, _mm_unpackel_pu8(in));
+                    vs2_h = _mm_add_pi16(vs2_h, _mm_unpackeh_pu8(in));
+                } while (--j);
+                /* shake and roll vs1_r, so both 32 bit sums get some input */
+                vs1_r = _mm_shuffle_pi16(vs1_r, 0x4e);
+                /* apply order and add to 32 bit */
+                vs2_l = _mm_madd_pu16(vs2_l, vorder_l);
+                vs2_h = _mm_madd_pu16(vs2_h, vorder_h);
+                /* acc */
+                vs2 = _mm_add_pi32(vs2, vs2_l);
+                vs2 = _mm_add_pi32(vs2, vs2_h);
+            } while (k >= SOV8);
+            /* chop vs1 round sum before multiplying by 8 */
+            vs1_r = vector_chop(vs1_r);
+            /* add vs1 for this round (8 times) */
+            vs2 = _mm_add_pi32(vs2, _mm_sll_pi32(vs1_r, three));
+            /* chop both sums to something within 16 bit */
+            vs2 = vector_chop(vs2);
+            vs1 = vector_chop(vs1);
+            len += k;
+            k = len < VNMAX ? len : VNMAX;
+            len -= k;
+        } while (likely(k >= SOV8));
+        len += k;
+        vs1 = _mm_add_pi32(vs1, _mm_shuffle_pi16(vs1, 0x4e));
+        vs2 = _mm_add_pi32(vs2, _mm_shuffle_pi16(vs2, 0x4e));
+        s1 = _mm_extract_pu32(vs1, 0);
+        s2 = _mm_extract_pu32(vs2, 0);
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    /* at this point we should have not so big s1 & s2 */
+    MOD28(s1);
+    MOD28(s2);
+
+    return (s2 << 16) | s1;
+}
+
+/* inline asm, so only on GCC (or compatible) && ARM v6 or better */
+#elif 0 && defined(__GNUC__) && ( \
+        defined(__thumb2__) && ( \
+            !defined(__ARM_ARCH_7__) && !defined(__ARM_ARCH_7M__) \
+        ) || ( \
+        !defined(__thumb__) && ( \
+            defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+            defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6ZK__) || \
+            defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) \
+        )) \
+    )
+/* This code is disabled, since it is not faster, only for reference.
+ * We are at speedup: 0.952830
+ * Again counting instructions is futile, 5 instructions per 4 bytes
+ * against at least 3 per byte (loop overhead excluded) is no win.
+ * And split sums also does not save us.
+ */
+# define SOU32 (sizeof(unsigned int))
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 16
+// TODO: maybe 2*NMAX is possible, but that's very thin
+/* this way we are at 0xda */
+# define VNMAX (NMAX+((NMAX*9)/10))
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    k = ALIGN_DIFF(buf, SOU32);
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+
+    if (likely(len >= 4 * SOU32)) {
+        unsigned int vs1 = s1, vs2 = s2;
+        unsigned int order_lo, order_hi;
+
+        if (host_is_bigendian()) {
+            order_lo = 0x00030001;
+            order_hi = 0x00040002;
+        } else {
+            order_lo = 0x00020004;
+            order_hi = 0x00010003;
+        }
+        k = len < VNMAX ? len : VNMAX;
+        len -= k;
+
+        do {
+            unsigned int vs1_r = 0;
+            do {
+                unsigned int j;
+                unsigned int vs2_lo = 0, vs2_hi = 0;
+
+                j = (k/4) >= 128 ? 128 : (k/4);
+                k -= j * 4;
+                do {
+                    /* get input data */
+                    unsigned int in = *(const unsigned int *)buf;
+                    buf += SOU32;
+                    /* add vs1 for this round */
+                    vs1_r += vs1;
+                    /* add horizontal and acc */
+                    asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1));
+                    /* widen bytes to words and acc */
+                    asm ("uxtab16 %0, %1, %2" : "=r" (vs2_lo) : "r" (vs2_lo), "r" (in));
+                    asm ("uxtab16 %0, %1, %2, ror #8" : "=r" (vs2_hi) : "r" (vs2_hi), "r" (in));
+                } while (--j);
+                /* aply order and acc */
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (vs2_lo) , "r" (order_lo), "r" (vs2));
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (vs2_hi) , "r" (order_hi), "r" (vs2));
+            } while (k >= SOU32);
+            /* chop vs1 round sum before multiplying by 4 */
+            CHOP(vs1_r);
+            /* add vs1 for this round (4 times) */
+            vs2 += vs1_r * 4;
+            /* chop both sums */
+            CHOP(vs2);
+            CHOP(vs1);
+            len += k;
+            k = len < VNMAX ? len : VNMAX;
+            len -= k;
+        } while (likely(k >= SOU32));
+        len += k;
+        s1 = vs1;
+        s2 = vs2;
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    /* at this point we should not have so big s1 & s2 */
+    MOD28(s1);
+    MOD28(s2);
+
+    return (s2 << 16) | s1;
+}
+#endif
diff --git a/zutil.h b/zutil.h
index b079ea6..9267fd5 100644
--- a/zutil.h
+++ b/zutil.h
@@ -19,6 +19,20 @@
 #  define ZLIB_INTERNAL
 #endif
 
+#ifndef noinline
+# define noinline __attribute__((__noinline__))
+#endif
+
+#ifndef likely
+# define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+#define ALIGN_DOWN(x, n)      (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))
+
 #include "zlib.h"
 
 #if defined(STDC) && !defined(Z_SOLO)
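A note on the reduction trick used above: both vector_chop() helpers in the patch (the NEON and the iWMMXt one) implement, per SIMD lane, the same scalar identity. Because 65536 mod 65521 = 15, the high 16 bits of a 32-bit partial sum can be folded back into the low half without a division, and the real modulo (MOD28 in the patch) only runs once at the end. A standalone scalar sketch of that identity, with names chosen here for illustration and not taken from the patch:

/*
 * Scalar model of the patch's vector_chop(): fold the high 16 bits of a
 * partial sum back into the low 16 bits. With x = hi*2^16 + lo and
 * 2^16 ≡ 15 (mod 65521), x ≡ lo + 15*hi (mod 65521). The code mirrors
 * the vector version: (lo - hi) + (hi << 4) equals lo + 15*hi in 32-bit
 * arithmetic, so the result stays congruent to x while becoming small
 * enough that the real modulo can be deferred.
 */
#include <assert.h>
#include <stdint.h>

#define BASE 65521u

static uint32_t chop(uint32_t x)
{
    uint32_t lo = x & 0xffffu;
    uint32_t hi = x >> 16;
    return (lo - hi) + (hi << 4);   /* == lo + 15*hi modulo 2^32 */
}

int main(void)
{
    uint32_t x = 0xdeadbeefu;
    assert(chop(x) % BASE == x % BASE);   /* congruence is preserved */
    return 0;
}

The VNMAX constants (8*NMAX for NEON, 3*NMAX for iWMMXt, NMAX+(NMAX*9)/10 for the disabled ARMv6 variant) cap how many bytes each outer round may accumulate before this chop runs, playing the role NMAX plays in scalar zlib.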