Diffstat (limited to '3rdparty/openpgm-svn-r1135/pgm/checksum.c')
-rw-r--r--  3rdparty/openpgm-svn-r1135/pgm/checksum.c  941
1 file changed, 941 insertions, 0 deletions
diff --git a/3rdparty/openpgm-svn-r1135/pgm/checksum.c b/3rdparty/openpgm-svn-r1135/pgm/checksum.c
new file mode 100644
index 0000000..5e367ea
--- /dev/null
+++ b/3rdparty/openpgm-svn-r1135/pgm/checksum.c
@@ -0,0 +1,941 @@
+/* vim:ts=8:sts=8:sw=4:noai:noexpandtab
+ *
+ * PGM checksum routines
+ *
+ * Copyright (c) 2006-2010 Miru Limited.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <impl/framework.h>
+
+
+/* locals */
+
+static inline uint16_t do_csum (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_8bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_16bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_32bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_64bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+#if defined(__amd64) || defined(__x86_64__)
+static uint16_t do_csum_vector (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+#endif
+
+
+/* endian-independent checksum routine
+ */
+
+static
+uint16_t
+do_csum_8bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ uint16_t src;
+ const uint8_t* buf;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ while (len > 1) {
+/* first byte as most significant */
+ src = (*buf++) << 8;
+/* second byte as least significant */
+ src |= (*buf++);
+ acc += src;
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ src = (*buf) << 8;
+ acc += src;
+ }
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ return htons ((uint16_t)acc);
+}
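+
+/* expository example: the two folds above reduce the 32-bit accumulator
+ * to 16 bits while preserving the ones' complement sum.  With
+ * acc = 0x0002fffd:
+ *
+ *	acc = (acc >> 16) + (acc & 0xffff);	// 0x0002 + 0xfffd = 0x0000ffff
+ *	acc += (acc >> 16);			// 0xffff + 0x0000 = 0x0000ffff
+ *
+ * the second step re-adds any carry the first fold itself produced,
+ * e.g. acc = 0x0003fffd folds to 0x10000, then to 0x10001, which the
+ * final 16-bit truncation returns as the end-around result 0x0001.
+ */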
+
+static
+uint16_t
+do_csumcpy_8bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint_fast16_t val16;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ while (len > 1) {
+/* first byte as most significant */
+ val16 = (*dstbuf++ = *srcbuf++) << 8;
+/* second byte as least significant */
+ val16 |= (*dstbuf++ = *srcbuf++);
+ acc += val16;
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ val16 = (*dstbuf = *srcbuf) << 8;
+ acc += val16;
+ }
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ return htons ((uint16_t)acc);
+}
+
+static
+uint16_t
+do_csum_16bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t* buf;
+ uint16_t remainder;
+ uint_fast16_t count8;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)buf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 8-byte unrolls */
+ count8 = len >> 3;
+ while (count8--) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ acc += ((const uint16_t*)buf)[ 1 ];
+ acc += ((const uint16_t*)buf)[ 2 ];
+ acc += ((const uint16_t*)buf)[ 3 ];
+ buf = &buf[ 8 ];
+ }
+ len %= 8;
+/* final 7 bytes */
+ while (len > 1) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
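+
+/* expository note: the final byte swap above is sound because the ones'
+ * complement sum is endian-agnostic up to a whole-register byte swap.
+ * Reading bytes b0 b1 b2 b3 ... from an odd start address pairs them as
+ * (r,b0) (b1,b2) (b3,...) instead of (b0,b1) (b2,b3) ..., so every byte
+ * accumulates in the opposite lane of the 16-bit words; since carries
+ * wrap end-around, swapping the two lanes of the folded result recovers
+ * the sum an aligned read would have produced.
+ */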
+
+static
+uint16_t
+do_csumcpy_16bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint16_t remainder;
+ uint_fast16_t count8;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)srcbuf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 8-byte unrolls; unrolling beyond 16 bytes or below 8 bytes loses performance */
+ count8 = len >> 3;
+ while (count8--) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ acc += ((uint16_t*restrict)dstbuf)[ 1 ] = ((const uint16_t*restrict)srcbuf)[ 1 ];
+ acc += ((uint16_t*restrict)dstbuf)[ 2 ] = ((const uint16_t*restrict)srcbuf)[ 2 ];
+ acc += ((uint16_t*restrict)dstbuf)[ 3 ] = ((const uint16_t*restrict)srcbuf)[ 3 ];
+ srcbuf = &srcbuf[ 8 ];
+ dstbuf = &dstbuf[ 8 ];
+ }
+ len %= 8;
+/* final 7 bytes */
+ while (len > 1) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+static
+uint16_t
+do_csum_32bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t* buf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)buf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint32_t carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((const uint32_t*)buf)[ 0 ];
+ carry = ((const uint32_t*)buf)[ 0 ] > acc;
+ buf = &buf[ 4 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 16) + (acc & 0xffff);
+ }
+ if (len & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
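+
+/* expository example: the  carry = word > acc  idiom detects unsigned
+ * wrap-around, assuming the accumulator is exactly 32 bits wide: after
+ * acc += word (mod 2^32) the result is smaller than the addend iff the
+ * add carried out.  E.g. acc = 0xffffffff, word = 0x00000002 wraps to
+ * acc = 0x00000001, and 0x00000002 > 0x00000001 yields the end-around
+ * carry of 1.  On platforms where uint_fast32_t is wider than 32 bits
+ * no wrap occurs and carry stays harmlessly zero.
+ */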
+
+static
+uint16_t
+do_csumcpy_32bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)srcbuf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint32_t carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ carry = ((const uint32_t*restrict)dstbuf)[ 0 ] > acc;
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 16) + (acc & 0xffff);
+ }
+ if (len & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+/* best if architecture has native 64-bit words
+ */
+
+static
+uint16_t
+do_csum_64bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast64_t acc;
+ const uint8_t* buf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)buf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint_fast64_t carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((const uint64_t*)buf)[ 0 ];
+ carry = ((const uint64_t*)buf)[ 0 ] > acc;
+ buf = &buf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
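+
+/* expository note: the folds above telescope 64 -> 32 -> 16 bits.  The
+ * 64->32 fold can leave a 33-bit value (up to 0xffffffff + 0xffffffff =
+ * 0x1fffffffe), so two 16-bit folds plus the final end-around add are
+ * needed before truncating to uint16_t.
+ */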
+
+static
+uint16_t
+do_csumcpy_64bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast64_t acc;
+ const uint8_t* restrict srcbuf;
+ uint8_t* restrict dstbuf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)srcbuf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+/* 64-byte blocks */
+ uint_fast64_t carry = 0;
+ uint_fast16_t count64 = count >> 3;
+ if (count64)
+ {
+ carry = 0;
+ while (count64) {
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 0 ] = ((const uint64_t*restrict)srcbuf)[ 0 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 0 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 1 ] = ((const uint64_t*restrict)srcbuf)[ 1 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 1 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 2 ] = ((const uint64_t*restrict)srcbuf)[ 2 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 2 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 3 ] = ((const uint64_t*restrict)srcbuf)[ 3 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 3 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 4 ] = ((const uint64_t*restrict)srcbuf)[ 4 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 4 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 5 ] = ((const uint64_t*restrict)srcbuf)[ 5 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 5 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 6 ] = ((const uint64_t*restrict)srcbuf)[ 6 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 6 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 7 ] = ((const uint64_t*restrict)srcbuf)[ 7 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 7 ] > acc;
+ srcbuf = &srcbuf[ 64 ];
+ dstbuf = &dstbuf[ 64 ];
+ count64--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ count %= 8;
+ }
+
+/* last 56 bytes */
+ carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 0 ] = ((const uint64_t*restrict)srcbuf)[ 0 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 0 ] > acc;
+ srcbuf = &srcbuf[ 8 ];
+ dstbuf = &dstbuf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+#if defined(__amd64) || defined(__x86_64__)
+/* simd instructions unique to AMD/Intel 64-bit (x86-64), hence always little endian.
+ */
+
+static
+uint16_t
+do_csum_vector (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint64_t acc; /* fixed size for asm */
+ const uint8_t* buf;
+ uint16_t remainder; /* fixed size for endian swap */
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+/* align first byte */
+ is_odd = ((uintptr_t)buf & 1);
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint64_t carry = 0;
+ while (count) {
+ asm volatile ( "addq %1, %0\n\t"
+ "adcq %2, %0"
+ : "=r" (acc)
+ : "m" (*(const uint64_t*)buf), "r" (carry), "0" (acc)
+ : "cc" );
+ buf = &buf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
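+
+/* expository note: in the inline asm above, "addq" sets the carry flag
+ * when the 64-bit add wraps and "adcq" immediately folds that flag back
+ * in (the carry operand itself is always zero); this is the single
+ * instruction-pair analogue of the  carry = word > acc  idiom used in
+ * the portable routines.
+ */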
+
+static
+uint16_t
+do_csumcpy_vector (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint64_t acc; /* fixed size for asm */
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint16_t remainder; /* fixed size for endian swap */
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+/* fill cache line with source buffer, invalidate destination buffer;
+ * perversely, for testing, high temporal locality is better than no locality,
+ * whilst in production no locality may be preferred depending on SKB re-use.
+ */
+ pgm_prefetch (srcbuf);
+ pgm_prefetchw (dstbuf);
+/* align first byte */
+ is_odd = ((uintptr_t)srcbuf & 1);
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+/* 64-byte blocks */
+ uint64_t carry = 0;
+ uint_fast16_t count64 = count >> 3;
+
+ while (count64)
+ {
+ pgm_prefetch (&srcbuf[ 64 ]);
+ pgm_prefetchw (&dstbuf[ 64 ]);
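+/* note: the first "adcq" below consumes whatever carry flag preceding
+ * compiler-generated code left behind ("movq" and the prefetches do not
+ * modify flags), and the block's final carry-out must likewise survive
+ * until the next iteration; the "cc" clobber guarantees neither, so a
+ * spurious +1 per 64-byte block is possible if the compiler schedules a
+ * flag-setting instruction in between.
+ */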
+ asm volatile ( "movq 0*8(%1), %%r8\n\t" /* load */
+ "movq 1*8(%1), %%r9\n\t"
+ "movq 2*8(%1), %%r10\n\t"
+ "movq 3*8(%1), %%r11\n\t"
+ "movq 4*8(%1), %%r12\n\t"
+ "movq 5*8(%1), %%r13\n\t"
+ "movq 6*8(%1), %%r14\n\t"
+ "movq 7*8(%1), %%r15\n\t"
+ "adcq %%r8, %0\n\t" /* checksum */
+ "adcq %%r9, %0\n\t"
+ "adcq %%r10, %0\n\t"
+ "adcq %%r11, %0\n\t"
+ "adcq %%r12, %0\n\t"
+ "adcq %%r13, %0\n\t"
+ "adcq %%r14, %0\n\t"
+ "adcq %%r15, %0\n\t"
+ "adcq %3, %0\n\t"
+ "movq %%r8, 0*8(%2)\n\t" /* save */
+ "movq %%r9, 1*8(%2)\n\t"
+ "movq %%r10, 2*8(%2)\n\t"
+ "movq %%r11, 3*8(%2)\n\t"
+ "movq %%r12, 4*8(%2)\n\t"
+ "movq %%r13, 5*8(%2)\n\t"
+ "movq %%r14, 6*8(%2)\n\t"
+ "movq %%r15, 7*8(%2)"
+ : "=r" (acc)
+ : "r" (srcbuf), "r" (dstbuf), "r" (carry), "0" (acc)
+ : "cc", "memory", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" );
+ srcbuf = &srcbuf[ 64 ];
+ dstbuf = &dstbuf[ 64 ];
+ count64--;
+ }
+ count %= 8;
+/* last 56 bytes */
+ while (count) {
+ asm volatile ( "addq %1, %0\n\t"
+ "adcq %2, %0"
+ : "=r" (acc)
+ : "m" (*(const uint64_t*restrict)srcbuf), "r" (carry), "0" (acc)
+ : "cc" );
+ srcbuf = &srcbuf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+#endif
+
+static inline
+uint16_t
+do_csum (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+#if defined(CONFIG_8BIT_CHECKSUM)
+ return do_csum_8bit (addr, len, csum);
+#elif defined(CONFIG_16BIT_CHECKSUM)
+ return do_csum_16bit (addr, len, csum);
+#elif defined(CONFIG_32BIT_CHECKSUM)
+ return do_csum_32bit (addr, len, csum);
+#elif defined(CONFIG_64BIT_CHECKSUM)
+ return do_csum_64bit (addr, len, csum);
+#elif defined(CONFIG_VECTOR_CHECKSUM)
+ return do_csum_vector (addr, len, csum);
+#else
+# error "checksum routine undefined"
+#endif
+}
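+
+/* expository sketch: exactly one CONFIG_*_CHECKSUM macro is expected to
+ * be defined at build time, e.g. (hypothetical invocation):
+ *
+ *	cc -DCONFIG_64BIT_CHECKSUM -c checksum.c
+ *
+ * which statically selects do_csum_64bit for every do_csum() call site.
+ */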
+
+/* Calculate an IP header style checksum
+ */
+
+uint16_t
+pgm_inet_checksum (
+ const void* addr,
+ uint16_t len,
+ uint16_t csum
+ )
+{
+/* pre-conditions */
+ pgm_assert (NULL != addr);
+
+ return ~do_csum (addr, len, csum);
+}
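+
+/* usage sketch (illustrative only, assuming a BSD-style struct ip from
+ * <netinet/ip.h>): checksumming a 20-byte IPv4 header with its checksum
+ * field zeroed, per RFC 791:
+ *
+ *	struct ip hdr;
+ *	...
+ *	hdr.ip_sum = 0;
+ *	hdr.ip_sum = pgm_inet_checksum (&hdr, sizeof (hdr), 0);
+ *
+ * verifying the header afterwards, checksum field included, then sums
+ * to 0xffff.
+ */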
+
+/* Calculate a partial (unfolded) checksum
+ */
+
+uint32_t
+pgm_compat_csum_partial (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+/* pre-conditions */
+ pgm_assert (NULL != addr);
+
+ csum = (csum >> 16) + (csum & 0xffff);
+ csum += do_csum (addr, len, 0);
+ csum = (csum >> 16) + (csum & 0xffff);
+
+ return csum;
+}
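+
+/* usage sketch (illustrative, hdr/payload are hypothetical buffers,
+ * hdr_len assumed even; odd splits need pgm_csum_block_add's offset
+ * handling): a packet may be summed in pieces and folded once:
+ *
+ *	uint32_t acc = pgm_compat_csum_partial (hdr, hdr_len, 0);
+ *	acc = pgm_compat_csum_partial (payload, payload_len, acc);
+ *	const uint16_t sum = pgm_csum_fold (acc);
+ */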
+
+/* Calculate & copy a partial PGM checksum
+ */
+
+uint32_t
+pgm_compat_csum_partial_copy (
+ const void* restrict src,
+ void* restrict dst,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+/* pre-conditions */
+ pgm_assert (NULL != src);
+ pgm_assert (NULL != dst);
+
+#if defined(CONFIG_8BIT_CHECKSUM)
+ return do_csumcpy_8bit (src, dst, len, csum);
+#elif defined(CONFIG_16BIT_CHECKSUM)
+ return do_csumcpy_16bit (src, dst, len, csum);
+#elif defined(CONFIG_32BIT_CHECKSUM)
+ return do_csumcpy_32bit (src, dst, len, csum);
+#elif defined(CONFIG_64BIT_CHECKSUM)
+ return do_csumcpy_64bit (src, dst, len, csum);
+#elif defined(CONFIG_VECTOR_CHECKSUM)
+ return do_csumcpy_vector (src, dst, len, csum);
+#else
+ memcpy (dst, src, len);
+ return pgm_csum_partial (dst, len, csum);
+#endif
+}
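+
+/* design note: fusing the copy with the checksum touches each cache
+ * line once instead of twice, which is why the memcpy-then-sum #else
+ * branch above is the fallback rather than the common path.
+ */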
+
+/* Fold a 32-bit checksum accumulator into a 16-bit final value.
+ */
+
+uint16_t
+pgm_csum_fold (
+ uint32_t csum
+ )
+{
+ csum = (csum >> 16) + (csum & 0xffff);
+ csum += (csum >> 16);
+
+/* handle special case of no checksum */
+ return (uint16_t)(csum == 0xffff ? csum : ~csum);
+}
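+
+/* expository example: ~0xffff would yield 0x0000, which reads as "no
+ * checksum" in the zero-means-unchecked convention, hence a sum of
+ * 0xffff is returned unmodified.  E.g. csum = 0x0001fffe folds to
+ * 0x0001 + 0xfffe = 0xffff and is returned as-is, while csum =
+ * 0x00013ffe folds to 0x3fff and returns ~0x3fff = 0xc000.
+ */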
+
+/* Add together two unfolded checksum accumulators
+ */
+
+uint32_t
+pgm_csum_block_add (
+ uint32_t csum,
+ uint32_t csum2,
+ const uint16_t offset
+ )
+{
+ if (offset & 1) /* byte magic on odd offset */
+ csum2 = ((csum2 & 0xff00ff) << 8) +
+ ((csum2 >> 8) & 0xff00ff);
+
+ csum += csum2;
+ return csum + (csum < csum2);
+}
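+
+/* expository example: a block that starts at an odd byte offset has all
+ * of its bytes in the opposite lane of the 16-bit words relative to the
+ * enclosing message, so both lanes of the unfolded accumulator are
+ * swapped before adding: csum2 = 0xAABBCCDD becomes 0xBBAADDCC.  The
+ * closing  csum + (csum < csum2)  folds the end-around carry from the
+ * 32-bit addition.
+ */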
+
+/* eof */