Diffstat (limited to '3rdparty/openpgm-svn-r1135/pgm/checksum.c')
-rw-r--r--  3rdparty/openpgm-svn-r1135/pgm/checksum.c  941
1 file changed, 941 insertions, 0 deletions
diff --git a/3rdparty/openpgm-svn-r1135/pgm/checksum.c b/3rdparty/openpgm-svn-r1135/pgm/checksum.c
new file mode 100644
index 0000000..5e367ea
--- /dev/null
+++ b/3rdparty/openpgm-svn-r1135/pgm/checksum.c
@@ -0,0 +1,941 @@
+/* vim:ts=8:sts=8:sw=4:noai:noexpandtab
+ *
+ * PGM checksum routines
+ *
+ * Copyright (c) 2006-2010 Miru Limited.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <impl/framework.h>
+
+
+/* locals */
+
+static inline uint16_t do_csum (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_8bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_16bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_32bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+static uint16_t do_csum_64bit (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+#if defined(__amd64) || defined(__x86_64__)
+static uint16_t do_csum_vector (const void*, uint16_t, uint32_t) PGM_GNUC_PURE;
+#endif
+
+
+/* endian-independent checksum routine
+ */
+
+static
+uint16_t
+do_csum_8bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ uint16_t src;
+ const uint8_t* buf;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ while (len > 1) {
+/* first byte as most significant */
+ src = (*buf++) << 8;
+/* second byte as least significant */
+ src |= (*buf++);
+ acc += src;
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ src = (*buf) << 8;
+ acc += src;
+ }
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ return htons ((uint16_t)acc);
+}
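+
+/* expository example: the two folds above reduce the 32-bit accumulator
+ * to 16 bits while preserving the ones' complement sum.  With
+ * acc = 0x0002fffd:
+ *
+ *	acc = (acc >> 16) + (acc & 0xffff);	// 0x0002 + 0xfffd = 0x0000ffff
+ *	acc += (acc >> 16);			// 0xffff + 0x0000 = 0x0000ffff
+ *
+ * the second step re-adds any carry the first fold itself produced,
+ * e.g. acc = 0x0003fffd folds to 0x10000, then to 0x10001, which the
+ * final 16-bit truncation returns as the end-around result 0x0001.
+ */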
+
+static
+uint16_t
+do_csumcpy_8bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint_fast16_t val16;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ while (len > 1) {
+/* first byte as most significant */
+ val16 = (*dstbuf++ = *srcbuf++) << 8;
+/* second byte as least significant */
+ val16 |= (*dstbuf++ = *srcbuf++);
+ acc += val16;
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ val16 = (*dstbuf = *srcbuf) << 8;
+ acc += val16;
+ }
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ return htons ((uint16_t)acc);
+}
+
+static
+uint16_t
+do_csum_16bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t* buf;
+ uint16_t remainder;
+ uint_fast16_t count8;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)buf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 8-byte unrolls */
+ count8 = len >> 3;
+ while (count8--) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ acc += ((const uint16_t*)buf)[ 1 ];
+ acc += ((const uint16_t*)buf)[ 2 ];
+ acc += ((const uint16_t*)buf)[ 3 ];
+ buf = &buf[ 8 ];
+ }
+ len %= 8;
+/* final 7 bytes */
+ while (len > 1) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
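+
+/* expository note: the final byte swap above is sound because the ones'
+ * complement sum is endian-agnostic up to a whole-register byte swap.
+ * Reading bytes b0 b1 b2 b3 ... from an odd start address pairs them as
+ * (r,b0) (b1,b2) (b3,...) instead of (b0,b1) (b2,b3) ..., so every byte
+ * accumulates in the opposite lane of the 16-bit words; since carries
+ * wrap end-around, swapping the two lanes of the folded result recovers
+ * the sum an aligned read would have produced.
+ */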
+
+static
+uint16_t
+do_csumcpy_16bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint16_t remainder;
+ uint_fast16_t count8;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)srcbuf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 8-byte unrolls; unrolling beyond 16 bytes or below 8 bytes loses performance */
+ count8 = len >> 3;
+ while (count8--) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ acc += ((uint16_t*restrict)dstbuf)[ 1 ] = ((const uint16_t*restrict)srcbuf)[ 1 ];
+ acc += ((uint16_t*restrict)dstbuf)[ 2 ] = ((const uint16_t*restrict)srcbuf)[ 2 ];
+ acc += ((uint16_t*restrict)dstbuf)[ 3 ] = ((const uint16_t*restrict)srcbuf)[ 3 ];
+ srcbuf = &srcbuf[ 8 ];
+ dstbuf = &dstbuf[ 8 ];
+ }
+ len %= 8;
+/* final 7 bytes */
+ while (len > 1) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ len -= 2;
+ }
+/* trailing odd byte */
+ if (len > 0) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+static
+uint16_t
+do_csum_32bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t* buf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)buf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint32_t carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((const uint32_t*)buf)[ 0 ];
+ carry = ((const uint32_t*)buf)[ 0 ] > acc;
+ buf = &buf[ 4 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 16) + (acc & 0xffff);
+ }
+ if (len & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
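+
+/* expository example: the  carry = word > acc  idiom detects unsigned
+ * wrap-around, assuming the accumulator is exactly 32 bits wide: after
+ * acc += word (mod 2^32) the result is smaller than the addend iff the
+ * add carried out.  E.g. acc = 0xffffffff, word = 0x00000002 wraps to
+ * acc = 0x00000001, and 0x00000002 > 0x00000001 yields the end-around
+ * carry of 1.  On platforms where uint_fast32_t is wider than 32 bits
+ * no wrap occurs and carry stays harmlessly zero.
+ */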
+
+static
+uint16_t
+do_csumcpy_32bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast32_t acc;
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)srcbuf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint32_t carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ carry = ((const uint32_t*restrict)dstbuf)[ 0 ] > acc;
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 16) + (acc & 0xffff);
+ }
+ if (len & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+/* best if architecture has native 64-bit words
+ */
+
+static
+uint16_t
+do_csum_64bit (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast64_t acc;
+ const uint8_t* buf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)buf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint_fast64_t carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((const uint64_t*)buf)[ 0 ];
+ carry = ((const uint64_t*)buf)[ 0 ] > acc;
+ buf = &buf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
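+
+/* expository note: the folds above telescope 64 -> 32 -> 16 bits.  The
+ * 64->32 fold can leave a 33-bit value (up to 0xffffffff + 0xffffffff =
+ * 0x1fffffffe), so two 16-bit folds plus the final end-around add are
+ * needed before truncating to uint16_t.
+ */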
+
+static
+uint16_t
+do_csumcpy_64bit (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint_fast64_t acc;
+ const uint8_t* restrict srcbuf;
+ uint8_t* restrict dstbuf;
+ uint16_t remainder;
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+ is_odd = ((uintptr_t)srcbuf & 1);
+/* align first byte */
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+/* 64-byte blocks */
+ uint_fast64_t carry = 0;
+ uint_fast16_t count64 = count >> 3;
+ if (count64)
+ {
+ carry = 0;
+ while (count64) {
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 0 ] = ((const uint64_t*restrict)srcbuf)[ 0 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 0 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 1 ] = ((const uint64_t*restrict)srcbuf)[ 1 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 1 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 2 ] = ((const uint64_t*restrict)srcbuf)[ 2 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 2 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 3 ] = ((const uint64_t*restrict)srcbuf)[ 3 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 3 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 4 ] = ((const uint64_t*restrict)srcbuf)[ 4 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 4 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 5 ] = ((const uint64_t*restrict)srcbuf)[ 5 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 5 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 6 ] = ((const uint64_t*restrict)srcbuf)[ 6 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 6 ] > acc;
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 7 ] = ((const uint64_t*restrict)srcbuf)[ 7 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 7 ] > acc;
+ srcbuf = &srcbuf[ 64 ];
+ dstbuf = &dstbuf[ 64 ];
+ count64--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ count %= 8;
+ }
+
+/* last 56 bytes */
+ carry = 0;
+ while (count) {
+ acc += carry;
+ acc += ((uint64_t*restrict)dstbuf)[ 0 ] = ((const uint64_t*restrict)srcbuf)[ 0 ];
+ carry = ((const uint64_t*restrict)dstbuf)[ 0 ] > acc;
+ srcbuf = &srcbuf[ 8 ];
+ dstbuf = &dstbuf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+#if defined(__amd64) || defined(__x86_64__)
+/* simd instructions unique to AMD/Intel 64-bit (x86-64), hence always little endian.
+ */
+
+static
+uint16_t
+do_csum_vector (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint64_t acc; /* fixed size for asm */
+ const uint8_t* buf;
+ uint16_t remainder; /* fixed size for endian swap */
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ buf = (const uint8_t*)addr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+/* align first byte */
+ is_odd = ((uintptr_t)buf & 1);
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*)&remainder)[1] = *buf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)buf & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+ uint64_t carry = 0;
+ while (count) {
+ asm volatile ( "addq %1, %0\n\t"
+ "adcq %2, %0"
+ : "=r" (acc)
+ : "m" (*(const uint64_t*)buf), "r" (carry), "0" (acc)
+ : "cc" );
+ buf = &buf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((const uint32_t*)buf)[ 0 ];
+ buf = &buf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((const uint16_t*)buf)[ 0 ];
+ buf = &buf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*)&remainder)[0] = *buf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
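+
+/* expository note: in the inline asm above, "addq" sets the carry flag
+ * when the 64-bit add wraps and "adcq" immediately folds that flag back
+ * in (the carry operand itself is always zero); this is the single
+ * instruction-pair analogue of the  carry = word > acc  idiom used in
+ * the portable routines.
+ */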
+
+static
+uint16_t
+do_csumcpy_vector (
+ const void* restrict srcaddr,
+ void* restrict dstaddr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+ uint64_t acc; /* fixed size for asm */
+ const uint8_t*restrict srcbuf;
+ uint8_t*restrict dstbuf;
+ uint16_t remainder; /* fixed size for endian swap */
+ uint_fast16_t count;
+ bool is_odd;
+
+ acc = csum;
+ srcbuf = (const uint8_t*restrict)srcaddr;
+ dstbuf = (uint8_t*restrict)dstaddr;
+ remainder = 0;
+
+ if (PGM_UNLIKELY(len == 0))
+ return (uint16_t)acc;
+/* fill cache line with source buffer, invalidate destination buffer;
+ * perversely, for testing, high temporal locality is better than no locality,
+ * whilst in production no locality may be preferred depending on SKB re-use.
+ */
+ pgm_prefetch (srcbuf);
+ pgm_prefetchw (dstbuf);
+/* align first byte */
+ is_odd = ((uintptr_t)srcbuf & 1);
+ if (PGM_UNLIKELY(is_odd)) {
+ ((uint8_t*restrict)&remainder)[1] = *dstbuf++ = *srcbuf++;
+ len--;
+ }
+/* 16-bit words */
+ count = len >> 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ count--;
+ len -= 2;
+ }
+/* 32-bit words */
+ count >>= 1;
+ if (count)
+ {
+ if ((uintptr_t)srcbuf & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ count--;
+ len -= 4;
+ }
+/* 64-bit words */
+ count >>= 1;
+ if (count)
+ {
+/* 64-byte blocks */
+ uint64_t carry = 0;
+ uint_fast16_t count64 = count >> 3;
+
+ while (count64)
+ {
+ pgm_prefetch (&srcbuf[ 64 ]);
+ pgm_prefetchw (&dstbuf[ 64 ]);
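+/* note: the first "adcq" below consumes whatever carry flag preceding
+ * compiler-generated code left behind ("movq" and the prefetches do not
+ * modify flags), and the block's final carry-out must likewise survive
+ * until the next iteration; the "cc" clobber guarantees neither, so a
+ * spurious +1 per 64-byte block is possible if the compiler schedules a
+ * flag-setting instruction in between.
+ */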
+ asm volatile ( "movq 0*8(%1), %%r8\n\t" /* load */
+ "movq 1*8(%1), %%r9\n\t"
+ "movq 2*8(%1), %%r10\n\t"
+ "movq 3*8(%1), %%r11\n\t"
+ "movq 4*8(%1), %%r12\n\t"
+ "movq 5*8(%1), %%r13\n\t"
+ "movq 6*8(%1), %%r14\n\t"
+ "movq 7*8(%1), %%r15\n\t"
+ "adcq %%r8, %0\n\t" /* checksum */
+ "adcq %%r9, %0\n\t"
+ "adcq %%r10, %0\n\t"
+ "adcq %%r11, %0\n\t"
+ "adcq %%r12, %0\n\t"
+ "adcq %%r13, %0\n\t"
+ "adcq %%r14, %0\n\t"
+ "adcq %%r15, %0\n\t"
+ "adcq %3, %0\n\t"
+ "movq %%r8, 0*8(%2)\n\t" /* save */
+ "movq %%r9, 1*8(%2)\n\t"
+ "movq %%r10, 2*8(%2)\n\t"
+ "movq %%r11, 3*8(%2)\n\t"
+ "movq %%r12, 4*8(%2)\n\t"
+ "movq %%r13, 5*8(%2)\n\t"
+ "movq %%r14, 6*8(%2)\n\t"
+ "movq %%r15, 7*8(%2)"
+ : "=r" (acc)
+ : "r" (srcbuf), "r" (dstbuf), "r" (carry), "0" (acc)
+ : "cc", "memory", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" );
+ srcbuf = &srcbuf[ 64 ];
+ dstbuf = &dstbuf[ 64 ];
+ count64--;
+ }
+ count %= 8;
+/* last 56 bytes */
+ while (count) {
+ asm volatile ( "addq %1, %0\n\t"
+ "adcq %2, %0"
+ : "=r" (acc)
+ : "m" (*(const uint64_t*restrict)srcbuf), "r" (carry), "0" (acc)
+ : "cc" );
+ srcbuf = &srcbuf[ 8 ];
+ count--;
+ }
+ acc += carry;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ }
+ if (len & 4) {
+ acc += ((uint32_t*restrict)dstbuf)[ 0 ] = ((const uint32_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 4 ];
+ dstbuf = &dstbuf[ 4 ];
+ }
+ }
+ if (len & 2) {
+ acc += ((uint16_t*restrict)dstbuf)[ 0 ] = ((const uint16_t*restrict)srcbuf)[ 0 ];
+ srcbuf = &srcbuf[ 2 ];
+ dstbuf = &dstbuf[ 2 ];
+ }
+ }
+/* trailing odd byte */
+ if (len & 1) {
+ ((uint8_t*restrict)&remainder)[0] = *dstbuf = *srcbuf;
+ }
+ acc += remainder;
+ acc = (acc >> 32) + (acc & 0xffffffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc = (acc >> 16) + (acc & 0xffff);
+ acc += (acc >> 16);
+ if (PGM_UNLIKELY(is_odd))
+ acc = ((acc & 0xff) << 8) | ((acc & 0xff00) >> 8);
+ return (uint16_t)acc;
+}
+
+#endif
+
+static inline
+uint16_t
+do_csum (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+#if defined(CONFIG_8BIT_CHECKSUM)
+ return do_csum_8bit (addr, len, csum);
+#elif defined(CONFIG_16BIT_CHECKSUM)
+ return do_csum_16bit (addr, len, csum);
+#elif defined(CONFIG_32BIT_CHECKSUM)
+ return do_csum_32bit (addr, len, csum);
+#elif defined(CONFIG_64BIT_CHECKSUM)
+ return do_csum_64bit (addr, len, csum);
+#elif defined(CONFIG_VECTOR_CHECKSUM)
+ return do_csum_vector (addr, len, csum);
+#else
+# error "checksum routine undefined"
+#endif
+}
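+
+/* expository sketch: exactly one CONFIG_*_CHECKSUM macro is expected to
+ * be defined at build time, e.g. (hypothetical invocation):
+ *
+ *	cc -DCONFIG_64BIT_CHECKSUM -c checksum.c
+ *
+ * which statically selects do_csum_64bit for every do_csum() call site.
+ */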
+
+/* Calculate an IP header style checksum
+ */
+
+uint16_t
+pgm_inet_checksum (
+ const void* addr,
+ uint16_t len,
+ uint16_t csum
+ )
+{
+/* pre-conditions */
+ pgm_assert (NULL != addr);
+
+ return ~do_csum (addr, len, csum);
+}
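+
+/* usage sketch (illustrative only, assuming a BSD-style struct ip from
+ * <netinet/ip.h>): checksumming a 20-byte IPv4 header with its checksum
+ * field zeroed, per RFC 791:
+ *
+ *	struct ip hdr;
+ *	...
+ *	hdr.ip_sum = 0;
+ *	hdr.ip_sum = pgm_inet_checksum (&hdr, sizeof (hdr), 0);
+ *
+ * verifying the header afterwards, checksum field included, then sums
+ * to 0xffff.
+ */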
+
+/* Calculate a partial (unfolded) checksum
+ */
+
+uint32_t
+pgm_compat_csum_partial (
+ const void* addr,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+/* pre-conditions */
+ pgm_assert (NULL != addr);
+
+ csum = (csum >> 16) + (csum & 0xffff);
+ csum += do_csum (addr, len, 0);
+ csum = (csum >> 16) + (csum & 0xffff);
+
+ return csum;
+}
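+
+/* usage sketch (illustrative, hdr/payload are hypothetical buffers,
+ * hdr_len assumed even; odd splits need pgm_csum_block_add's offset
+ * handling): a packet may be summed in pieces and folded once:
+ *
+ *	uint32_t acc = pgm_compat_csum_partial (hdr, hdr_len, 0);
+ *	acc = pgm_compat_csum_partial (payload, payload_len, acc);
+ *	const uint16_t sum = pgm_csum_fold (acc);
+ */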
+
+/* Calculate & copy a partial PGM checksum
+ */
+
+uint32_t
+pgm_compat_csum_partial_copy (
+ const void* restrict src,
+ void* restrict dst,
+ uint16_t len,
+ uint32_t csum
+ )
+{
+/* pre-conditions */
+ pgm_assert (NULL != src);
+ pgm_assert (NULL != dst);
+
+#if defined(CONFIG_8BIT_CHECKSUM)
+ return do_csumcpy_8bit (src, dst, len, csum);
+#elif defined(CONFIG_16BIT_CHECKSUM)
+ return do_csumcpy_16bit (src, dst, len, csum);
+#elif defined(CONFIG_32BIT_CHECKSUM)
+ return do_csumcpy_32bit (src, dst, len, csum);
+#elif defined(CONFIG_64BIT_CHECKSUM)
+ return do_csumcpy_64bit (src, dst, len, csum);
+#elif defined(CONFIG_VECTOR_CHECKSUM)
+ return do_csumcpy_vector (src, dst, len, csum);
+#else
+ memcpy (dst, src, len);
+ return pgm_csum_partial (dst, len, csum);
+#endif
+}
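+
+/* design note: fusing the copy with the checksum touches each cache
+ * line once instead of twice, which is why the memcpy-then-sum #else
+ * branch above is the fallback rather than the common path.
+ */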
+
+/* Fold a 32-bit checksum accumulator into a 16-bit final value.
+ */
+
+uint16_t
+pgm_csum_fold (
+ uint32_t csum
+ )
+{
+ csum = (csum >> 16) + (csum & 0xffff);
+ csum += (csum >> 16);
+
+/* handle special case of no checksum */
+ return (uint16_t)(csum == 0xffff ? csum : ~csum);
+}
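+
+/* expository example: ~0xffff would yield 0x0000, which reads as "no
+ * checksum" in the zero-means-unchecked convention, hence a sum of
+ * 0xffff is returned unmodified.  E.g. csum = 0x0001fffe folds to
+ * 0x0001 + 0xfffe = 0xffff and is returned as-is, while csum =
+ * 0x00013ffe folds to 0x3fff and returns ~0x3fff = 0xc000.
+ */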
+
+/* Add together two unfolded checksum accumulators
+ */
+
+uint32_t
+pgm_csum_block_add (
+ uint32_t csum,
+ uint32_t csum2,
+ const uint16_t offset
+ )
+{
+ if (offset & 1) /* byte magic on odd offset */
+ csum2 = ((csum2 & 0xff00ff) << 8) +
+ ((csum2 >> 8) & 0xff00ff);
+
+ csum += csum2;
+ return csum + (csum < csum2);
+}
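+
+/* expository example: a block that starts at an odd byte offset has all
+ * of its bytes in the opposite lane of the 16-bit words relative to the
+ * enclosing message, so both lanes of the unfolded accumulator are
+ * swapped before adding: csum2 = 0xAABBCCDD becomes 0xBBAADDCC.  The
+ * closing  csum + (csum < csum2)  folds the end-around carry from the
+ * 32-bit addition.
+ */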
+
+/* eof */