summaryrefslogblamecommitdiffstats
path: root/src/arch/x86/include/bits/string.h
blob: c26fe30d502347e95ce8ea8536819f7b08ce67c9 (plain) (tree)
1
2
3
4
5
6
7
8
9
10


                         
  





                                                                     
  








                                                                      



                                                                    

   
                                       



                              
  

   
                                                                   
                                                                           
 







                                                             

                                                               









                                                           














                                                                      
                                              

                              
                                               

                              
                                               





                                                                 

                                               

                               

                                               

                               

                                               

                               

                                               






















































                                                                               
 
 















                                                            
 






















                                                                    

 







                                           

                                                      







                                                                         

 




























































































































                                                                               
                              
#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H

/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * Optimised string operations
 *
 */

/* Defined outside this header.  Used here whenever the length is not
 * a compile-time constant, or is a compile-time constant of 26 bytes
 * or more.
 */
extern void * __memcpy ( void *dest, const void *src, size_t len );
/* Defined outside this header.  Used by memmove() when the destination
 * is known at compile time to lie above the source.
 */
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );

/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Three strategies are used, selected by length:
 *
 *  - 0-6 and 8 bytes: plain assignments through a union overlay
 *  - 7 and 9-25 bytes: an unrolled run of individual "movs" instructions
 *  - 26 bytes or more: a call to __memcpy()
 *
 * The function is force-inlined; presumably the intent is that, with a
 * constant length, the compiler discards every branch except the one
 * that applies.
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	/* Overlay giving 32-, 16- and 8-bit views of the first eight
	 * bytes of the destination.  The __may_alias__ attribute
	 * allows these accesses regardless of the real object type.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	/* Matching read-only overlay for the source */
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	/* Running source/destination pointers for the "movs" sequence */
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 * NOTE(review): the "N bytes" annotation on each case below
	 * appears to be an estimate of generated code size, not the
	 * number of bytes copied (which is the case label itself).
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0]  = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2]  = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4]  = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len		   16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)	    8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)  4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */

	esi = src;
	edi = dest;
	
	/* At or beyond the cutoff: hand over to the general routine */
	if ( len >= 26 )
		return __memcpy ( dest, src, len );
	
	/* Unrolled copy.  Each statement passes edi/esi in and out of
	 * the asm via matching constraints, so the pointers advanced
	 * by one "movs" are the starting pointers for the next.  Up
	 * to six 4-byte moves are followed by an optional 2-byte move
	 * and an optional 1-byte move for the remainder.
	 *
	 * NOTE(review): relies on the direction flag being clear so
	 * that the pointers advance upwards - confirm.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}

/**
 * Copy memory area
 *
 * Selects between the inline constant-length copy and the externally
 * defined __memcpy(), depending on whether the compiler can see the
 * length at compile time.
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {

	/* Length only known at run time: use the general routine */
	if ( ! __builtin_constant_p ( len ) )
		return __memcpy ( dest, src, len );

	/* Length known at compile time: use the specialised copy */
	return __constant_memcpy ( dest, src, len );
}

/* Defined outside this header.  Used by memmove() when the relative
 * position of the two areas is not a compile-time constant.
 */
extern void * __memmove ( void *dest, const void *src, size_t len );

/**
 * Copy (possibly overlapping) memory area
 *
 * If the distance between destination and source is a compile-time
 * constant, the copy routine is chosen at compile time; otherwise the
 * choice is left to __memmove().
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t delta = ( dest - src );

	/* Relative position not known at compile time */
	if ( ! __builtin_constant_p ( delta ) )
		return __memmove ( dest, src, len );

	/* Destination lies above source */
	if ( delta > 0 )
		return __memcpy_reverse ( dest, src, len );

	/* Destination lies at or below source */
	return memcpy ( dest, src, len );
}

/**
 * Fill memory region
 *
 * Implemented as a single "rep stosb", which stores the low byte of
 * the fill value to each of the len bytes starting at dest.
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	/* Working copies: the instruction modifies both registers, so
	 * they are passed as read-write operands and the caller's
	 * values are left untouched.
	 */
	void *cursor = dest;
	size_t remaining = len;

	__asm__ __volatile__ ( "rep stosb"
			       : "+D" ( cursor ), "+c" ( remaining )
			       : "a" ( fill )
			       : "memory" );
	return dest;
}

/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Three strategies are used, selected by length:
 *
 *  - 0-6 and 8 bytes: plain zero stores through a union overlay
 *  - 7 and 9-25 bytes: an unrolled run of individual "stos" instructions
 *  - 26 bytes or more: a call to __memset() with a zero fill value
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	/* Overlay giving 32-, 16- and 8-bit views of the first eight
	 * bytes of the destination.  The __may_alias__ attribute
	 * allows these accesses regardless of the real object type.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	/* Running destination pointer and zero value for the "stos"
	 * sequence
	 */
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 *
	 * NOTE(review): the "N bytes" annotation on each case below
	 * appears to be an estimate of generated code size, not the
	 * number of bytes cleared (which is the case label itself).
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2: /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4: /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2]  = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4]  = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */

	edi = dest;
	eax = 0;

	/* At or beyond the cutoff: hand over to the general routine */
	if ( len >= 26 )
		return __memset ( dest, 0, len );

	/* Unrolled fill.  Each statement passes edi/eax in and out of
	 * the asm via matching constraints, so the pointer advanced by
	 * one "stos" is the starting pointer for the next.  Up to six
	 * 4-byte stores are followed by an optional 2-byte store and
	 * an optional 1-byte store for the remainder.
	 *
	 * NOTE(review): relies on the direction flag being clear so
	 * that the pointer advances upwards - confirm.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}

/**
 * Fill memory region
 *
 * A fill value that is a compile-time zero, combined with a
 * compile-time length, is routed to the specialised zero-fill
 * routine; every other combination uses __memset().
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {

	/* General case: fill value or length not usable at compile
	 * time, or fill value is non-zero.
	 */
	if ( ! ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
		 __builtin_constant_p ( len ) ) )
		return __memset ( dest, fill, len );

	/* Constant-length zero fill */
	return __constant_memset_zero ( dest, len );
}

#endif /* X86_BITS_STRING_H */