path: root/src/core/utf8.c



/*
 * Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

#include <stdint.h>
#include <assert.h>
#include <ipxe/utf8.h>

/** @file
 *
 * UTF-8 Unicode encoding
 *
 */

/**
 * Accumulate Unicode character from UTF-8 byte sequence
 *
 * @v utf8		UTF-8 accumulator
 * @v byte		UTF-8 byte
 * @ret character	Unicode character, or 0 if incomplete
 *
 * Bytes are fed in one at a time.  The return value is:
 *
 *  - the byte itself, for an ASCII byte;
 *  - 0, while a multi-byte sequence is still awaiting continuation
 *    bytes;
 *  - the decoded character, once the final continuation byte of a
 *    sequence has been received;
 *  - UTF8_INVALID, for an unexpected continuation byte, an initial
 *    byte announcing an over-long sequence, an overlong (non-minimal)
 *    encoding, or a decoded value that is not a Unicode scalar value
 *    (a UTF-16 surrogate or a value beyond U+10FFFF).
 *
 * A partial sequence that is interrupted by a non-continuation byte
 * is discarded (with a debug message only) and the interrupting byte
 * is then processed as the start of a new character.
 */
unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) {
	/* Minimum legal character value, indexed by (number of
	 * continuation bytes - 1).  Used to reject overlong encodings.
	 */
	static const unsigned int min[] = {
		UTF8_MIN_TWO,
		UTF8_MIN_THREE,
		UTF8_MIN_FOUR,
	};
	/* Limits of the Unicode scalar value range (RFC 3629) */
	static const unsigned int surrogate_first = 0xd800;
	static const unsigned int surrogate_last = 0xdfff;
	static const unsigned int scalar_max = 0x10ffff;
	unsigned int shift;
	unsigned int len;
	uint8_t tmp;

	/* Handle continuation bytes */
	if ( UTF8_IS_CONTINUATION ( byte ) ) {

		/* Fail if this is an unexpected continuation byte */
		if ( utf8->remaining == 0 ) {
			DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
			return UTF8_INVALID;
		}

		/* Apply continuation byte */
		utf8->character <<= UTF8_CONTINUATION_BITS;
		utf8->character |= ( byte & UTF8_CONTINUATION_MASK );

		/* Return 0 if more continuation bytes are expected */
		if ( --utf8->remaining != 0 )
			return 0;

		/* Fail if sequence is illegal (overlong encoding) */
		if ( utf8->character < utf8->min ) {
			DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
			       utf8->character );
			return UTF8_INVALID;
		}

		/* Fail if the decoded value is not a Unicode scalar
		 * value.  Surrogates and values beyond U+10FFFF can
		 * be produced by byte sequences that are otherwise
		 * well-formed, but must never be treated as valid
		 * characters.
		 */
		if ( ( utf8->character > scalar_max ) ||
		     ( ( utf8->character >= surrogate_first ) &&
		       ( utf8->character <= surrogate_last ) ) ) {
			DBGC ( utf8, "UTF8 %p out of range %02x\n", utf8,
			       utf8->character );
			return UTF8_INVALID;
		}

		/* Sanity check: the minimum-value check above
		 * guarantees a non-zero character, so a completed
		 * character can never be mistaken for "incomplete".
		 */
		assert ( utf8->character != 0 );

		/* Return completed character */
		DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
			utf8, utf8->character );
		return utf8->character;
	}

	/* Reset state and report failure if this is an unexpected
	 * non-continuation byte.  Do not return UTF8_INVALID since
	 * doing so could cause us to drop a valid ASCII character.
	 */
	if ( utf8->remaining != 0 ) {
		/* Number of low-order bits that were never received */
		shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
		DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
		       utf8, byte, ( utf8->character << shift ),
		       ( ( 1U << shift ) - 1 ) );
		utf8->remaining = 0;
	}

	/* Handle initial bytes */
	if ( ! UTF8_IS_ASCII ( byte ) ) {

		/* Sanity check */
		assert ( utf8->remaining == 0 );

		/* Count total number of bytes in sequence: this is
		 * the number of leading one bits in the initial byte.
		 * Since tmp is only eight bits wide, the loop always
		 * terminates (after at most eight shifts).
		 */
		tmp = byte;
		len = 0;
		while ( tmp & UTF8_HIGH_BIT ) {
			tmp <<= 1;
			len++;
		}

		/* Check for illegal length */
		if ( len > UTF8_MAX_LEN ) {
			DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
			       utf8, byte, len );
			return UTF8_INVALID;
		}

		/* Store initial bits of character: shifting back
		 * down leaves only the payload bits of the initial
		 * byte, with the length marker stripped.
		 */
		utf8->character = ( tmp >> len );

		/* Store number of bytes remaining */
		len--;
		utf8->remaining = len;
		assert ( utf8->remaining > 0 );

		/* Store minimum legal value */
		utf8->min = min[ len - 1 ];
		assert ( utf8->min > 0 );

		/* Await continuation bytes */
		return 0;
	}

	/* Handle ASCII bytes */
	return byte;
}