//parse_utf8.cpp

#include "parse_utf8.h"

// Construct a decoder with no partially-read sequence pending:
// the count of bytes consumed for the current sequence starts at zero.
parse_utf8::parse_utf8()
	: m_input_so_far(0)
{
}

// Translate a return_status code into a short human-readable description.
//   s - status value to describe
// Returns a pointer to a string literal; any status without a dedicated
// message yields "Unknown status".
const char* parse_utf8::strerror( return_status s )
{
	// Start from the fallback text and overwrite it for every known code.
	const char* description = "Unknown status";

	switch(s)
	{
	case INPUT_OK:
		description = "Input OK";
		break;
	case SEQ_COMPLETE:
		description = "Sequence Complete";
		break;
	case INVALID_UTF8_BYTE:
		description = "Invalid utf-8 byte";
		break;
	case UNEXPECTED_UTF8_CONTINUING_CHARACTER:
		description = "unexpected continuing character";
		break;
	case OVERCODED_UTF8_SEQUENCE:
		description = "overcoded utf-8";
		break;
	case TRUNCATED_UTF8_SEQUENCE_1:
		description = "truncated utf-8 sequence, new sequence completed";
		break;
	case TRUNCATED_UTF8_SEQUENCE_2:
		description = "truncated utf-8 sequence, new sequence started";
		break;
	case TRUNCATED_UTF8_SEQUENCE_3:
		description = "truncated utf-8 sequence, invalid byte dropped";
		break;
	case TRUNCATED_UTF8_SEQUENCE_4:
		description = "truncated utf-8 sequence, at eof";
		break;
	default:
		break;
	}

	return description;
}

// File-local helper: smallest value that genuinely needs a sequence of
// `length` bytes. A completed sequence that decodes to less than this is an
// overlong ("overcoded") encoding. Lengths outside 2..6 yield 0 so the caller
// never compares against an indeterminate value.
static int utf8_min_value_for_length( int length )
{
	switch( length )
	{
	case 2:
		return 0x80;
	case 3:
		return 0x800;
	case 4:
		return 0x10000;
	case 5:
		return 0x200000;
	case 6:
		return 0x4000000;
	default:
		return 0;
	}
}

// Feed one input value to the decoder and report what happened.
//
//   inchar - a byte value in 0..253, or -1 to signal end of input. Any other
//            value (other negatives, 254, 255 and above) is an invalid byte.
//
// Returns:
//   INPUT_OK                              - byte accepted, sequence still in
//                                           progress (also: -1 with nothing
//                                           pending)
//   SEQ_COMPLETE                          - a full character is now held in
//                                           m_last_result
//   OVERCODED_UTF8_SEQUENCE               - sequence finished but used more
//                                           bytes than its value requires
//   INVALID_UTF8_BYTE                     - invalid value with nothing pending
//   UNEXPECTED_UTF8_CONTINUING_CHARACTER  - continuation byte (0x80..0xBF)
//                                           with no sequence in progress
//   TRUNCATED_UTF8_SEQUENCE_1             - pending sequence abandoned; the new
//                                           byte is a complete 1-byte character
//   TRUNCATED_UTF8_SEQUENCE_2             - pending sequence abandoned; the new
//                                           byte starts another sequence
//   TRUNCATED_UTF8_SEQUENCE_3             - pending sequence abandoned because
//                                           of an invalid value, which is
//                                           dropped
//   TRUNCATED_UTF8_SEQUENCE_4             - pending sequence abandoned at end
//                                           of input
parse_utf8::return_status parse_utf8::proc_input( int inchar )
{
	if(inchar<0 || inchar >253)
	{
		// Not a usable byte: -1 means end of input, everything else is invalid.
		const bool at_eof = (inchar == -1);

		if(m_input_so_far>0)
		{
			// A multi-byte sequence was still waiting for more bytes.
			m_input_so_far=0;
			return at_eof ? TRUNCATED_UTF8_SEQUENCE_4 : TRUNCATED_UTF8_SEQUENCE_3;
		}

		return at_eof ? INPUT_OK : INVALID_UTF8_BYTE;
	}

	const unsigned char inch = static_cast<unsigned char>(inchar);
	bool seq_dropped=false;

	if(m_input_so_far > 0)
	{
		if( (inch & 0xC0) == 0x80 )
		{
			// Continuation byte: append its low six bits to the value.
			++m_input_so_far;
			m_last_result <<= 6;
			m_last_result |= inch & 0x3F;

			if(m_input_so_far != m_detect_length)
				return INPUT_OK;

			// Sequence complete: reset state, then reject overlong encodings.
			m_input_so_far=0;

			if( m_last_result < utf8_min_value_for_length( m_detect_length ) )
				return OVERCODED_UTF8_SEQUENCE;

			return SEQ_COMPLETE;
		}

		// Expected a continuation byte but got something else: abandon the
		// pending sequence and treat this byte as the start of a new one.
		m_input_so_far=0;
		seq_dropped=true;
	}

	// Start a new character; the lead byte determines m_detect_length.
	if( inch < 0x80 )
	{
		// Single-byte character: already complete.
		m_last_result=static_cast<int>(inch);
		return seq_dropped ? TRUNCATED_UTF8_SEQUENCE_1: SEQ_COMPLETE;
	}

	if( inch < 0xC0 )
	{
		// (0x80 -> 0xBF) continuation byte with no sequence in progress.
		// seq_dropped is always false here: it is only set when the byte is
		// NOT a continuation byte, so no truncated-sequence status applies.
		return UNEXPECTED_UTF8_CONTINUING_CHARACTER;
	}

	if( inch < 0xE0 )
	{
		// (0xC0 -> 0xDF) lead byte of a 2-byte sequence
		m_detect_length=2;
		m_last_result = inch & 0x1F;
	}
	else if( inch < 0xF0 )
	{
		// (0xE0 -> 0xEF) lead byte of a 3-byte sequence
		m_detect_length=3;
		m_last_result = inch & 0x0F;
	}
	else if( inch < 0xF8 )
	{
		// (0xF0 -> 0xF7) lead byte of a 4-byte sequence
		m_detect_length=4;
		m_last_result = inch & 0x07;
	}
	else if( inch < 0xFC )
	{
		// (0xF8 -> 0xFB) lead byte of a 5-byte sequence
		m_detect_length=5;
		m_last_result = inch & 0x03;
	}
	else
	{
		// (0xFC -> 0xFD) lead byte of a 6-byte sequence; values above 0xFD
		// were already rejected by the range check at the top.
		m_detect_length=6;
		m_last_result = inch & 0x01;
	}

	m_input_so_far=1;
	return seq_dropped ? TRUNCATED_UTF8_SEQUENCE_2: INPUT_OK;
}

