Index: src/lookslike.c ================================================================== --- src/lookslike.c +++ src/lookslike.c @@ -135,31 +135,26 @@ } /* ** Checks for proper UTF-8. It uses the method described in: ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences -** except for the "overlong form" of \u0000 which is not considered invalid -** here: Some languages like Java and Tcl use it. This function also -** considers valid the derivatives CESU-8 & WTF-8 (as described in the -** same wikipedia article referenced previously). For UTF-8 characters -** > 7f, the variable 'c2' not necessary means the previous character. -** It's number of higher 1-bits indicate the number of continuation bytes -** that are expected to be followed. E.g. when 'c2' has a value in the range -** 0xc0..0xdf it means that 'c' is expected to contain the last continuation -** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one -** more continuation byte is expected. +** except for the "overlong form" of \u0000 (Modified UTF-8) +** which is not considered invalid here: Some languages like +** Java and Tcl use it. This function also considers valid +** the derivatives CESU-8 & WTF-8 (as described in the same +** wikipedia article referenced previously). 
*/ /* definitions for various UTF-8 sequence lengths */ -#define US2A 0x80, 0x80 /* for lead byte 0xC0 */ -#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ -#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ -#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ -#define US4A 0x90, 0xBF /* for lead byte 0xF0 */ -#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ -#define US4C 0x80, 0x8F /* for lead byte 0xF4 */ -#define US0A 0xFF, 0x00 /* for any other lead byte */ +#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */ +#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ +#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */ +#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ +#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */ +#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ +#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */ +#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */ /* a table used for quick lookup of the definition that goes with a * particular lead byte */ static const unsigned char lb_tab[] = { US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, @@ -173,34 +168,57 @@ }; int invalid_utf8( const Blob *pContent ){ - const unsigned char *z = (unsigned char *) blob_buffer(pContent); - unsigned int n = blob_size(pContent); - unsigned char c, c2; - - if( n==0 ) return 0; /* Empty file -> OK */ - c = *z; - while( --n>0 ){ - c2 = c; - c = *++z; - if( c2>=0xC0 ){ - const unsigned char *def = &lb_tab[(2*c2)-0x180]; - if( (c<*def) || (c>*++def) ){ - return LOOK_INVALID; /* Invalid UTF-8 */ - } - if( c2>=0xe0 ){ - c = (c2<<1)|3; - }else{ - c = ' '; - } - }else if( c2>=0x80 ){ - return LOOK_INVALID; - } - } - return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. 
*/ + /* buffer pointer and size */ + const unsigned char *z = (unsigned char *)blob_buffer(pContent); + unsigned int n = blob_size(pContent); + + /* while we haven't checked all the bytes in the buffer */ + while( n>0 ){ + /* ascii is trivial */ + if( *z<0x80 ){ + ++z; + --n; + }else if( *z<0xC0 ){ + return LOOK_INVALID; + }else{ + /* get the definition for this lead byte */ + const unsigned char* def = &lb_tab[(3 * *z++)-0x240]; + unsigned char len; + + /* get the expected sequence length */ + len = *def; + /* if there aren't enough bytes left, return invalid */ + if( n<len ){ + return LOOK_INVALID; + } + /* we already know byte #0 is good, so look at the remaining bytes */ + if( (*z<*++def) || (*z++>*++def) ){ + /* if the byte is outside the allowed range for this definition, + * return invalid */ + return LOOK_INVALID; + } + if( len > 2 ){ + /* if the next byte is not between 0x80 and 0xBF, return invalid */ + if( (*z++&0xC0)!=0x80 ){ + return LOOK_INVALID; + } + if( len > 3 ){ + /* if the next byte is not between 0x80 and 0xBF, return invalid */ + if( (*z++&0xC0)!=0x80 ){ + return LOOK_INVALID; + } + } + } + /* advance to the next sequence */ + n -= len; + } + } + /* we made it all the way through the buffer so it's not invalid */ + return LOOK_NONE; } /* ** Define the type needed to represent a Unicode (UTF-16) character. */