Index: src/lookslike.c
==================================================================
--- src/lookslike.c
+++ src/lookslike.c
@@ -135,31 +135,26 @@
 }
 
 /*
 ** Checks for proper UTF-8. It uses the method described in:
 **   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
-** except for the "overlong form" of \u0000 which is not considered invalid
-** here: Some languages like Java and Tcl use it. This function also
-** considers valid the derivatives CESU-8 & WTF-8 (as described in the
-** same wikipedia article referenced previously). For UTF-8 characters
-** > 7f, the variable 'c2' not necessary means the previous character.
-** It's number of higher 1-bits indicate the number of continuation bytes
-** that are expected to be followed. E.g. when 'c2' has a value in the range
-** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
-** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
-** more continuation byte is expected.
+** except for the "overlong form" of \u0000 (C0 80, as in Modified
+** UTF-8), which is not considered invalid here because some languages
+** such as Java and Tcl use it. This function also accepts the
+** derivatives CESU-8 and WTF-8 (as described in the same Wikipedia
+** article referenced above).
 */
 
 /* definitions for various UTF-8 sequence lengths */
-#define US2A  0x80, 0x80 /* for lead byte 0xC0 */
-#define US2B  0x80, 0xBF /* for lead bytes 0xC2-0xDF */
-#define US3A  0xA0, 0xBF /* for lead byte 0xE0 */
-#define US3B  0x80, 0xBF /* for lead bytes 0xE1-0xEF */
-#define US4A  0x90, 0xBF /* for lead byte 0xF0 */
-#define US4B  0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
-#define US4C  0x80, 0x8F /* for lead byte 0xF4 */
-#define US0A  0xFF, 0x00 /* for any other lead byte */
+#define US2A  2, 0x80, 0x80 /* length, min/max of byte 2; lead byte 0xC0 */
+#define US2B  2, 0x80, 0xBF /* lead bytes 0xC2-0xDF: any continuation byte */
+#define US3A  3, 0xA0, 0xBF /* lead byte 0xE0: byte 2 must be >= 0xA0 */
+#define US3B  3, 0x80, 0xBF /* lead bytes 0xE1-0xEF: any continuation byte */
+#define US4A  4, 0x90, 0xBF /* lead byte 0xF0: byte 2 must be >= 0x90 */
+#define US4B  4, 0x80, 0xBF /* lead bytes 0xF1-0xF3: any continuation byte */
+#define US4C  4, 0x80, 0x8F /* lead byte 0xF4: byte 2 must be <= 0x8F */
+#define US0A  0xFF, 0xFF, 0x00 /* other lead bytes: min>max, never matches */
 
 /* a table used for quick lookup of the definition that goes with a
  * particular lead byte */
 static const unsigned char lb_tab[] = {
   US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
@@ -173,34 +168,57 @@
 };
 
 int invalid_utf8(
   const Blob *pContent
 ){
-  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
-  unsigned int n = blob_size(pContent);
-  unsigned char c, c2;
-
-  if( n==0 ) return 0;  /* Empty file -> OK */
-  c = *z;
-  while( --n>0 ){
-    c2 = c;
-    c = *++z;
-    if( c2>=0xC0 ){
-      const unsigned char *def = &lb_tab[(2*c2)-0x180];
-      if( (c<*def) || (c>*++def) ){
-        return LOOK_INVALID; /* Invalid UTF-8 */
-      }
-      if( c2>=0xe0 ){
-        c = (c2<<1)|3;
-      }else{
-        c = ' ';
-      }
-    }else if( c2>=0x80 ){
-      return LOOK_INVALID;
-    }
-  }
-  return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
+  /* z walks the blob's bytes; n counts the bytes not yet validated */
+  const unsigned char *z = (unsigned char *)blob_buffer(pContent);
+  unsigned int n = blob_size(pContent);
+
+  /* validate one ASCII byte or one multi-byte sequence per iteration */
+  while( n>0 ){
+    /* bytes below 0x80 are single-byte ASCII: always accepted */
+    if( *z<0x80 ){
+      ++z;
+      --n;
+    }else if( *z<0xC0 ){
+      return LOOK_INVALID; /* 0x80..0xBF: continuation byte with no lead */
+    }else{
+      /* lb_tab is read in 3-byte entries from lead byte 0xC0 (0x240==3*0xC0) */
+      const unsigned char* def = &lb_tab[(3 * *z++)-0x240];
+      unsigned char len;
+
+      /* first entry: total sequence length, lead byte included */
+      len = *def;
+      /* n still counts the lead byte, so n<len means a truncated sequence */
+      if( n<len ) {
+        return LOOK_INVALID;
+      }
+      /* second byte must lie in [min,max] given by the next two entries */
+      if( (*z<*++def) || (*z++>*++def) ){
+        /* out of range for this lead byte; the US0A range is empty, so an
+         * unsupported lead byte that passed the length check fails here */
+        return LOOK_INVALID;
+      }
+      if( len > 2 ){
+        /* third byte must be a plain continuation byte (0x80..0xBF) */
+        if( (*z++&0xC0)!=0x80 ){
+           return LOOK_INVALID;
+        }
+        if( len > 3 ){
+          /* fourth byte must be a plain continuation byte (0x80..0xBF) */
+          if( (*z++&0xC0)!=0x80 ){
+            return LOOK_INVALID;
+          }
+        }
+      }
+      /* z is already past the sequence; deduct its len bytes from n */
+      n -= len;
+    }
+  }
+  /* every byte belonged to an accepted sequence, so nothing to flag */
+  return LOOK_NONE;
 }
 
 /*
 ** Define the type needed to represent a Unicode (UTF-16) character.
 */