Fossil

Check-in [60349a66]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Use faster table-based approach when checking for invalid utf-8, instead of complex bit-operations.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 60349a6617490676a2ea1a31fdce56decb641dc6
User & Date: jan.nijtmans 2016-06-16 09:13:25
Context
2016-06-16
09:44
Further invalid_utf8() improvement: Save one indirection and a check, and make the table size even smaller. check-in: 6a59dbbb user: jan.nijtmans tags: trunk
09:14
Merge trunk check-in: a3328c00 user: jan.nijtmans tags: invalid_utf8_table
09:13
Use faster table-based approach when checking for invalid utf-8, instead of complex bit-operations. check-in: 60349a66 user: jan.nijtmans tags: trunk
2016-06-15
08:38
More consistency in using <br /> and <hr /> HTML tags check-in: bd559ff0 user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lookslike.c.

130
131
132
133
134
135
136
137
138
139
140
141
142


143
144
145
146
147
148
149
150













































151

152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
...
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
  }
  if( j>LENGTH_MASK ){
    flags |= LOOK_LONG;  /* Very long line -> binary */
  }
  return flags;
}


/*
** Checks for proper UTF-8. It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 which is not considered invalid
** here: Some languages like Java and Tcl use it. For UTF-8 characters


** > 7f, the variable 'c2' does not necessarily hold the previous character.
** Its number of leading 1-bits indicates the number of continuation bytes
** that are expected to follow. E.g. when 'c2' has a value in the range
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
** more continuation byte is expected.
*/














































/*
** Walk the content of pContent one byte pair (c2, c) at a time. Returns
** LOOK_INVALID as soon as a pair is rejected, or when the content ends in
** the middle of a multi-byte sequence; returns 0 otherwise.
*/
int invalid_utf8(const Blob *pContent){

  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  unsigned char c, c2;

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  /* Each pass looks at one pair: 'c2' is the byte (or pseudo lead byte)
  ** carried over from the previous pass, 'c' is the next byte read. */
  while( --n>0 ){
    c2 = c;
    c = *++z;
    if( c2>=0x80 ){
      /* Reject when 'c' is not of the form 10xxxxxx, or when 'c2' lies
      ** outside 0xc2..0xf3. Two exceptions to the latter are accepted:
      ** 0xf4 followed by a byte below 0x90, and 0xc0 followed by 0x80. */
      if( ((c&0xc0)!=0x80) || (((c2<0xc2) || (c2>=0xf4)) &&
          (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80)))) ){
        return LOOK_INVALID; /* Invalid UTF-8 */
      }
      if( c2>=0xe0 ){
        /* Lead bytes 0xf0 and 0xe0 additionally require a minimum value
        ** for the byte that follows them (0x90 and 0xa0 respectively). */
        if ((c2==0xf0 && c<0x90)||(c2==0xe0 && c<0xa0) ){
          return LOOK_INVALID; /* Invalid UTF-8, too short */
        }
        /* More bytes must follow: replace 'c' by a pseudo lead byte. The
        ** shift drops one leading 1-bit of 'c2' (the result is truncated
        ** to 8 bits), so the next pass expects one byte fewer. The low
        ** bits are set by |3, so the result never equals one of the
        ** specially handled values 0xc0, 0xe0, 0xf0 or 0xf4. */
        c = (c2<<1)|3;
      }else{
        /* Sequence complete: continue as if an ASCII byte had been read. */
        c = ' ';
      }
    }
  }
  /* A remaining value >= 0x80 is either a raw non-ASCII last byte or a
  ** pseudo lead byte still waiting for more bytes. */
  return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
}


/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
................................................................................
      fUnicode = 0;
    }else{
      fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
    }
    if( fUnicode ){
      lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
    }else{
      lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
    }
  }
  fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
  fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
  fossil_print("Starts with UTF-16 BOM: %s\n",
               fUtf16?(bRevUtf16?"reversed":"yes"):"no");
  fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",







<




|
>
>








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>










|
|



<
<
<








<







 







|







130
131
132
133
134
135
136

137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213



214
215
216
217
218
219
220
221

222
223
224
225
226
227
228
...
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
  }
  if( j>LENGTH_MASK ){
    flags |= LOOK_LONG;  /* Very long line -> binary */
  }
  return flags;
}


/*
** Checks for proper UTF-8. It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 which is not considered invalid
** here: Some languages like Java and Tcl use it. This function also
** considers valid the derivatives CESU-8 & WTF-8 (as described in the
** same wikipedia article referenced previously). For UTF-8 characters
** > 7f, the variable 'c2' does not necessarily hold the previous character.
** Its number of leading 1-bits indicates the number of continuation bytes
** that are expected to follow. E.g. when 'c2' has a value in the range
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
** more continuation byte is expected.
*/

/*
** Range definitions: each array holds the lowest and the highest value
** accepted for the byte that directly follows a lead byte of the given
** class.
*/
static const unsigned char us2a[2] = { 0x80, 0x80 };  /* lead byte 0xC0 */
static const unsigned char us2b[2] = { 0x80, 0xBF };  /* lead bytes 0xC2-0xDF */
static const unsigned char us3a[2] = { 0xA0, 0xBF };  /* lead byte 0xE0 */
static const unsigned char us3b[2] = { 0x80, 0xBF };  /* lead bytes 0xE1-0xEF */
static const unsigned char us4a[2] = { 0x90, 0xBF };  /* lead byte 0xF0 */
static const unsigned char us4b[2] = { 0x80, 0xBF };  /* lead bytes 0xF1-0xF3 */
static const unsigned char us4c[2] = { 0x80, 0x8F };  /* lead byte 0xF4 */

/*
** Quick lookup of the range definition that goes with a lead byte. The
** table has one slot for each byte value 0x80..0xFF, stored at index
** (byte - 0x80). A NULL slot marks a byte that has no definition, i.e.
** one that is not accepted as a lead byte.
*/
static const unsigned char* const lb_tab[128] = {
  /* 0x80-0x87 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0x88-0x8F */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0x90-0x97 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0x98-0x9F */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xA0-0xA7 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xA8-0xAF */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xB0-0xB7 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xB8-0xBF */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xC0-0xC7 */ us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xC8-0xCF */ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xD0-0xD7 */ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xD8-0xDF */ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xE0-0xE7 */ us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
  /* 0xE8-0xEF */ us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
  /* 0xF0-0xF7 */ us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
  /* 0xF8-0xFF */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};

/*
** Scan the content of pContent and return LOOK_INVALID if it contains a
** byte pair that lb_tab does not allow, or if it ends in the middle of a
** multi-byte sequence. Return 0 otherwise (including for empty content).
*/
int invalid_utf8(
  const Blob *pContent
){
  const unsigned char *zIn = (unsigned char *) blob_buffer(pContent);
  unsigned int nByte = blob_size(pContent);
  unsigned int i;
  unsigned char c, c2;

  if( nByte==0 ){
    return 0;  /* Empty file -> OK */
  }
  c = zIn[0];
  for(i=1; i<nByte; i++){
    const unsigned char *range;  /* {lowest, highest} allowed value of c */

    c2 = c;
    c = zIn[i];
    if( c2<0x80 ){
      continue;  /* ASCII: imposes nothing on the byte that follows */
    }
    range = lb_tab[c2-0x80];
    if( !range || c<range[0] || c>range[1] ){
      return LOOK_INVALID; /* Invalid UTF-8 */
    }
    /* If more bytes must follow, carry a pseudo lead byte into the next
    ** pass: c2 with one leading 1-bit shifted out (truncated to 8 bits).
    ** Otherwise the sequence is complete; continue as if an ASCII byte
    ** had been read. */
    c = (c2>=0xe0) ? (unsigned char)((c2<<1)|3) : ' ';
  }
  if( c>=0x80 ){
    return LOOK_INVALID; /* Last byte must be ASCII. */
  }
  return 0;
}


/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
................................................................................
      fUnicode = 0;
    }else{
      fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
    }
    if( fUnicode ){
      lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
    }else{
      lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
    }
  }
  fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
  fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
  fossil_print("Starts with UTF-16 BOM: %s\n",
               fUtf16?(bRevUtf16?"reversed":"yes"):"no");
  fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",