Fossil

Check-in [6a59dbbb]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Further invalid_utf8() improvement: Save one indirection and a check, and make the table size even smaller.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 6a59dbbb99b12982b4c3adea7d2a8d002600b293
User & Date: jan.nijtmans 2016-06-16 09:44:12
Context
2016-06-16
11:39
Minor further speed-up: Only increment pointer if really needed. check-in: 5be2e9cf user: jan.nijtmans tags: trunk
09:44
merge-mark check-in: c22ec007 user: jan.nijtmans tags: invalid_utf8_table
09:44
Further invalid_utf8() improvement: Save one indirection and a check, and make the table size even smaller. check-in: 6a59dbbb user: jan.nijtmans tags: trunk
09:13
Use faster table-based approach when checking for invalid utf-8, instead of complex bit-operations. check-in: 60349a66 user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lookslike.c.

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
...
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
** that are expected to be followed. E.g. when 'c2' has a value in the range
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
** more continuation byte is expected.
*/

/* Definitions for the various UTF-8 sequence lengths.
 *
 * Each array holds exactly two bytes: the smallest and the largest value
 * that invalid_utf8() accepts for the byte immediately following the given
 * lead byte. A following byte outside that closed range is reported as
 * invalid UTF-8. */
static const unsigned char us2a[] = { /* for lead byte 0xC0 */
  0x80, 0x80 /* only the pair 0xC0 0x80 is accepted */
};
static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
  0x80, 0xBF /* any continuation byte */
};
static const unsigned char us3a[] = { /* for lead byte 0xE0 */
  0xA0, 0xBF /* 0x80-0x9F would be an overlong encoding */
};
static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
  0x80, 0xBF /* any continuation byte */
};
static const unsigned char us4a[] = { /* for lead byte 0xF0 */
  0x90, 0xBF /* 0x80-0x8F would be an overlong encoding */
};
static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
  0x80, 0xBF /* any continuation byte */
};
static const unsigned char us4c[] = { /* for lead byte 0xF4 */
  0x80, 0x8F /* 0x90 and above would exceed U+10FFFF */
};

/* A table used for quick lookup of the definition that goes with a
 * particular lead byte.
 *
 * The table has 128 entries, eight per row, covering the byte values
 * 0x80..0xFF; invalid_utf8() indexes it with (byte - 0x80). A NULL entry
 * means that no byte at all may follow, so the sequence is rejected.
 * The order of the entries must therefore not be changed. */
static const unsigned char* const lb_tab[] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x80..0x87 */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x88..0x8F */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x90..0x97 */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x98..0x9F */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0xA0..0xA7 */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0xA8..0xAF */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0xB0..0xB7 */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0xB8..0xBF */
  us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, /* 0xC0..0xC7 (0xC1 is never valid) */
  us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, /* 0xC8..0xCF */
  us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, /* 0xD0..0xD7 */
  us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, /* 0xD8..0xDF */
  us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, /* 0xE0..0xE7 */
  us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, /* 0xE8..0xEF */
  us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, /* 0xF0..0xF7 (0xF5 and up are never valid) */
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL  /* 0xF8..0xFF */
};

int invalid_utf8(
  const Blob *pContent
){
  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
................................................................................

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    c2 = c;
    c = *++z;
    if( c2>=0x80 ){
      const unsigned char *def = lb_tab[(c2)-0x80];
      if( !def || (c<*def++) || (c>*def++) ){
        return LOOK_INVALID; /* Invalid UTF-8 */
      }
      if( c2>=0xe0 ){
        c = (c2<<1)|3;
      }else{
        c = ' ';
      }







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|
|







146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180













181
182
183
184
185
186
187
...
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
** that are expected to be followed. E.g. when 'c2' has a value in the range
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
** more continuation byte is expected.
*/

/* Definitions for the various UTF-8 sequence lengths.
 *
 * Each macro expands to TWO comma-separated values: the smallest and the
 * largest byte accepted directly after the given lead byte. Because of the
 * embedded comma the macros are only meaningful inside the initializer of
 * lb_tab[], where every use contributes two consecutive table entries.
 *
 * US0A describes an empty range (lower bound 0xFF, upper bound 0x00): every
 * byte is either below the lower bound or above the upper bound, so any
 * following byte is rejected without needing a separate "no definition"
 * check. */
#define US2A  0x80, 0x80 /* for lead byte 0xC0 */
#define US2B  0x80, 0xBF /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0xBF /* for lead byte 0xE0 */
#define US3B  0x80, 0xBF /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0xBF /* for lead byte 0xF0 */
#define US4B  0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x8F /* for lead byte 0xF4 */
#define US0A  0xFF, 0x00 /* for any other lead byte */

/* A table used for quick lookup of the definition that goes with a
 * particular lead byte.
 *
 * The table is a flat array of (lower bound, upper bound) byte pairs, one
 * pair per byte value 0x80..0xFF, eight pairs per row (256 bytes in total).
 * invalid_utf8() finds the pair for byte value b at index (2*b - 0x100),
 * so the order of the entries must not be changed. */
static const unsigned char lb_tab[] = {
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x80..0x87 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x88..0x8F */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x90..0x97 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0x98..0x9F */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xA0..0xA7 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xA8..0xAF */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xB0..0xB7 */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, /* 0xB8..0xBF */
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xC0..0xC7 (0xC1 is never valid) */
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xC8..0xCF */
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xD0..0xD7 */
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, /* 0xD8..0xDF */
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, /* 0xE0..0xE7 */
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, /* 0xE8..0xEF */
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, /* 0xF0..0xF7 (0xF5 and up are never valid) */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A  /* 0xF8..0xFF */













};

int invalid_utf8(
  const Blob *pContent
){
  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
................................................................................

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    c2 = c;
    c = *++z;
    if( c2>=0x80 ){
      const unsigned char *def = &lb_tab[(2*c2)-0x100];
      if( (c<*def++) || (c>*def++) ){
        return LOOK_INVALID; /* Invalid UTF-8 */
      }
      if( c2>=0xe0 ){
        c = (c2<<1)|3;
      }else{
        c = ' ';
      }