Fossil

Changes On Branch invalid_utf8_table
Login

Changes On Branch invalid_utf8_table

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch invalid_utf8_table Excluding Merge-Ins

This is equivalent to a diff from 69328517 to 8a877a7b

2016-06-17
07:24
Remove a function which isn't use anywhere ... (check-in: e2a280fc user: jan.nijtmans tags: trunk)
00:04
merged from trunk ... (Closed-Leaf check-in: 8a877a7b user: sdr tags: invalid_utf8_table)
2016-06-16
22:14
shrunk size of lead byte table for invalid_utf8, and took a shortcut to invalidate lead bytes between 0x80 & 0xBF inclusive ... (check-in: 69328517 user: sdr tags: trunk)
17:01
more optimizations (all lead bytes between 0x80 & 0xBF are invalid, so use simple check for those, and also can shrink the invalid_utf8 lead byte table even more) ... (check-in: 6eb9a30c user: sdr tags: invalid_utf8_table)
11:39
Minor further speed-up: Only increment pointer if really needed. ... (check-in: 5be2e9cf user: jan.nijtmans tags: trunk)

Changes to src/lookslike.c.

133
134
135
136
137
138
139
140

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177

178
179
180
181
182
183
184

185
186

187




188








189


190
191
192

193
194
195

196


197
198
199
200






201
202
203
204
205
206
207
208
  }
  return flags;
}

/*
** Checks for proper UTF-8. It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 which is not considered invalid

** here: Some languages like Java and Tcl use it. This function also
** considers valid the derivatives CESU-8 & WTF-8 (as described in the
** same wikipedia article referenced previously). For UTF-8 characters
** > 7f, the variable 'c2' does not necessarily mean the previous character.
** Its number of higher 1-bits indicates the number of continuation bytes
** that are expected to be followed. E.g. when 'c2' has a value in the range
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
** more continuation byte is expected.
*/

/* definitions for various UTF-8 sequence lengths */
#define US2A  0x80, 0x80 /* for lead byte 0xC0 */
#define US2B  0x80, 0xBF /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0xBF /* for lead byte 0xE0 */
#define US3B  0x80, 0xBF /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0xBF /* for lead byte 0xF0 */
#define US4B  0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x8F /* for lead byte 0xF4 */
#define US0A  0xFF, 0x00 /* for any other lead byte */

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char lb_tab[] = {
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};

int invalid_utf8(
  const Blob *pContent
){
  /* Scan position and number of bytes not yet consumed. */
  const unsigned char *zCur = (unsigned char *) blob_buffer(pContent);
  unsigned int nLeft = blob_size(pContent);
  /* 'cur' is the byte under inspection; 'prev' is the (possibly
  ** synthetic) lead byte that decides which values 'cur' may take. */
  unsigned char cur, prev;

  if( nLeft==0 ) return 0;  /* Empty file -> OK */
  cur = zCur[0];
  for(nLeft--; nLeft>0; nLeft--){
    const unsigned char *range;

    prev = cur;
    cur = *++zCur;

    /* Plain ASCII imposes no constraint on the byte that follows. */
    if( prev<0x80 ) continue;

    /* 0x80..0xBF can never start a sequence. */
    if( prev<0xC0 ) return LOOK_INVALID;

    /* Look up the {min, max} pair for this lead byte and require the
    ** current byte to fall inside it. */
    range = &lb_tab[(2*prev)-0x180];
    if( cur<range[0] || cur>range[1] ){
      return LOOK_INVALID; /* Invalid UTF-8 */
    }

    /* Replace the byte just accepted by a stand-in for the next round.
    ** For 3- and 4-byte leads, shifting left and setting the two low bits
    ** (truncated to 8 bits) yields a lead byte of a sequence that is one
    ** byte shorter, so the remaining continuation bytes are still
    ** demanded. A completed 2-byte sequence is replaced by a harmless
    ** ASCII space. */
    cur = ( prev>=0xe0 ) ? (unsigned char)((prev<<1)|3) : ' ';
  }

  /* A pending lead byte (real or stand-in) at the end means truncation. */
  return (cur>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32







|
>
|
|
|
<
<
<
<
<
<



|
|
|
|
|
|
|
|

















>
|

<

|
<
|
>
|
|
>
|
>
>
>
>
|
>
>
>
>
>
>
>
>
|
>
>
|

|
>
|
<
<
>
|
>
>
|
|
|
|
>
>
>
>
>
>
|







133
134
135
136
137
138
139
140
141
142
143
144






145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

176
177

178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204


205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
  }
  return flags;
}

/*
** Checks for proper UTF-8. It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 (Modified UTF-8)
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** wikipedia article referenced previously).






*/

/* definitions for various UTF-8 sequence lengths */
#define US2A  2, 0x80, 0x80 /* for lead byte 0xC0 */
#define US2B  2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
#define US3A  3, 0xA0, 0xBF /* for lead byte 0xE0 */
#define US3B  3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
#define US4A  4, 0x90, 0xBF /* for lead byte 0xF0 */
#define US4B  4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
#define US4C  4, 0x80, 0x8F /* for lead byte 0xF4 */
#define US0A  0xFF, 0xFF, 0x00 /* for any other lead byte */

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char lb_tab[] = {
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};

int invalid_utf8(
  const Blob *pContent
){
  /* Start of the sequence being examined and bytes left from there. */
  const unsigned char *zIn = (unsigned char *)blob_buffer(pContent);
  unsigned int nLeft = blob_size(pContent);

  while( nLeft>0 ){
    unsigned char lead = zIn[0];
    const unsigned char *entry;
    unsigned char seqLen;
    unsigned int i;

    /* A 7-bit byte is a complete character on its own. */
    if( lead<0x80 ){
      zIn++;
      nLeft--;
      continue;
    }

    /* 0x80..0xBF is only legal inside a sequence, never at its start. */
    if( lead<0xC0 ){
      return LOOK_INVALID;
    }

    /* Table entry layout: {sequence length, min 2nd byte, max 2nd byte}. */
    entry = &lb_tab[(3*lead)-0x240];
    seqLen = entry[0];

    /* The whole sequence must fit in what is left of the buffer; this
    ** also guards every indexed read below. */
    if( nLeft<seqLen ){
      return LOOK_INVALID;
    }

    /* The second byte has a lead-specific range. */
    if( zIn[1]<entry[1] || zIn[1]>entry[2] ){
      return LOOK_INVALID;
    }

    /* Any bytes after the second must be plain continuation bytes,
    ** i.e. in the range 0x80..0xBF. */
    for(i=2; i<seqLen; i++){
      if( (zIn[i]&0xC0)!=0x80 ){
        return LOOK_INVALID;
      }
    }

    /* Step over the sequence that was just accepted. */
    zIn += seqLen;
    nLeft -= seqLen;
  }

  /* Every byte was consumed without finding a malformed sequence. */
  return LOOK_NONE;
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32