Fossil

Check-in [ec7f6b2e]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:More optimizations, taken over from trunk.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | invalid_utf8_table
Files: files | file ages | folders
SHA1: ec7f6b2e71c5001416d37d2d5a88b63ab1b0da23
User & Date: jan.nijtmans 2016-06-16 12:14:57
Context
2016-06-16
17:01
more optimizations (all lead bytes between 0x80 & 0xBF are invalid, so use simple check for those, and also can shrink the invalid_utf8 lead byte table even more) check-in: 6eb9a30c user: sdr tags: invalid_utf8_table
12:14
More optimizations, taken over from trunk. check-in: ec7f6b2e user: jan.nijtmans tags: invalid_utf8_table
11:39
Minor further speed-up: Only increment pointer if really needed. check-in: 5be2e9cf user: jan.nijtmans tags: trunk
09:44
merge-mark check-in: c22ec007 user: jan.nijtmans tags: invalid_utf8_table
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lookslike.c.

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
...
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** wikipedia article referenced previously).
*/

/*
** Validation rules for multi-byte UTF-8 sequences.  Every rule holds
** three bytes: the expected sequence length, followed by the lowest
** and the highest value accepted for the byte that comes right after
** the lead byte.
*/
static const unsigned char us2a[] = { 2, 0x80, 0x80 }; /* lead 0xC0 */
static const unsigned char us2b[] = { 2, 0x80, 0xBF }; /* lead 0xC2-0xDF */
static const unsigned char us3a[] = { 3, 0xA0, 0xBF }; /* lead 0xE0 */
static const unsigned char us3b[] = { 3, 0x80, 0xBF }; /* lead 0xE1-0xEF */
static const unsigned char us4a[] = { 4, 0x90, 0xBF }; /* lead 0xF0 */
static const unsigned char us4b[] = { 4, 0x80, 0xBF }; /* lead 0xF1-0xF3 */
static const unsigned char us4c[] = { 4, 0x80, 0x8F }; /* lead 0xF4 */

/*
** Rule lookup for bytes 0x80-0xFF, indexed by (byte - 0x80).  Each row
** below covers eight consecutive byte values.  A NULL slot means the
** byte has no rule, i.e. it cannot start a sequence.
*/
static const unsigned char* const lb_tab[] = {
  /* 0x80-0x87 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0x88-0x8F */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0x90-0x97 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0x98-0x9F */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xA0-0xA7 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xA8-0xAF */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xB0-0xB7 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xB8-0xBF */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  /* 0xC0-0xC7 */ us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xC8-0xCF */ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xD0-0xD7 */ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xD8-0xDF */ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
  /* 0xE0-0xE7 */ us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
  /* 0xE8-0xEF */ us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
  /* 0xF0-0xF7 */ us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
  /* 0xF8-0xFF */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};

int invalid_utf8(
  const Blob *pContent
){
  /* buffer pointer and size */
  const unsigned char *z = (unsigned char *)blob_buffer(pContent);
................................................................................
  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      const unsigned char* def = lb_tab[(*z++)-0x80];
      unsigned char len;

      /* if the definition doesn't exist, return invalid */
      if( !def ) return LOOK_INVALID;
      /* get the expected sequence length */
      len = *def++;
      /* if there aren't enough bytes left, return invalid */
      if( n<len ) {
        return LOOK_INVALID;
      }
      /* we already know byte #0 is good, so check the remaining bytes */
      if( (*z<*def++) || (*z++>*def++) ){
        /* if the byte is outside the allowed range for this definition,
         * return invalid */
        return LOOK_INVALID;
      }
      if( len > 2 ){
        /* if the next byte is not between 0x80 and 0xBF, return invalid */
        if( (*z++&0xC0)!=0x80 ){







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|


<
<

|





|







141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175













176
177
178
179
180
181
182
...
186
187
188
189
190
191
192
193
194
195


196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
** which is not considered invalid here: Some languages like
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** wikipedia article referenced previously).
*/

/*
** Validation rules for multi-byte UTF-8 sequences.  Each macro expands
** to three consecutive table bytes: the expected sequence length, then
** the lowest and the highest value accepted for the byte that comes
** right after the lead byte.
**
** US0A is the entry for bytes that have no valid rule: its range has a
** minimum (0xFF) above its maximum (0x00), so no byte can fall inside it.
*/
#define US2A  2, 0x80, 0x80     /* lead 0xC0 */
#define US2B  2, 0x80, 0xBF     /* lead 0xC2-0xDF */
#define US3A  3, 0xA0, 0xBF     /* lead 0xE0 */
#define US3B  3, 0x80, 0xBF     /* lead 0xE1-0xEF */
#define US4A  4, 0x90, 0xBF     /* lead 0xF0 */
#define US4B  4, 0x80, 0xBF     /* lead 0xF1-0xF3 */
#define US4C  4, 0x80, 0x8F     /* lead 0xF4 */
#define US0A  0xFF, 0xFF, 0x00  /* every other byte in 0x80-0xFF */

/*
** Flat rule table for bytes 0x80-0xFF.  The three-byte rule for byte b
** starts at offset 3*(b - 0x80).  Each row below covers eight
** consecutive byte values.
*/
static const unsigned char lb_tab[] = {
  /* 0x80-0x87 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x88-0x8F */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x90-0x97 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x98-0x9F */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xA0-0xA7 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xA8-0xAF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xB0-0xB7 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xB8-0xBF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xC0-0xC7 */ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xC8-0xCF */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xD0-0xD7 */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xD8-0xDF */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xE0-0xE7 */ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xE8-0xEF */ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xF0-0xF7 */ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  /* 0xF8-0xFF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};

int invalid_utf8(
  const Blob *pContent
){
  /* buffer pointer and size */
  const unsigned char *z = (unsigned char *)blob_buffer(pContent);
................................................................................
  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      const unsigned char* def = &lb_tab[(3 * *z++)-0x180];
      unsigned char len;



      /* get the expected sequence length */
      len = *def;
      /* if there aren't enough bytes left, return invalid */
      if( n<len ) {
        return LOOK_INVALID;
      }
      /* we already know byte #0 is good, so check the remaining bytes */
      if( (*z<*++def) || (*z++>*++def) ){
        /* if the byte is outside the allowed range for this definition,
         * return invalid */
        return LOOK_INVALID;
      }
      if( len > 2 ){
        /* if the next byte is not between 0x80 and 0xBF, return invalid */
        if( (*z++&0xC0)!=0x80 ){