Fossil

Check-in [6051c441]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Simplifications and constification
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | invalid_utf8_table
Files: files | file ages | folders
SHA1:6051c441edda95837da361f9bb764befb93954b0
User & Date: jan.nijtmans 2016-06-15 08:19:12
Context
2016-06-15
15:00
added a few comments check-in: 63313a5f user: sdr tags: invalid_utf8_table
08:19
Simplifications and constification check-in: 6051c441 user: jan.nijtmans tags: invalid_utf8_table
2016-06-14
18:08
merged from trunk check-in: 12675ab7 user: sdr tags: invalid_utf8_table
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lookslike.c.

142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
...
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** wikipedia article referenced previously).
*/

/*
** Descriptions of the multi-byte UTF-8 forms, one table per family of
** lead bytes.  Every table is laid out the same way:
**
**   [0]       length of the whole sequence in bytes, lead byte included
**   [1],[2]   lowest and highest lead byte covered by this table
**   [3]...    one inclusive (lowest, highest) pair for each trailing byte
*/
static const unsigned char us2a[] =
  { 2, 0xC0, 0xC0,  0x80, 0x80 };
static const unsigned char us2b[] =
  { 2, 0xC2, 0xDF,  0x80, 0xBF };
static const unsigned char us3a[] =
  { 3, 0xE0, 0xE0,  0xA0, 0xBF,  0x80, 0xBF };
static const unsigned char us3b[] =
  { 3, 0xE1, 0xEF,  0x80, 0xBF,  0x80, 0xBF };
static const unsigned char us4a[] =
  { 4, 0xF0, 0xF0,  0x90, 0xBF,  0x80, 0xBF,  0x80, 0xBF };
static const unsigned char us4b[] =
  { 4, 0xF1, 0xF3,  0x80, 0xBF,  0x80, 0xBF,  0x80, 0xBF };
static const unsigned char us4c[] =
  { 4, 0xF4, 0xF4,  0x80, 0x8F,  0x80, 0xBF,  0x80, 0xBF };

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char* lb_tab[] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
................................................................................
  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      unsigned char* def = lb_tab[(*z++)-0x80];
      unsigned char i, len;

      /* if the definition doesn't exist, return invalid */
      if( !def ) return LOOK_INVALID;
      /* get the expected sequence length */
      len = *def;
      /* if there aren't enough bytes left, return invalid */
      if( n<len ) return LOOK_INVALID;
      /* skip the length & lead byte range */
      def += 3;
      /* we already know byte #0 is good, so check the remaining bytes */
      for(i=1; i<len; ++i){
        /* if the byte is outside the allowed range for this definition,
         * return invalid */
        if( (*z<*def++) || (*z++>*def++) ){
          return LOOK_INVALID;
        }







|


|


|


|


|


|


|




|







 







|





|


<
<







142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
...
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214


215
216
217
218
219
220
221
** Java and Tcl use it. This function also considers valid
** the derivatives CESU-8 & WTF-8 (as described in the same
** wikipedia article referenced previously).
*/

/*
** Descriptions of the multi-byte UTF-8 forms.  Every table is laid out
** the same way:
**
**   [0]      length of the whole sequence in bytes, lead byte included
**   [1]...   one inclusive (lowest, highest) pair for each trailing byte
*/
static const unsigned char us2a[] =
  { 2,  0x80, 0x80 };
static const unsigned char us2b[] =
  { 2,  0x80, 0xBF };
static const unsigned char us3a[] =
  { 3,  0xA0, 0xBF,  0x80, 0xBF };
static const unsigned char us3b[] =
  { 3,  0x80, 0xBF,  0x80, 0xBF };
static const unsigned char us4a[] =
  { 4,  0x90, 0xBF,  0x80, 0xBF,  0x80, 0xBF };
static const unsigned char us4b[] =
  { 4,  0x80, 0xBF,  0x80, 0xBF,  0x80, 0xBF };
static const unsigned char us4c[] =
  { 4,  0x80, 0x8F,  0x80, 0xBF,  0x80, 0xBF };

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char* const lb_tab[] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
................................................................................
  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      const unsigned char* def = lb_tab[(*z++)-0x80];
      unsigned char i, len;

      /* if the definition doesn't exist, return invalid */
      if( !def ) return LOOK_INVALID;
      /* get the expected sequence length */
      len = *def++;
      /* if there aren't enough bytes left, return invalid */
      if( n<len ) return LOOK_INVALID;


      /* we already know byte #0 is good, so check the remaining bytes */
      for(i=1; i<len; ++i){
        /* if the byte is outside the allowed range for this definition,
         * return invalid */
        if( (*z<*def++) || (*z++>*def++) ){
          return LOOK_INVALID;
        }