Fossil

Check-in [7c08a685]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:micro-optimizing invalid_utf8 function, should be as fast as possible now
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:7c08a68503a45da327ac55ca9252d9a71b43ff17
User & Date: jan.nijtmans 2016-06-26 17:05:45
Context
2016-06-28
09:10
Unnecessary type='text/javascript on <script> tag. check-in: 5cdaeb0d user: jan.nijtmans tags: trunk
2016-06-26
17:05
micro-optimizing invalid_utf8 function, should be as fast as possible now check-in: 7c08a685 user: jan.nijtmans tags: trunk
17:04
Improve comments Closed-Leaf check-in: 8bdd0abc user: jan.nijtmans tags: invalid_utf8_improvements
2016-06-24
03:36
If the FOSSIL_SECURITY_LEVEL environment variable is 2 or more, then present a simple substitution matrix when entering passwords, as a defense against key loggers. For FOSSIL_SECURITY_LEVEL of 1 or more, do not remember the remote-url password. check-in: e1034c4c user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lookslike.c.

48
49
50
51
52
53
54































55
56
57
58
59
60
61
...
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188


189

190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
#endif /* INTERFACE */

































/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  The return value
** is a combination of one or more of the LOOK_XXX flags (see above):
**
** !LOOK_BINARY -- The content appears to consist entirely of text; however,
................................................................................
  }
  return flags;
}

/*
** Checks for proper UTF-8. It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 which is not considered invalid
** here: Some languages like Java and Tcl use it. This function also
** considers valid the derivatives CESU-8 & WTF-8 (as described in the
** same wikipedia article referenced previously). For UTF-8 characters
** > 7f, the variable 'c2' not necessary means the previous character.
** It's number of higher 1-bits indicate the number of continuation bytes
** that are expected to be followed. E.g. when 'c2' has a value in the range
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
** more continuation byte is expected.
*/

/* definitions for various UTF-8 sequence lengths */
#define US2A  0x80, 0x80 /* for lead byte 0xC0 */
#define US2B  0x80, 0xBF /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0xBF /* for lead byte 0xE0 */
#define US3B  0x80, 0xBF /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0xBF /* for lead byte 0xF0 */
#define US4B  0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x8F /* for lead byte 0xF4 */
#define US0A  0xFF, 0x00 /* for any other lead byte */

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char lb_tab[] = {
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};

int invalid_utf8(
  const Blob *pContent
){
  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  unsigned char c, c2;

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    c2 = c;
    c = *++z;
    if( c2>=0xC0 ){
      const unsigned char *def = &lb_tab[(2*c2)-0x180];


      if( (c<*def) || (c>*++def) ){

        return LOOK_INVALID; /* Invalid UTF-8 */
      }
      if( c2>=0xe0 ){
        c = (c2<<1)|3;
      }else{
        c = ' ';
      }
    }else if( c2>=0x80 ){
      return LOOK_INVALID;
    }
  }
  return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|
|
|
|
|
|
|
|
|
|


<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<





|




|
<
<
|
>
>
|
>


<
|
|
|
|
<
<
|
<
|







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
...
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182























183
184
185
186
187
188
189
190
191
192
193


194
195
196
197
198
199
200

201
202
203
204


205

206
207
208
209
210
211
212
213
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
#endif /* INTERFACE */

/* definitions for various UTF-8 sequence lengths, encoded as start value
 * and size of each valid range belonging to some lead byte*/
#define US2A  0x80, 0x01 /* for lead byte 0xC0 */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30 /* for lead byte 0xF0 */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10 /* for lead byte 0xF4 */
#define US0A  0x00, 0x00 /* for any other lead byte */

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char lb_tab[] = {
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  The return value
** is a combination of one or more of the LOOK_XXX flags (see above):
**
** !LOOK_BINARY -- The content appears to consist entirely of text; however,
................................................................................
  }
  return flags;
}

/*
** Checks for proper UTF-8. It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" of \u0000 which is not considered
** invalid here: Some languages like Java and Tcl use it. This function
** also considers valid the derivatives CESU-8 & WTF-8 (as described in
** the same wikipedia article referenced previously). For UTF-8 characters
** > 0x7f, the variable 'c' not necessary means the real lead byte.
** It's number of higher 1-bits indicate the number of continuation
** bytes that are expected to be followed. E.g. when 'c' has a value
** in the range 0xc0..0xdf it means that after 'c' a single continuation
** byte is expected. A value 0xe0..0xef means that after 'c' two more
** continuation bytes are expected.
*/
























int invalid_utf8(
  const Blob *pContent
){
  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  unsigned char c; /* lead byte to be handled. */

  if( n==0 ) return 0;  /* Empty file -> OK */
  c = *z;
  while( --n>0 ){
    if( c>=0x80 ){


      const unsigned char *def; /* pointer to range table*/

      c <<= 1; /* multiply by 2 and get rid of highest bit */
      def = &lb_tab[c]; /* search fb's valid range in table */
      if( (unsigned int)(*++z-def[0])>=def[1] ){
        return LOOK_INVALID; /* Invalid UTF-8 */
      }

      c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
    } else {
      c = *++z;
    }


  }

  return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32