Fossil

Check-in [4e86b06a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Generate warning when to-be-committed file contains invalid UTF-8
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | improve_commit_warning
Files: files | file ages | folders
SHA1: 4e86b06a9f03db12baffae8509741f5ebd8bcae9
User & Date: jan.nijtmans 2012-11-02 10:55:01
Context
2012-11-02
14:22
don't forget to compensate for the line-length check check-in: 9011fe12 user: jan.nijtmans tags: improve_commit_warning
10:55
Generate warning when to-be-committed file contains invalid UTF-8 check-in: 4e86b06a user: jan.nijtmans tags: improve_commit_warning
08:31
speedup mimetype_from_content() by using a 256 byte array.
Mark VT and Ctrl-Z as text bytes, not binary.
Decrease maximum UTF-16 line length to 2731
Check for FFFF in addition to 0, in UTF-16/binary detection.
check-in: d804902f user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/checkin.c.

893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return;
  fUnicode = starts_with_utf16_bom(p);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    Blob ans;
    char cReply;

    if( eType==-1 && fUnicode ){
      zWarning = "Unicode and CR/NL line endings";
    }else if( eType==-1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";
    }else if( eType==0 ){
      zWarning = "binary data";
    }else{
      zWarning = "Unicode";
    }
    file_relative_name(zFilename, &fname, 0);
    blob_zero(&ans);
    zMsg = mprintf(
         "%s contains %s.  commit anyhow (a=all/y/N)? ",







|











|
|







893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return;
  fUnicode = starts_with_utf16_bom(p);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType<0 || fUnicode ){
    const char *zWarning;
    Blob ans;
    char cReply;

    if( eType==-1 && fUnicode ){
      zWarning = "Unicode and CR/NL line endings";
    }else if( eType==-1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";
    }else if( eType==-2 ){
      zWarning = "invalid UTF-8 or ASCII";
    }else{
      zWarning = "Unicode";
    }
    file_relative_name(zFilename, &fname, 0);
    blob_zero(&ans);
    zMsg = mprintf(
         "%s contains %s.  commit anyhow (a=all/y/N)? ",

Changes to src/diff.c.

173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190



191
192

193
194
195
196

197
198
199
200
201
202

203























204
205
206

207
208
209
210
211
212
213
214
215
216




















217
218
219
220
221
222
223

/*
** Scan the content of a blob byte by byte and classify it.  The return
** value is one of:
**
**  (1) -- Text whose lines end in a bare line-feed.  The encoding is not
**         verified, so the text is not necessarily UTF-8.
**
**  (0) -- Binary.  Either a NUL byte was found or some line is longer
**         than LENGTH_MASK bytes.  UTF-16 is not understood here, so
**         UTF-16 text can be misreported as binary.
**
** (-1) -- Text in which at least one line ends in a carriage-return,
**         line-feed pair.  The encoding is not verified, so the text is
**         not necessarily UTF-8.
*/
int looks_like_utf8(const Blob *pContent){
  const char *zData = blob_buffer(pContent);
  unsigned int nByte = blob_size(pContent);
  unsigned int i;          /* Index of the byte being examined */
  int nLine;               /* Bytes seen so far on the current line */
  int eResult = 1;         /* Assume text without CR/NL until shown otherwise */

  if( nByte==0 ) return eResult;      /* Empty content counts as text */
  if( zData[0]==0 ) return 0;         /* Leading NUL -> binary */

  /* A leading line-feed forms an empty line; anything else starts one. */
  nLine = zData[0]=='\n' ? 0 : 1;

  for(i=1; i<nByte; i++){
    int ch = zData[i];
    nLine++;
    if( ch==0 ) return 0;             /* Embedded NUL -> binary */
    if( ch!='\n' ) continue;

    /* End of a line: note CR/NL endings and enforce the length limit.
    ** The line-feed itself is included in the count. */
    if( zData[i-1]=='\r' ){
      eResult = -1;
    }
    if( nLine>LENGTH_MASK ) return 0; /* Very long line -> binary */
    nLine = 0;
  }

  /* The final line may lack a terminating line-feed; check it as well. */
  return nLine>LENGTH_MASK ? 0 : eResult;
}







|
<







|
|
>
>
>


>

|

|
>






>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



>
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







173
174
175
176
177
178
179
180

181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272

/*
** Return the number of continuation bytes (1, 2 or 3) that must follow
** the byte c when c is the lead byte of a multi-byte UTF-8 sequence.
** Return 0 when c cannot start such a sequence: either it is a stray
** continuation byte (0x80..0xBF) or it lies in 0xF8..0xFF.  Callers are
** expected to pass only bytes >= 0x80.
*/
static unsigned int utf8_trail_count(unsigned char c){
  if( c<0xC0 ) return 0;
  if( c<0xE0 ) return 1;
  if( c<0xF0 ) return 2;
  if( c<0xF8 ) return 3;
  return 0;
}

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or a line longer than LENGTH_MASK bytes.  Since
**         this function does not understand UTF-16, it may falsely
**         consider UTF-16 text to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs.
**
** (-2) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters or carriage-return,
**         line-feed pairs; however, the encoding is not UTF-8 or ASCII.
**
** Once an invalid byte sequence has been seen the result stays -2 (a
** later CR/NL does not replace it), but a NUL byte or an over-long line
** found afterwards still yields 0.
**
** The UTF-8 check is structural only: it verifies that every lead byte
** is followed by the right number of continuation bytes.  Over-long
** encodings and code points beyond U+10FFFF are not rejected.
*/
int looks_like_utf8(const Blob *pContent){
  const unsigned char *z = (const unsigned char *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  unsigned int i;          /* Index of the byte being examined */
  unsigned int j = 0;      /* Number of bytes in the current line */
  int result = 1;          /* Assume UTF-8 text with no CR/NL */

  for(i=0; i<n; i++){
    unsigned char c = z[i];
    ++j;
    if( c<0x80 ){
      if( c==0 ) return 0;  /* Zero byte in a file -> binary */
      if( c=='\n' ){
        if( i>0 && z[i-1]=='\r' && result>0 ){
          result = -1;  /* Contains CR/NL, continue */
        }
        if( j>LENGTH_MASK ){
          return 0;  /* Very long line -> binary */
        }
        j = 0;
      }
    }else{
      unsigned int nTrail = utf8_trail_count(c);
      unsigned int k;

      /* Not a lead byte, or the sequence would run past the end of the
      ** content: flag it and resume scanning at the very next byte. */
      if( nTrail==0 || n-i<=nTrail ){
        result = -2;  /* Invalid UTF-8, continue */
        continue;
      }
      for(k=1; k<=nTrail && (z[i+k]&0xC0)==0x80; k++){}
      if( k<=nTrail ){
        result = -2;  /* Bad continuation byte, continue */
        continue;
      }

      /* Valid sequence: step over the continuation bytes.  They still
      ** occupy space on the line, so they must be included in j or the
      ** line-length check would under-count non-ASCII text. */
      i += nTrail;
      j += nTrail;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}