Fossil

Check-in [0c7c6144]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge commit warning and file content type detection changes to trunk.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 0c7c61447f969839ca86f41c032ee77f206264a2
User & Date: mistachkin 2012-11-02 02:27:58
References
2013-03-05
16:14 Ticket [ed23ef59] Fossil ignoring "binary-glob" setting status still Open with 3 other changes artifact: 1e9f722a user: jan.nijtmans
Context
2012-11-02
02:36
Merge the "moderation" branch into trunk. This adds the ability to have an approval process for edits to Wiki and Tickets, including creating new Wiki and Tickets and adding attachments. Probably there are still some problems, but things are working well enough for trunk. check-in: ba418ee1 user: drh tags: trunk
02:27
Merge commit warning and file content type detection changes to trunk. check-in: 0c7c6144 user: mistachkin tags: trunk
2012-11-01
20:19
Add detection of binary data with no leading UTF-16 byte-order-mark. Closed-Leaf check-in: 7d3a06b8 user: mistachkin tags: commitWarningV2
10:20
Restore Style fix, which got lost by [618258421767778c] check-in: ef6c243e user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/checkin.c.

884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911


912
913
914
915
916
917
918

/*
** Issue a warning and give the user an opportunity to abandon out
** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
** is seen in a text file.
*/
static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
  int eType;              /* return value of looks_like_text() */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return;
  eType = looks_like_text(p);
  fUnicode = starts_with_utf16_bom(p);
  if( eType==-1 || fUnicode ){
    const char *zWarning;
    Blob ans;
    char cReply;

    if( eType==-1 && fUnicode ){
      zWarning = "Unicode and CR/NL line endings";
    }else if( eType==-1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";


    }else{
      zWarning = "Unicode";
    }
    file_relative_name(zFilename, &fname, 0);
    blob_zero(&ans);
    zMsg = mprintf(
         "%s contains %s.  commit anyhow (a=all/y/N)? ",







|






|
|
|











>
>







884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920

/*
** Issue a warning and give the user an opportunity to abandon out
** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
** is seen in a text file.
*/
static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
  int eType;              /* return value of looks_like_utf8/utf16() */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return;
  fUnicode = starts_with_utf16_bom(p);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    Blob ans;
    char cReply;

    if( eType==-1 && fUnicode ){
      zWarning = "Unicode and CR/NL line endings";
    }else if( eType==-1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";
    }else if( eType==0 ){
      zWarning = "binary data";
    }else{
      zWarning = "Unicode";
    }
    file_relative_name(zFilename, &fname, 0);
    blob_zero(&ans);
    zMsg = mprintf(
         "%s contains %s.  commit anyhow (a=all/y/N)? ",

Changes to src/diff.c.

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
...
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219




































































220
221
222
223
224
225
226
*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files\n"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file\n"

#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file.  (8192)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)

/*
** Information about each line of a file being diffed.
**
................................................................................
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL (\000) characters or an extremely long line.  Since this
**         function does not understand UTF-16, it may falsely consider
**         UTF-16 text to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
*/
int looks_like_text(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* \000 byte in a file -> binary */
  j = (c!='\n');
  while( --n>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* \000 byte in a file -> binary */
    if( c=='\n' ){
      if( z[-1]=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */




































































  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns non-zero if the blob starts with a UTF-16le or
** UTF-16be byte-order-mark (BOM).







|



|







 







|
|
|






|



|





|



|












>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
...
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files\n"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file\n"

#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (8192)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)

/*
** Information about each line of a file being diffed.
**
................................................................................
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
  j = (c!='\n');
  while( --n>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
    if( c=='\n' ){
      if( z[-1]=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-1)
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
** encodings.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-16.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**
*/
int looks_like_utf16(const Blob *pContent){
  const wchar_t *z = (wchar_t *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-16 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>UTF16_LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns non-zero if the blob starts with a UTF-16le or
** UTF-16be byte-order-mark (BOM).