Fossil

Check-in [d804902f]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:speedup mimetype_from_content() by using a 256 byte array.
Mark VT and Ctrl-Z as text bytes, not binary.
Decrease maximum UTF-16 line length to 2731
Check for FFFF in addition to 0, in UTF-16/binary detection.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: d804902f2333e4198223063c27cbbc17ec81f5ac
User & Date: jan.nijtmans 2012-11-02 08:31:20
Context
2012-11-02
17:22
Adjustments to looks_like_utf16 to handle wchar_t being missing or not 2 bytes. check-in: 7d881d82 user: mistachkin tags: trunk
10:55
Generate warning when to-be-committed file contains invalid UTF-8 check-in: 4e86b06a user: jan.nijtmans tags: improve_commit_warning
08:31
speedup mimetype_from_content() by using a 256 byte array.
Mark VT and Ctrl-Z as text bytes, not binary.
Decrease maximum UTF-16 line length to 2731
Check for FFFF in addition to 0, in UTF-16/binary detection.
check-in: d804902f user: jan.nijtmans tags: trunk
03:30
Add the new moderation permissions to the list maintained by the JSON code. check-in: 1cc7e8ce user: mistachkin tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/diff.c.

219
220
221
222
223
224
225
226
227

228
229
230
231
232
233
234
235
236
237
238
239
240

241
242
243
244
245
246
247
...
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK

** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-1)
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
** encodings.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))


/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
................................................................................
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */







|
|
>


<
|









>







 







|







219
220
221
222
223
224
225
226
227
228
229
230

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
...
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (2731)
** The number of bytes represented by this value after conversion to
** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/

#define UTF16_LENGTH_MASK     (LENGTH_MASK/3)

/*
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
** encodings.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
#define UTF16_FFFF  ((wchar_t)-1)

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
................................................................................
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 || c==UTF16_FFFF ) return 0;  /* NUL/FFFF character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */

Changes to src/doc.c.

33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
..
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
** For any other binary type, return "unknown/unknown".
*/
const char *mimetype_from_content(Blob *pBlob){
  int i;
  int n;
  const unsigned char *x;

  static const char isBinary[] = {
     1, 1, 1, 1,  1, 1, 1, 1,    1, 0, 0, 1,  0, 0, 1, 1,
     1, 1, 1, 1,  1, 1, 1, 1,    1, 1, 1, 0,  1, 1, 1, 1,
  };

  /* A table of mimetypes based on file content prefixes
  */
  static const struct {
    const char *zPrefix;       /* The file prefix */
    int size;                  /* Length of the prefix */
................................................................................
    { "\377\330\377",            3, "image/jpeg" },
  };

  x = (const unsigned char*)blob_buffer(pBlob);
  n = blob_size(pBlob);
  for(i=0; i<n; i++){
    unsigned char c = x[i];
    if( c<=0x1f && isBinary[c] ){
      break;
    }
  }
  if( i>=n ){
    return 0;   /* Plain text */
  }
  for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){







|
|
|







 







|







33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
..
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
** For any other binary type, return "unknown/unknown".
*/
const char *mimetype_from_content(Blob *pBlob){
  int i;
  int n;
  const unsigned char *x;

  static const char isBinary[256] = {
     1, 1, 1, 1,  1, 1, 1, 1,    1, 0, 0, 0,  0, 0, 1, 1,
     1, 1, 1, 1,  1, 1, 1, 1,    1, 1, 0, 0,  1, 1, 1, 1
  };

  /* A table of mimetypes based on file content prefixes
  */
  static const struct {
    const char *zPrefix;       /* The file prefix */
    int size;                  /* Length of the prefix */
................................................................................
    { "\377\330\377",            3, "image/jpeg" },
  };

  x = (const unsigned char*)blob_buffer(pBlob);
  n = blob_size(pBlob);
  for(i=0; i<n; i++){
    unsigned char c = x[i];
    if( isBinary[c] ){
      break;
    }
  }
  if( i>=n ){
    return 0;   /* Plain text */
  }
  for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){