Fossil

Check-in [348637de]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: merge trunk

Let looks_like_text() give different values for UTF-16 BE/LE. Not used yet.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | improve_looks_like_binary
Files: files | file ages | folders
SHA1: 348637dedfdef286ad80d1cb9002dec242fef3b1
User & Date: jan.nijtmans 2012-11-01 11:48:55
Context
2012-11-01
11:52
style fix check-in: 0ccbb44f user: jan.nijtmans tags: improve_looks_like_binary
11:48
merge trunk

Let looks_like_text() give different values for UTF-16 BE/LE. Not used yet.

check-in: 348637de user: jan.nijtmans tags: improve_looks_like_binary
10:20
Restore Style fix, which got lost by [618258421767778c] check-in: ef6c243e user: jan.nijtmans tags: trunk
2012-10-31
20:56
adapt comments accordingly check-in: b5123d51 user: jan.nijtmans tags: improve_looks_like_binary
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/checkin.c.

896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
  if( allOk ) return;
  eType = looks_like_text(p);
  if( eType<0 ){
    const char *zWarning ;
    Blob ans;
    char cReply;

    if( eType&1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";
    }else{
      zWarning = "Unicode";
    }







|







896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
  if( allOk ) return;
  eType = looks_like_text(p);
  if( eType<0 ){
    const char *zWarning ;
    Blob ans;
    char cReply;

    if( eType==-3 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";
    }else{
      zWarning = "Unicode";
    }

Changes to src/diff.c.

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
...
179
180
181
182
183
184
185






186
187
188
189
190
191
192
193
194
195
196
197
198
...
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
...
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files\n"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file\n"

/*
** Nonzero when the low bit of the looks_like_text() result is clear, i.e.
** the result is 0 (binary) or -2 (UTF-16).  The text results 1 and -1
** both have the low bit set and therefore yield zero.
*/
#define looks_like_binary(blob) (!(looks_like_text(blob) & 1))
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file.  (8192)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL (\000) characters or an extremely long line.
**






** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
** (-2) -- The content appears to consist entirely of text, in the
**         UTF-16 (BE or LE) encoding.
*/
int looks_like_text(const Blob *pContent){
  unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j;
  unsigned char c;
  int result = 1;  /* Assume text with no CR/NL */
................................................................................
  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* \000 byte in a file -> binary */
  if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */
    if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */
      result = -2;
      j = LENGTH_MASK/3;
      while( (n-=2)>0 ){
        c = *(z+=2);
        if( z[1]==0 ){ /* High-byte must be 0 for further checks */
          if( c==0 ) return 0;  /* \000 char in a file -> binary */
          if( c=='\n' ){
            j = LENGTH_MASK/3;
................................................................................
  }
  j = LENGTH_MASK - (c!='\n');
  while( --n>0 ){
    c = *++z;
    if( c==0 ) return 0;  /* \000 byte in a file -> binary */
    if( c=='\n' ){
      if( z[-1]=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      j = LENGTH_MASK;
    }
    if( --j==0 ){
      return 0;  /* Very long line -> binary */
    }
  }







|







 







>
>
>
>
>
>
|



<
<







 







|







 







|







46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195


196
197
198
199
200
201
202
...
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
...
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files\n"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file\n"

/*
** True if looks_like_text() classifies the blob as anything other than
** plain 8-bit text: binary (0), UTF-16 LE (-1) or UTF-16 BE (-2).  The low
** two bits of the result equal 1 only for LF text (1) and CR/NL text (-3),
** so the comparison must be "!= 1"; "== 1" would report text as binary.
** Relies on two's-complement representation of the negative result codes.
*/
#define looks_like_binary(blob) ((looks_like_text(blob)&3) != 1)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file.  (8192)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL (\000) characters or an extremely long line.
**
** (-1) -- The content appears to consist entirely of text, in the
**         UTF-16 (LE) encoding.
**
** (-2) -- The content appears to consist entirely of text, in the
**         UTF-16 (BE) encoding.
**
** (-3) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**


*/
int looks_like_text(const Blob *pContent){
  unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j;
  unsigned char c;
  int result = 1;  /* Assume text with no CR/NL */
................................................................................
  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* \000 byte in a file -> binary */
  if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */
    if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */
      result = -1;
      j = LENGTH_MASK/3;
      while( (n-=2)>0 ){
        c = *(z+=2);
        if( z[1]==0 ){ /* High-byte must be 0 for further checks */
          if( c==0 ) return 0;  /* \000 char in a file -> binary */
          if( c=='\n' ){
            j = LENGTH_MASK/3;
................................................................................
  }
  j = LENGTH_MASK - (c!='\n');
  while( --n>0 ){
    c = *++z;
    if( c==0 ) return 0;  /* \000 byte in a file -> binary */
    if( c=='\n' ){
      if( z[-1]=='\r' ){
        result = -3;  /* Contains CR/NL, continue */
      }
      j = LENGTH_MASK;
    }
    if( --j==0 ){
      return 0;  /* Very long line -> binary */
    }
  }