Fossil

Check-in [1bbca2c3]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See https://fossil-scm.org/forum/forumpost/1247e4a3c4 for detailed information and tests.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | comment-formatter-utf8
Files: files | file ages | folders
SHA3-256: 1bbca2c3f89b826d3350ca34a0e1a69a31180b72dcbece58f2714c87f7a8267e
User & Date: florian 2018-10-17 14:16:00
Context
2018-10-17
14:16
Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See https://fossil-scm.org/forum/forumpost/1247e4a3c4 for detailed information and tests. Leaf check-in: 1bbca2c3 user: florian tags: comment-formatter-utf8
2018-10-12
16:14
Fix a comment on the "html" and "puts" TH1 commands. Before this fix, the meanings of the two commands were reversed. check-in: 35563f3d user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/comformat.c.

223
224
225
226
227
228
229
























230
231
232
233
234
235
236
237
...
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
...
285
286
287
288
289
290
291
292
293




















294
295
296
297
298
299
300
301
        break;
      }
      charCnt++;
    }else{
      charCnt++;
    }
    assert( c!='\n' || charCnt==0 );
























    fossil_print("%c", c);
    if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
    if( maxChars<=0 ) break;
    if( c=='\n' ) break;
  }
  if( charCnt>0 ){
    fossil_print("\n");
    lineCnt++;
................................................................................
*/
static int comment_print_legacy(
  const char *zText, /* The comment text to be printed. */
  int indent,        /* Number of spaces to indent each non-initial line. */
  int width          /* Maximum number of characters per line. */
){
  int maxChars = width - indent;
  int si, sk, i, k;
  int doIndent = 0;
  char *zBuf;
  char zBuffer[400];
  int lineCnt = 0;

  if( width<0 ){
    comment_set_maxchars(indent, &maxChars);
................................................................................
      if( doIndent==0 ){
        fossil_print("\n");
        lineCnt = 1;
      }
      if( zBuf!=zBuffer) fossil_free(zBuf);
      return lineCnt;
    }
    for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
      char c = zText[i];




















      if( fossil_isspace(c) ){
        si = i;
        sk = k;
        if( k==0 || zBuf[k-1]!=' ' ){
          zBuf[k++] = ' ';
        }
      }else{
        zBuf[k] = c;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







 







|







 







|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
...
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
...
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
        break;
      }
      charCnt++;
    }else{
      charCnt++;
    }
    assert( c!='\n' || charCnt==0 );
    /*
    ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
    ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
    ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
    ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
    ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
    ** sequences (as lone trail bytes).
    */
    if( (c&0xc0)==0xc0 && zLine[index]!=0 ){  /* Any UTF-8 lead byte 11xxxxxx */
      char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */
      int cchUTF8=1; /* Code units consumed. */
      int maxUTF8=1; /* Expected sequence length. */
      zUTF8[0]=c;
      if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
      while( cchUTF8<maxUTF8 &&
              (zLine[index]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
        zUTF8[cchUTF8++] = zLine[index++];
      }
      zUTF8[cchUTF8]=0;
      fossil_print("%s", zUTF8);
    }
    else
      fossil_print("%c", c);
    if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
    if( maxChars<=0 ) break;
    if( c=='\n' ) break;
  }
  if( charCnt>0 ){
    fossil_print("\n");
    lineCnt++;
................................................................................
*/
static int comment_print_legacy(
  const char *zText, /* The comment text to be printed. */
  int indent,        /* Number of spaces to indent each non-initial line. */
  int width          /* Maximum number of characters per line. */
){
  int maxChars = width - indent;
  int si, sk, i, k, kc;
  int doIndent = 0;
  char *zBuf;
  char zBuffer[400];
  int lineCnt = 0;

  if( width<0 ){
    comment_set_maxchars(indent, &maxChars);
................................................................................
      if( doIndent==0 ){
        fossil_print("\n");
        lineCnt = 1;
      }
      if( zBuf!=zBuffer) fossil_free(zBuf);
      return lineCnt;
    }
    for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
      char c = zText[i];
      kc++; /* Count complete UTF-8 sequences. */
      /*
      ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
      ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
      ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
      ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
      ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
      */
      if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){  /* Any UTF-8 lead byte 11xxxxxx */
        int cchUTF8=1; /* Code units consumed. */
        int maxUTF8=1; /* Expected sequence length. */
        if( (c&0xe0)==0xc0 )maxUTF8=2;        /* UTF-8 lead byte 110vvvvv */
        else if( (c&0xf0)==0xe0 )maxUTF8=3;   /* UTF-8 lead byte 1110vvvv */
        else if( (c&0xf8)==0xf0 )maxUTF8=4;   /* UTF-8 lead byte 11110vvv */
        zBuf[k++] = c;
        while( cchUTF8<maxUTF8 &&
                (zText[i+1]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
          zBuf[k++] = zText[++i];
        }
      }
      else if( fossil_isspace(c) ){
        si = i;
        sk = k;
        if( k==0 || zBuf[k-1]!=' ' ){
          zBuf[k++] = ' ';
        }
      }else{
        zBuf[k] = c;