Fossil

Check-in [490d38ff]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Minor optimizations: drop a few redundant comparisons and calculations, and take advantage of the logical AND short-circuit by testing the least expensive and most unlikely condition first. Also fold away the iterative comments into cross references.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | comment-formatter-utf8
Files: files | file ages | folders
SHA1:490d38ff2e0079e76054bf207cb94de8ab1ecdb8
User & Date: florian 2018-11-24 07:49:00
Context
2018-11-28
23:43
Minor stylistic changes to the comment formatter. Closed-Leaf check-in: cc9c422d user: drh tags: comment-formatter-utf8
2018-11-24
07:49
Minor optimizations: drop a few redundant comparisons and calculations, and take advantage of the logical AND short-circuit by testing the least expensive and most unlikely condition first. Also fold away the iterative comments into cross references. check-in: 490d38ff user: florian tags: comment-formatter-utf8
07:16
Fix two bugs (introduced with this branch) that become manifest with invalid UTF-8 sequences. check-in: b86a2fc7 user: florian tags: comment-formatter-utf8
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/comformat.c.

155
156
157
158
159
160
161

162
163
164
165
166
167
168
169
170
171
172
173
174

175
176
177
178
179
180
181
182
183
184
185
...
221
222
223
224
225
226
227

228
229
230
231
232
233
234
...
292
293
294
295
296
297
298
299

300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
...
360
361
362
363
364
365
366

367
368
369
370
371
372
373
...
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406

407
408
409
410
411
412
413
414
415
416
** as one, so this function does not calculate the effective "display width".
*/
int strlen_utf8(const char *zString, int lengthBytes)
{
#if 0
  assert( lengthBytes>=0 );
#endif

  int lengthUTF8=0; /* Counted UTF-8 sequences. */
  int i;
  for( i=0; i<lengthBytes; i++ ){
    char c = zString[i];
    lengthUTF8++;
    if( (c&0xc0)==0xc0 ){                     /* Any UTF-8 lead byte 11xxxxxx */
      int cchUTF8=1; /* Code units consumed. */
      int maxUTF8=1; /* Expected sequence length. */
      if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
      while( i<lengthBytes-1 &&
              cchUTF8<maxUTF8 &&

              (zString[i+1]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
        cchUTF8++;
        i++;
      }
    }
  }
  return lengthUTF8;
}

/*
** This function is called when printing a logical comment line to calculate
................................................................................
  int wordBreak,         /* [in] Non-zero to try breaking on word boundaries. */
  int origBreak,         /* [in] Non-zero to break before original comment. */
  int *pLineCnt,         /* [in/out] Pointer to the total line count. */
  const char **pzLine    /* [out] Pointer to the end of the logical line. */
){
  int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
  char zBuf[400]; int iBuf=0; /* Output buffer and counter. */

  if( !zLine ) return;
  if( lineChars<=0 ) return;
#if 0
  assert( indent<sizeof(zBuf)-5 );       /* See following comments to explain */
  assert( origIndent<sizeof(zBuf)-5 );   /* these limits. */
#endif
  if( indent>sizeof(zBuf)-6 )   /* Limit initial indent to fit output buffer. */
................................................................................
        break;
      }
      charCnt++;
    }else{
      charCnt++;
    }
    assert( c!='\n' || charCnt==0 );
    /*

    ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
    ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
    ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
    ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
    ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
    ** sequences (as lone trail bytes).
    */
    if( (c&0xc0)==0xc0 && zLine[index]!=0 ){  /* Any UTF-8 lead byte 11xxxxxx */
      int cchUTF8=1; /* Code units consumed. */
      int maxUTF8=1; /* Expected sequence length. */
      zBuf[iBuf++]=c;
      if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
      while( cchUTF8<maxUTF8 &&
              (zLine[index]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
        cchUTF8++;
        zBuf[iBuf++] = zLine[index++];
      }
      maxChars--;
    }else{
      zBuf[iBuf++] = c;
      maxChars -= useChars;
    }
    if( maxChars<=0 ) break;
    if( c=='\n' ) break;
  }
  if( charCnt>0 ){
    zBuf[iBuf++] = '\n';
    lineCnt++;
  }
................................................................................
){
  int maxChars = width - indent;
  int si, sk, i, k, kc;
  int doIndent = 0;
  char *zBuf;
  char zBuffer[400];
  int lineCnt = 0;


  if( width<0 ){
    comment_set_maxchars(indent, &maxChars);
  }
  if( zText==0 ) zText = "(NULL)";
  if( maxChars<=0 ){
    maxChars = strlen(zText);
................................................................................
      }
      if( zBuf!=zBuffer) fossil_free(zBuf);
      return lineCnt;
    }
    for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
      char c = zText[i];
      kc++; /* Count complete UTF-8 sequences. */
      /*
      ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
      ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
      ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
      ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
      ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
      */
      if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){  /* Any UTF-8 lead byte 11xxxxxx */
        int cchUTF8=1; /* Code units consumed. */
        int maxUTF8=1; /* Expected sequence length. */
        if( (c&0xe0)==0xc0 )maxUTF8=2;        /* UTF-8 lead byte 110vvvvv */
        else if( (c&0xf0)==0xe0 )maxUTF8=3;   /* UTF-8 lead byte 1110vvvv */
        else if( (c&0xf8)==0xf0 )maxUTF8=4;   /* UTF-8 lead byte 11110vvv */

        zBuf[k++] = c;
        while( cchUTF8<maxUTF8 &&
                (zText[i+1]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
          cchUTF8++;
          zBuf[k++] = zText[++i];
        }
      }
      else if( fossil_isspace(c) ){
        si = i;
        sk = k;







>
|
<
|

<
<
|
|
|
|
|
<
|
>
|
|
|
<







 







>







 







<
>
|
<
<
<
<
<
<
<
|
|
<
|
|
|
|
|
|
|
|
<
<
<
|
<







 







>







 







|
<
<
<
<
<
<
<
|
|
|
|
|
>


|







155
156
157
158
159
160
161
162
163

164
165


166
167
168
169
170

171
172
173
174
175

176
177
178
179
180
181
182
...
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
...
290
291
292
293
294
295
296

297
298







299
300

301
302
303
304
305
306
307
308



309

310
311
312
313
314
315
316
...
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
...
374
375
376
377
378
379
380
381







382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
** as one, so this function does not calculate the effective "display width".
*/
int strlen_utf8(const char *zString, int lengthBytes)
{
#if 0
  assert( lengthBytes>=0 );
#endif
  int i;          /* Counted bytes. */
  int lengthUTF8; /* Counted UTF-8 sequences. */

  for( i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++ ){
    char c = zString[i];


    int cchUTF8=1; /* Code units consumed. */
    int maxUTF8=1; /* Expected sequence length. */
    if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
    else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
    else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */

    while( cchUTF8<maxUTF8 &&
            i<lengthBytes-1 &&
            (zString[i+1]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
      cchUTF8++;
      i++;

    }
  }
  return lengthUTF8;
}

/*
** This function is called when printing a logical comment line to calculate
................................................................................
  int wordBreak,         /* [in] Non-zero to try breaking on word boundaries. */
  int origBreak,         /* [in] Non-zero to break before original comment. */
  int *pLineCnt,         /* [in/out] Pointer to the total line count. */
  const char **pzLine    /* [out] Pointer to the end of the logical line. */
){
  int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
  char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
  int cchUTF8, maxUTF8;       /* Helper variables to count UTF-8 sequences. */
  if( !zLine ) return;
  if( lineChars<=0 ) return;
#if 0
  assert( indent<sizeof(zBuf)-5 );       /* See following comments to explain */
  assert( origIndent<sizeof(zBuf)-5 );   /* these limits. */
#endif
  if( indent>sizeof(zBuf)-6 )   /* Limit initial indent to fit output buffer. */
................................................................................
        break;
      }
      charCnt++;
    }else{
      charCnt++;
    }
    assert( c!='\n' || charCnt==0 );

    zBuf[iBuf++] = c;
    /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */







    cchUTF8=1; /* Code units consumed. */
    maxUTF8=1; /* Expected sequence length. */

    if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
    else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
    else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
    while( cchUTF8<maxUTF8 &&
            (zLine[index]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
      cchUTF8++;
      zBuf[iBuf++] = zLine[index++];
    }



    maxChars -= useChars;

    if( maxChars<=0 ) break;
    if( c=='\n' ) break;
  }
  if( charCnt>0 ){
    zBuf[iBuf++] = '\n';
    lineCnt++;
  }
................................................................................
){
  int maxChars = width - indent;
  int si, sk, i, k, kc;
  int doIndent = 0;
  char *zBuf;
  char zBuffer[400];
  int lineCnt = 0;
  int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */

  if( width<0 ){
    comment_set_maxchars(indent, &maxChars);
  }
  if( zText==0 ) zText = "(NULL)";
  if( maxChars<=0 ){
    maxChars = strlen(zText);
................................................................................
      }
      if( zBuf!=zBuffer) fossil_free(zBuf);
      return lineCnt;
    }
    for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
      char c = zText[i];
      kc++; /* Count complete UTF-8 sequences. */
      /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */







      cchUTF8=1; /* Code units consumed. */
      maxUTF8=1; /* Expected sequence length. */
      if( (c&0xe0)==0xc0 )maxUTF8=2;        /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 )maxUTF8=3;   /* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 )maxUTF8=4;   /* UTF-8 lead byte 11110vvv */
      if( maxUTF8>1 ){
        zBuf[k++] = c;
        while( cchUTF8<maxUTF8 &&
                (zText[i+1]&0xc0)==0x80 ){  /* UTF-8 trail byte 10vvvvvv */
          cchUTF8++;
          zBuf[k++] = zText[++i];
        }
      }
      else if( fossil_isspace(c) ){
        si = i;
        sk = k;