Index: src/comformat.c ================================================================== --- src/comformat.c +++ src/comformat.c @@ -2,11 +2,11 @@ ** Copyright (c) 2007 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the Simplified BSD License (also ** known as the "2-Clause License" or "FreeBSD License".) - +** ** This program is distributed in the hope that it will be useful, ** but without any warranty; without even the implied warranty of ** merchantability or fitness for a particular purpose. ** ** Author contact information: @@ -95,21 +95,20 @@ #endif } /* ** This function checks the current line being printed against the original -** comment text. Upon matching, it emits a new line and updates the provided -** character and line counts, if applicable. +** comment text. Upon matching, it updates the provided character and line +** counts, if applicable. The caller needs to emit a new line, if desired. */ static int comment_check_orig( const char *zOrigText, /* [in] Original comment text ONLY, may be NULL. */ const char *zLine, /* [in] The comment line to print. */ int *pCharCnt, /* [in/out] Pointer to the line character count. */ int *pLineCnt /* [in/out] Pointer to the total line count. */ ){ if( zOrigText && fossil_strcmp(zLine, zOrigText)==0 ){ - fossil_print("\n"); if( pCharCnt ) *pCharCnt = 0; if( pLineCnt ) (*pLineCnt)++; return 1; } return 0; @@ -121,37 +120,76 @@ ** zero if such a character cannot be found. For the purposes of this ** algorithm, the NUL character is treated the same as a spacing character. */ static int comment_next_space( const char *zLine, /* [in] The comment line being printed. */ - int index /* [in] The current character index being handled. */ + int index, /* [in] The current character index being handled. */ + int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */ ){ int nextIndex = index + 1; + int fNonASCII=0; for(;;){ char c = zLine[nextIndex]; + if( (c&0x80)==0x80 ) fNonASCII=1; if( c==0 || fossil_isspace(c) ){ + if( distUTF8 ){ + if( fNonASCII!=0 ){ + *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index); + }else{ + *distUTF8 = nextIndex-index; + } + } return nextIndex; } nextIndex++; } return 0; /* NOT REACHED */ } /* -** This function is called when printing a logical comment line to perform -** the necessary indenting. +** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and +** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0 +** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte +** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are +** treated as invalid 1-byte sequences (as lone trail bytes). +** Combining characters and East Asian Wide and Fullwidth characters are counted +** as one, so this function does not calculate the effective "display width". +*/ +int strlen_utf8(const char *zString, int lengthBytes){ + int i; /* Counted bytes. */ + int lengthUTF8; /* Counted UTF-8 sequences. */ +#if 0 + assert( lengthBytes>=0 ); +#endif + for(i=0, lengthUTF8=0; i0 ){ - fossil_print("%*s", indent, ""); - } if( zLine && piIndex ){ int index = *piIndex; if( trimCrLf ){ while( zLine[index]=='\r' || zLine[index]=='\n' ){ index++; } } @@ -179,26 +217,56 @@ int wordBreak, /* [in] Non-zero to try breaking on word boundaries. */ int origBreak, /* [in] Non-zero to break before original comment. */ int *pLineCnt, /* [in/out] Pointer to the total line count. */ const char **pzLine /* [out] Pointer to the end of the logical line. */ ){ - int index = 0, charCnt = 0, lineCnt = 0, maxChars; + int index = 0, charCnt = 0, lineCnt = 0, maxChars, i; + char zBuf[400]; int iBuf=0; /* Output buffer and counter. */ + int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ if( !zLine ) return; if( lineChars<=0 ) return; - comment_print_indent(zLine, indent, trimCrLf, trimSpace, &index); +#if 0 + assert( indentsizeof(zBuf)-6 ){ + /* Limit initial indent to fit output buffer. */ + indent = sizeof(zBuf)-6; + } + comment_calc_indent(zLine, indent, trimCrLf, trimSpace, &index); + if( indent>0 ){ + for(i=0; isizeof(zBuf)-6 ){ + /* Limit line indent to fit output buffer. */ + origIndent = sizeof(zBuf)-6; + } maxChars = lineChars; for(;;){ int useChars = 1; char c = zLine[index]; + /* Flush the output buffer if there's no space left for at least one more + ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces, + ** a new line, and a terminating NULL. */ + if( iBuf>sizeof(zBuf)-origIndent-6 ){ + zBuf[iBuf]=0; + iBuf=0; + fossil_print("%s", zBuf); + } if( c==0 ){ break; }else{ if( origBreak && index>0 ){ const char *zCurrent = &zLine[index]; if( comment_check_orig(zOrigText, zCurrent, &charCnt, &lineCnt) ){ - comment_print_indent(zCurrent, origIndent, trimCrLf, trimSpace, - &index); + zBuf[iBuf++] = '\n'; + comment_calc_indent(zLine, origIndent, trimCrLf, trimSpace, &index); + for( i=0; imaxChars ){ + int distUTF8; + int nextIndex = comment_next_space(zLine, index, &distUTF8); + if( nextIndex<=0 || distUTF8>maxChars ){ break; } charCnt++; useChars = COMMENT_TAB_WIDTH; if( maxCharsmaxChars ){ + int distUTF8; + int nextIndex = comment_next_space(zLine, index, &distUTF8); + if( nextIndex<=0 || distUTF8>maxChars ){ break; } charCnt++; }else{ charCnt++; } assert( c!='\n' || charCnt==0 ); - fossil_print("%c", c); - if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; + zBuf[iBuf++] = c; + /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ + cchUTF8=1; /* Code units consumed. */ + maxUTF8=1; /* Expected sequence length. */ + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ + while( cchUTF80 ){ - fossil_print("\n"); + zBuf[iBuf++] = '\n'; lineCnt++; + } + /* Flush the remaining output buffer. */ + if( iBuf>0 ){ + zBuf[iBuf]=0; + iBuf=0; + fossil_print("%s", zBuf); } if( pLineCnt ){ *pLineCnt += lineCnt; } if( pzLine ){ @@ -259,25 +346,27 @@ const char *zText, /* The comment text to be printed. */ int indent, /* Number of spaces to indent each non-initial line. */ int width /* Maximum number of characters per line. */ ){ int maxChars = width - indent; - int si, sk, i, k; + int si, sk, i, k, kc; int doIndent = 0; char *zBuf; char zBuffer[400]; int lineCnt = 0; + int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ if( width<0 ){ comment_set_maxchars(indent, &maxChars); } if( zText==0 ) zText = "(NULL)"; if( maxChars<=0 ){ maxChars = strlen(zText); } - if( maxChars >= (sizeof(zBuffer)) ){ - zBuf = fossil_malloc(maxChars+1); + /* Ensure the buffer can hold the longest-possible UTF-8 sequences. */ + if( maxChars >= (sizeof(zBuffer)/4-1) ){ + zBuf = fossil_malloc(maxChars*4+1); }else{ zBuf = zBuffer; } for(;;){ while( fossil_isspace(zText[0]) ){ zText++; } @@ -287,13 +376,28 @@ lineCnt = 1; } if( zBuf!=zBuffer) fossil_free(zBuf); return lineCnt; } - for(sk=si=i=k=0; zText[i] && k1 ){ + zBuf[k++] = c; + while( cchUTF8