Index: src/comformat.c
==================================================================
--- src/comformat.c
+++ src/comformat.c
@@ -2,11 +2,11 @@
 ** Copyright (c) 2007 D. Richard Hipp
 **
 ** This program is free software; you can redistribute it and/or
 ** modify it under the terms of the Simplified BSD License (also
 ** known as the "2-Clause License" or "FreeBSD License".)
-
+**
 ** This program is distributed in the hope that it will be useful,
 ** but without any warranty; without even the implied warranty of
 ** merchantability or fitness for a particular purpose.
 **
 ** Author contact information:
@@ -95,21 +95,20 @@
 #endif
 }
 
 /*
 ** This function checks the current line being printed against the original
-** comment text.  Upon matching, it emits a new line and updates the provided
-** character and line counts, if applicable.
+** comment text.  Upon matching, it updates the provided character and line
+** counts, if applicable.  The caller needs to emit a new line, if desired.
 */
 static int comment_check_orig(
   const char *zOrigText, /* [in] Original comment text ONLY, may be NULL. */
   const char *zLine,     /* [in] The comment line to print. */
   int *pCharCnt,         /* [in/out] Pointer to the line character count. */
   int *pLineCnt          /* [in/out] Pointer to the total line count. */
 ){
   if( zOrigText && fossil_strcmp(zLine, zOrigText)==0 ){
-    fossil_print("\n");
     if( pCharCnt ) *pCharCnt = 0;
     if( pLineCnt ) (*pLineCnt)++;
     return 1;
   }
   return 0;
@@ -121,37 +120,76 @@
 ** zero if such a character cannot be found.  For the purposes of this
 ** algorithm, the NUL character is treated the same as a spacing character.
 */
 static int comment_next_space(
   const char *zLine, /* [in] The comment line being printed. */
-  int index          /* [in] The current character index being handled. */
+  int index,         /* [in] The current character index being handled. */
+  int *distUTF8      /* [out] Distance to next space in UTF-8 sequences. */
 ){
   int nextIndex = index + 1;
+  int fNonASCII=0;
   for(;;){
     char c = zLine[nextIndex];
+    if( (c&0x80)==0x80 ) fNonASCII=1;
     if( c==0 || fossil_isspace(c) ){
+      if( distUTF8 ){
+        if( fNonASCII!=0 ){
+          *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
+        }else{
+          *distUTF8 = nextIndex-index;
+        }
+      }
       return nextIndex;
     }
     nextIndex++;
   }
   return 0; /* NOT REACHED */
 }
 
 /*
-** This function is called when printing a logical comment line to perform
-** the necessary indenting.
+** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
+** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
+** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
+** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
+** treated as invalid 1-byte sequences (as lone trail bytes).
+** Combining characters and East Asian Wide and Fullwidth characters are counted
+** as one, so this function does not calculate the effective "display width".
+*/
+int strlen_utf8(const char *zString, int lengthBytes){
+  int i;          /* Counted bytes. */
+  int lengthUTF8; /* Counted UTF-8 sequences. */
+#if 0
+  assert( lengthBytes>=0 );
+#endif
+  for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
+    char c = zString[i];
+    int cchUTF8=1; /* Code units consumed. */
+    int maxUTF8=1; /* Expected sequence length. */
+    if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
+    else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
+    else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
+    while( cchUTF8<maxUTF8 &&
+            i<lengthBytes-1 &&
+            (zString[i+1]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
+      cchUTF8++;
+      i++;
+    }
+  }
+  return lengthUTF8;
+}
+
+/*
+** This function is called when printing a logical comment line to calculate
+** the necessary indenting.  The caller needs to emit the indenting spaces.
 */
-static void comment_print_indent(
+static void comment_calc_indent(
   const char *zLine, /* [in] The comment line being printed. */
   int indent,        /* [in] Number of spaces to indent, zero for none. */
   int trimCrLf,      /* [in] Non-zero to trim leading/trailing CR/LF. */
   int trimSpace,     /* [in] Non-zero to trim leading/trailing spaces. */
   int *piIndex       /* [in/out] Pointer to first non-space character. */
 ){
-  if( indent>0 ){
-    fossil_print("%*s", indent, "");
-  }
   if( zLine && piIndex ){
     int index = *piIndex;
     if( trimCrLf ){
       while( zLine[index]=='\r' || zLine[index]=='\n' ){ index++; }
     }
@@ -179,26 +217,56 @@
   int wordBreak,         /* [in] Non-zero to try breaking on word boundaries. */
   int origBreak,         /* [in] Non-zero to break before original comment. */
   int *pLineCnt,         /* [in/out] Pointer to the total line count. */
   const char **pzLine    /* [out] Pointer to the end of the logical line. */
 ){
-  int index = 0, charCnt = 0, lineCnt = 0, maxChars;
+  int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
+  char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
+  int cchUTF8, maxUTF8;       /* Helper variables to count UTF-8 sequences. */
   if( !zLine ) return;
   if( lineChars<=0 ) return;
-  comment_print_indent(zLine, indent, trimCrLf, trimSpace, &index);
+#if 0
+  assert( indent<sizeof(zBuf)-5 );       /* See following comments to explain */
+  assert( origIndent<sizeof(zBuf)-5 );   /* these limits. */
+#endif
+  if( indent>sizeof(zBuf)-6 ){
+    /* Limit initial indent to fit output buffer. */
+    indent = sizeof(zBuf)-6;
+  }
+  comment_calc_indent(zLine, indent, trimCrLf, trimSpace, &index);
+  if( indent>0 ){
+    for(i=0; i<indent; i++){
+      zBuf[iBuf++] = ' ';
+    }
+  }
+  if( origIndent>sizeof(zBuf)-6 ){
+    /* Limit line indent to fit output buffer. */
+    origIndent = sizeof(zBuf)-6;
+  }
   maxChars = lineChars;
   for(;;){
     int useChars = 1;
     char c = zLine[index];
+    /* Flush the output buffer if there's no space left for at least one more
+    ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
+    ** a new line, and a terminating NULL. */
+    if( iBuf>sizeof(zBuf)-origIndent-6 ){
+      zBuf[iBuf]=0;
+      iBuf=0;
+      fossil_print("%s", zBuf);
+    }
     if( c==0 ){
       break;
     }else{
       if( origBreak && index>0 ){
         const char *zCurrent = &zLine[index];
         if( comment_check_orig(zOrigText, zCurrent, &charCnt, &lineCnt) ){
-          comment_print_indent(zCurrent, origIndent, trimCrLf, trimSpace,
-                               &index);
+          zBuf[iBuf++] = '\n';
+          comment_calc_indent(zLine, origIndent, trimCrLf, trimSpace, &index);
+          for( i=0; i<origIndent; i++ ){
+            zBuf[iBuf++] = ' ';
+          }
           maxChars = lineChars;
         }
       }
       index++;
     }
@@ -205,38 +273,57 @@
     if( c=='\n' ){
       lineCnt++;
       charCnt = 0;
       useChars = 0;
     }else if( c=='\t' ){
-      int nextIndex = comment_next_space(zLine, index);
-      if( nextIndex<=0 || (nextIndex-index)>maxChars ){
+      int distUTF8;
+      int nextIndex = comment_next_space(zLine, index, &distUTF8);
+      if( nextIndex<=0 || distUTF8>maxChars ){
         break;
       }
       charCnt++;
       useChars = COMMENT_TAB_WIDTH;
       if( maxChars<useChars ){
-        fossil_print(" ");
+        zBuf[iBuf++] = ' ';
         break;
       }
     }else if( wordBreak && fossil_isspace(c) ){
-      int nextIndex = comment_next_space(zLine, index);
-      if( nextIndex<=0 || (nextIndex-index)>maxChars ){
+      int distUTF8;
+      int nextIndex = comment_next_space(zLine, index, &distUTF8);
+      if( nextIndex<=0 || distUTF8>maxChars ){
         break;
       }
       charCnt++;
     }else{
       charCnt++;
     }
     assert( c!='\n' || charCnt==0 );
-    fossil_print("%c", c);
-    if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
+    zBuf[iBuf++] = c;
+    /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
+    cchUTF8=1; /* Code units consumed. */
+    maxUTF8=1; /* Expected sequence length. */
+    if( (c&0xe0)==0xc0 )maxUTF8=2;          /* UTF-8 lead byte 110vvvvv */
+    else if( (c&0xf0)==0xe0 )maxUTF8=3;     /* UTF-8 lead byte 1110vvvv */
+    else if( (c&0xf8)==0xf0 )maxUTF8=4;     /* UTF-8 lead byte 11110vvv */
+    while( cchUTF8<maxUTF8 &&
+            (zLine[index]&0xc0)==0x80 ){    /* UTF-8 trail byte 10vvvvvv */
+      cchUTF8++;
+      zBuf[iBuf++] = zLine[index++];
+    }
+    maxChars -= useChars;
     if( maxChars<=0 ) break;
     if( c=='\n' ) break;
   }
   if( charCnt>0 ){
-    fossil_print("\n");
+    zBuf[iBuf++] = '\n';
     lineCnt++;
+  }
+  /* Flush the remaining output buffer. */
+  if( iBuf>0 ){
+    zBuf[iBuf]=0;
+    iBuf=0;
+    fossil_print("%s", zBuf);
   }
   if( pLineCnt ){
     *pLineCnt += lineCnt;
   }
   if( pzLine ){
@@ -259,25 +346,27 @@
   const char *zText, /* The comment text to be printed. */
   int indent,        /* Number of spaces to indent each non-initial line. */
   int width          /* Maximum number of characters per line. */
 ){
   int maxChars = width - indent;
-  int si, sk, i, k;
+  int si, sk, i, k, kc;
   int doIndent = 0;
   char *zBuf;
   char zBuffer[400];
   int lineCnt = 0;
+  int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
 
   if( width<0 ){
     comment_set_maxchars(indent, &maxChars);
   }
   if( zText==0 ) zText = "(NULL)";
   if( maxChars<=0 ){
     maxChars = strlen(zText);
   }
-  if( maxChars >= (sizeof(zBuffer)) ){
-    zBuf = fossil_malloc(maxChars+1);
+  /* Ensure the buffer can hold the longest-possible UTF-8 sequences. */
+  if( maxChars >= (sizeof(zBuffer)/4-1) ){
+    zBuf = fossil_malloc(maxChars*4+1);
   }else{
     zBuf = zBuffer;
   }
   for(;;){
     while( fossil_isspace(zText[0]) ){ zText++; }
@@ -287,13 +376,28 @@
         lineCnt = 1;
       }
       if( zBuf!=zBuffer) fossil_free(zBuf);
       return lineCnt;
     }
-    for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
+    for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
       char c = zText[i];
-      if( fossil_isspace(c) ){
+      kc++; /* Count complete UTF-8 sequences. */
+      /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
+      cchUTF8=1; /* Code units consumed. */
+      maxUTF8=1; /* Expected sequence length. */
+      if( (c&0xe0)==0xc0 )maxUTF8=2;        /* UTF-8 lead byte 110vvvvv */
+      else if( (c&0xf0)==0xe0 )maxUTF8=3;   /* UTF-8 lead byte 1110vvvv */
+      else if( (c&0xf8)==0xf0 )maxUTF8=4;   /* UTF-8 lead byte 11110vvv */
+      if( maxUTF8>1 ){
+        zBuf[k++] = c;
+        while( cchUTF8<maxUTF8 &&
+                (zText[i+1]&0xc0)==0x80 ){  /* UTF-8 trail byte 10vvvvvv */
+          cchUTF8++;
+          zBuf[k++] = zText[++i];
+        }
+      }
+      else if( fossil_isspace(c) ){
         si = i;
         sk = k;
         if( k==0 || zBuf[k-1]!=' ' ){
           zBuf[k++] = ' ';
         }