Fossil

Check-in [bab2f28b]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:completed cp1252 table and conversion
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | improve_commit_warning
Files: files | file ages | folders
SHA1: bab2f28b601c0ab6ddb28463f68f1015f33dab3a
User & Date: jan.nijtmans 2012-12-12 15:35:31
Context
2013-01-21
10:29
merge trunk check-in: a68dffbf user: jan.nijtmans tags: improve_commit_warning
2012-12-12
15:35
completed cp1252 table and conversion check-in: bab2f28b user: jan.nijtmans tags: improve_commit_warning
13:53
Add optional iso8859-1 to utf-8 conversion. Still to do: special cp1252 characters. check-in: 4f060f6a user: jan.nijtmans tags: improve_commit_warning
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/blob.c.

1030
1031
1032
1033
1034
1035
1036
1037
1038








1039
1040
1041
1042
1043
1044





1045
1046
1047
1048
1049
1050
1051
1052
1053
1054











1055
1056

1057
1058
1059
1060
1061
1062
1063
  p->nUsed = j;
}

/*
** Convert blob from cp1252 to utf-8. As cp1252 is a superset
** of iso8895-1, this is useful on UNIX as well.
**
** TODO: the bytes 0x80..0xBF need a special table, iso8895-1 works.
*/








void blob_cp1252_to_utf8(Blob *p){
  unsigned char *z = (unsigned char *)p->aData;
  int j   = p->nUsed;
  int i, n;
  for(i=n=0; i<j; i++){
    if( z[i]>=0x80 ) n++;





  }
  j += n;
  if( j>=p->nAlloc ){
    blob_resize(p, j);
    z = (unsigned char *)p->aData;
  }
  p->nUsed = j;
  z[j] = 0;
  while( j>i ){
    if( z[--i]>=0x80 ){











      z[--j] = 0x80 | (z[i]&0x3F);
      z[--j] = 0xC0 | (z[i]>>6);

    }else{
      z[--j] = z[i];
    }
  }
}

/*







|

>
>
>
>
>
>
>
>





|
>
>
>
>
>










>
>
>
>
>
>
>
>
>
>
>
|
|
>







1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
  p->nUsed = j;
}

/*
** Convert blob from cp1252 to utf-8. As cp1252 is a superset
** of iso8895-1, this is useful on UNIX as well.
**
** This table contains the character translations for 0x80..0xA0.
*/

static const unsigned short cp1252[32] = {
  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
};

void blob_cp1252_to_utf8(Blob *p){
  unsigned char *z = (unsigned char *)p->aData;
  int j   = p->nUsed;
  int i, n;
  for(i=n=0; i<j; i++){
    if( z[i]>=0x80 ){
      if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
        n++;
      }
      n++;
    }
  }
  j += n;
  if( j>=p->nAlloc ){
    blob_resize(p, j);
    z = (unsigned char *)p->aData;
  }
  p->nUsed = j;
  z[j] = 0;
  while( j>i ){
    if( z[--i]>=0x80 ){
      if( z[i]<0xa0 ){
        unsigned short sym = cp1252[z[i]&0x1f];
        if( sym>=0x800 ){
          z[--j] = 0x80 | (sym&0x3f);
          z[--j] = 0x80 | ((sym>>6)&0x3f);
          z[--j] = 0xe0 | (sym>>12);
        }else{
          z[--j] = 0x80 | (sym&0x3f);
          z[--j] = 0xc0 | (sym>>6);
        }
      }else{
        z[--j] = 0x80 | (z[i]&0x3F);
        z[--j] = 0xC0 | (z[i]>>6);
      }
    }else{
      z[--j] = z[i];
    }
  }
}

/*