gdal/ogr/generate_encoding_table.c

244 lines
6.9 KiB
C

/******************************************************************************
* $Id$
*
* Project: OGR
* Purpose: Generate a mapping table from a 1-byte encoding to unicode,
* for ogr_expat.cpp
* Author: Even Rouault, even dot rouault at spatialys.com
*
******************************************************************************
* Copyright (c) 2012, Even Rouault <even dot rouault at spatialys.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
****************************************************************************/
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
 * Decode the single UTF-8 sequence that starts at p.
 *
 * @param p   First byte of the sequence. It is read unconditionally, so it
 *            must point at readable memory even when p == end.
 * @param end One past the last byte that may be consumed; continuation
 *            bytes at or beyond this address are never read.
 * @param len Receives the number of bytes consumed: 1 to 4 on success,
 *            always 1 on failure.
 * @return The decoded code point. Malformed, overlong or truncated input
 *         (and anything above 0x10ffff) yields 0xfffd, or the raw lead byte
 *         when ERRORS_TO_ISO8859_1 is set.
 *
 * Compile-time switches (an undefined macro evaluates as 0 in #if):
 *  - ERRORS_TO_CP1252: bytes 0x80..0x9f are looked up in a cp1252[] table;
 *    that table is not defined in the visible part of this file.
 *  - STRICT_RFC3629: additionally rejects surrogates and code points whose
 *    low 16 bits are fffe or ffff.
 *  - ERRORS_TO_ISO8859_1: selects the failure return value (see above).
 */
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    /* Work on unsigned bytes so comparisons do not depend on whether plain
     * char is signed. */
    const unsigned char *s = (const unsigned char *)p;
    const unsigned char c = s[0];

    if (c < 0x80)
    {
        *len = 1;
        return c;
    }
#if ERRORS_TO_CP1252
    if (c < 0xa0)
    {
        *len = 1;
        return cp1252[c - 0x80];
    }
#endif
    /* 0x80..0xbf are stray continuation bytes; 0xc0 and 0xc1 could only
     * start overlong 2-byte encodings. */
    if (c < 0xc2)
        goto FAIL;

    /* Every remaining lead byte needs at least one continuation byte. */
    if (p + 1 >= end || (s[1] & 0xc0) != 0x80)
        goto FAIL;

    if (c < 0xe0)
    {
        *len = 2;
        return ((unsigned)(c & 0x1f) << 6) + (unsigned)(s[1] & 0x3f);
    }

    if (c < 0xf0)
    {
        /* 0xe0 followed by a byte below 0xa0 is an overlong encoding. */
        if (c == 0xe0 && s[1] < 0xa0)
            goto FAIL;
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (c == 0xed && s[1] >= 0xa0)
            goto FAIL;
        // 0xfffe and 0xffff are also illegal characters. The third byte is
        // only inspected once it is known to lie before end.
        if (c == 0xef && s[1] == 0xbf && p + 2 < end && s[2] >= 0xbe)
            goto FAIL;
#endif
        if (p + 2 >= end || (s[2] & 0xc0) != 0x80)
            goto FAIL;
        *len = 3;
        return ((unsigned)(c & 0x0f) << 12) + ((unsigned)(s[1] & 0x3f) << 6) +
               (unsigned)(s[2] & 0x3f);
    }

    if (c <= 0xf4)
    {
        /* 0xf0 followed by a byte below 0x90 is an overlong encoding. */
        if (c == 0xf0 && s[1] < 0x90)
            goto FAIL;
        /* 0xf4 followed by a byte above 0x8f lies after 0x10ffff. */
        if (c == 0xf4 && s[1] > 0x8f)
            goto FAIL;
        if (p + 3 >= end || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80)
            goto FAIL;
#if STRICT_RFC3629
        // RFC 3629 says all codes ending in fffe or ffff are illegal:
        if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe)
            goto FAIL;
#endif
        *len = 4;
        return ((unsigned)(c & 0x07) << 18) + ((unsigned)(s[1] & 0x3f) << 12) +
               ((unsigned)(s[2] & 0x3f) << 6) + (unsigned)(s[3] & 0x3f);
    }

    /* Lead bytes 0xf5..0xff fall through to the failure path. */
FAIL:
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return c;
#else
    return 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif
}
/* Print the statement(s) that map every byte value in [nFirst, nEnd) to
 * itself: a single assignment for a run of one, a for loop otherwise. */
static void EmitIdentityRun(int nFirst, int nEnd)
{
    if (nFirst + 1 == nEnd)
    {
        printf("info->map[0x%02X] = 0x%02X;\n", nFirst, nFirst);
    }
    else
    {
        printf("for(i = 0x%02X; i < 0x%02X; i++)\n", nFirst, nEnd);
        printf(" info->map[i] = i;\n");
    }
}

/**
 * Entry point: generate_encoding_table encoding_name
 *
 * Converts each byte value 0..255 from the encoding named by argv[1] to
 * UTF-8 with iconv, decodes the result to a code point, and prints
 * "info->map[...] = ...;" statements on stdout. Consecutive bytes that map
 * to themselves are collapsed into one assignment or one for loop. Bytes
 * that iconv rejects with EILSEQ, that produce no output, or that decode to
 * 0xfffd are emitted as -1.
 *
 * @return 0 on success, 1 on a usage error, an iconv_open failure, or any
 *         iconv error other than EILSEQ.
 */
int main(int argc, char *argv[])
{
    iconv_t sConv;
    const char *pszSrcEncoding;
    const char *pszDstEncoding = "UTF-8";
    int i;
    int nLastIdentical = -1; /* start of the current identity run, or -1 */

    if (argc != 2)
    {
        fprintf(stderr, "Usage: generate_encoding_table encoding_name\n");
        return 1;
    }

    pszSrcEncoding = argv[1];
    sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
    if (sConv == (iconv_t)-1)
    {
        fprintf(stderr, "Recode from %s to %s failed with the error: \"%s\".\n",
                pszSrcEncoding, pszDstEncoding, strerror(errno));
        return 1;
    }

    for (i = 0; i < 256; i++)
    {
        char szSrcBuf[1];
        char szDstBuf[5] = {0, 0, 0, 0, 0};
        char *pszSrcBuf = szSrcBuf;
        char *pszDstBuf = szDstBuf;
        /* Always hand exactly one byte to iconv. Measuring the input with
         * strlen() would give a length of 0 for byte 0x00, so that byte
         * would never actually be converted. */
        size_t nSrcLen = sizeof(szSrcBuf);
        size_t nDstLen = sizeof(szDstBuf);
        int nUnicode = -1;

        szSrcBuf[0] = (char)i;

        if (iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen) ==
            (size_t)-1)
        {
            /* Save errno before any other library call can overwrite it. */
            const int nErrno = errno;

            if (nErrno != EILSEQ)
            {
                if (nErrno == E2BIG)
                    fprintf(stderr, "E2BIG for %d\n", i);
                else
                    fprintf(stderr, "other error for %d\n", i);
                /* Release the descriptor on the error path too. */
                iconv_close(sConv);
                return 1;
            }
            /* EILSEQ: the byte has no mapping; nUnicode stays -1. */
        }
        else if (pszDstBuf > szDstBuf)
        {
            int len;

            /* iconv advanced pszDstBuf past the bytes it wrote, so it marks
             * the true end of the output; no NUL terminator is needed. */
            nUnicode = (int)utf8decode(szDstBuf, pszDstBuf, &len);
            if (nUnicode == 0xfffd)
                nUnicode = -1;
        }
        /* else: conversion succeeded but produced no output; treat the
         * byte as unmapped rather than as NUL. */

        /* The identity run ends at the first byte that is not its own
         * code point: flush it before handling this byte. */
        if (nLastIdentical >= 0 && i != nUnicode)
        {
            EmitIdentityRun(nLastIdentical, i);
            nLastIdentical = -1;
        }

        if (nUnicode < 0)
        {
            printf("info->map[0x%02X] = -1;\n", i);
        }
        else if (nUnicode <= 0xFF)
        {
            if (i == nUnicode)
            {
                /* Defer output: start or extend the identity run. */
                if (nLastIdentical < 0)
                    nLastIdentical = i;
            }
            else
            {
                printf("info->map[0x%02X] = 0x%02X;\n", i, nUnicode);
            }
        }
        else if (nUnicode <= 0xFFFF)
        {
            printf("info->map[0x%02X] = 0x%04X;\n", i, nUnicode);
        }
        else if (nUnicode <= 0xFFFFFF)
        {
            printf("info->map[0x%02X] = 0x%06X;\n", i, nUnicode);
        }
        else
        {
            printf("info->map[0x%02X] = 0x%08X;\n", i, nUnicode);
        }
    }

    /* Flush an identity run that extends to the end of the table. */
    if (nLastIdentical >= 0)
        EmitIdentityRun(nLastIdentical, i);

    iconv_close(sConv);
    return 0;
}