gdal/ogr/generate_encoding_table.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

244 lines
6.9 KiB
C
Raw Normal View History

2022-05-14 03:28:14 +08:00
/******************************************************************************
 * $Id$
 *
 * Project:  OGR
 * Purpose:  Generate a mapping table from a 1-byte encoding to unicode,
 *           for ogr_expat.cpp
 * Author:   Even Rouault, even dot rouault at spatialys.com
 *
 ******************************************************************************
 * Copyright (c) 2012, Even Rouault <even dot rouault at spatialys.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
2024-04-24 11:11:49 +08:00
/**
 * Decode the UTF-8 sequence that starts at p into one Unicode code point.
 *
 * @param p   First byte of the sequence. This byte is read before any
 *            comparison against end, so one byte must be readable at p even
 *            when p == end.
 * @param end One past the last byte that may be consumed as a continuation
 *            byte.
 * @param len Receives the number of bytes consumed (1 to 4); it is set to 1
 *            when the sequence is rejected.
 * @return The decoded code point. Stray continuation bytes, overlong forms,
 *         truncated sequences and values above U+10FFFF are rejected: the
 *         result is then U+FFFD, or the offending lead byte itself when
 *         ERRORS_TO_ISO8859_1 is set.
 *
 * Compile-time switches:
 *  - ERRORS_TO_CP1252: bytes 0x80..0x9f are looked up in a cp1252 table
 *    (which must be provided elsewhere) instead of being rejected.
 *  - STRICT_RFC3629: additionally rejects surrogates and code points ending
 *    in FFFE / FFFF.
 */
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    /* Work on unsigned bytes so comparisons do not depend on char signedness. */
    const unsigned char *s = (const unsigned char *)p;
    const unsigned char c = s[0];

    if (c < 0x80)
    {
        /* Plain ASCII. */
        *len = 1;
        return c;
    }
#if ERRORS_TO_CP1252
    else if (c < 0xa0)
    {
        *len = 1;
        return cp1252[c - 0x80];
    }
#endif
    else if (c < 0xc2)
    {
        /* Continuation byte used as a lead byte, or overlong 2-byte lead. */
        goto FAIL;
    }

    /* Every remaining form needs at least one valid continuation byte.
     * Lengths are compared as (end - p) so that no pointer beyond
     * one-past-the-end is ever formed. */
    if (end - p < 2 || (s[1] & 0xc0) != 0x80)
        goto FAIL;

    if (c < 0xe0)
    {
        *len = 2;
        return ((s[0] & 0x1f) << 6) + ((s[1] & 0x3f));
    }
    else if (c == 0xe0)
    {
        /* E0 80..9F would be an overlong encoding of a 2-byte value. */
        if (s[1] < 0xa0)
            goto FAIL;
        goto UTF8_3;
    }
#if STRICT_RFC3629
    else if (c == 0xed)
    {
        // RFC 3629 says surrogate chars are illegal.
        if (s[1] >= 0xa0)
            goto FAIL;
        goto UTF8_3;
    }
    else if (c == 0xef)
    {
        // 0xfffe and 0xffff are also illegal characters.
        // Only look at the third byte when it lies inside [p, end); a
        // truncated sequence is rejected by the length check at UTF8_3.
        if (end - p >= 3 && s[1] == 0xbf && s[2] >= 0xbe)
            goto FAIL;
        goto UTF8_3;
    }
#endif
    else if (c < 0xf0)
    {
    UTF8_3:
        if (end - p < 3 || (s[2] & 0xc0) != 0x80)
            goto FAIL;
        *len = 3;
        return ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + ((s[2] & 0x3f));
    }
    else if (c == 0xf0)
    {
        /* F0 80..8F would be an overlong encoding of a 3-byte value. */
        if (s[1] < 0x90)
            goto FAIL;
        goto UTF8_4;
    }
    else if (c < 0xf4)
    {
    UTF8_4:
        if (end - p < 4 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80)
            goto FAIL;
        *len = 4;
#if STRICT_RFC3629
        // RFC 3629 says all codes ending in fffe or ffff are illegal:
        if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe)
            goto FAIL;
#endif
        return ((s[0] & 0x07) << 18) + ((s[1] & 0x3f) << 12) +
               ((s[2] & 0x3f) << 6) + ((s[3] & 0x3f));
    }
    else if (c == 0xf4)
    {
        if (s[1] > 0x8f)
            goto FAIL;  // after 0x10ffff
        goto UTF8_4;
    }
    else
    {
    FAIL:
        *len = 1;
#if ERRORS_TO_ISO8859_1
        return c;
#else
        return 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif
    }
}
2024-04-24 11:11:49 +08:00
/**
 * Print, as C source on stdout, the pending run of identity mappings
 * [nFirst, nEnd): one assignment when the run holds a single value, a for
 * loop otherwise.
 */
static void EmitIdenticalRun(int nFirst, int nEnd)
{
    if (nFirst + 1 == nEnd)
    {
        printf("info->map[0x%02X] = 0x%02X;\n", nFirst, nFirst);
    }
    else
    {
        printf("for(i = 0x%02X; i < 0x%02X; i++)\n", nFirst, nEnd);
        printf(" info->map[i] = i;\n");
    }
}

/**
 * Usage: generate_encoding_table encoding_name
 *
 * Converts each byte value 0..255 from the encoding named in argv[1] to
 * UTF-8 with iconv, decodes the result, and prints "info->map[...] = ...;"
 * statements on stdout. Consecutive bytes that map to themselves are
 * collapsed into a single for loop; bytes that iconv rejects with EILSEQ, or
 * that decode to U+FFFD, are emitted as -1.
 *
 * @return 0 on success; 1 on a usage error, when iconv_open() fails, or when
 *         iconv() fails with an error other than EILSEQ.
 */
int main(int argc, char *argv[])
{
    const char *pszDstEncoding = "UTF-8";
    const char *pszSrcEncoding;
    iconv_t sConv;
    int nLastIdentical = -1; /* start of the pending identity run, or -1 */
    int i;

    if (argc != 2)
    {
        fprintf(stderr, "Usage: generate_encoding_table encoding_name\n");
        return 1;
    }
    pszSrcEncoding = argv[1];

    sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
    if (sConv == (iconv_t)-1)
    {
        fprintf(stderr, "Recode from %s to %s failed with the error: \"%s\".\n",
                pszSrcEncoding, pszDstEncoding, strerror(errno));
        return 1;
    }

    for (i = 0; i < 256; i++)
    {
        char szSrcBuf[2] = {(char)i, 0};
        char szDstBuf[5] = {0, 0, 0, 0, 0};
        char *pszSrcBuf = szSrcBuf;
        char *pszDstBuf = szDstBuf;
        /* strlen() yields 0 for i == 0, so that byte is not fed to iconv;
         * the zero-filled output buffer then decodes as code point 0. */
        size_t nSrcLen = strlen(szSrcBuf);
        size_t nDstLen = sizeof(szDstBuf);
        int nUnicode = -1;

        if (iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen) ==
            (size_t)-1)
        {
            /* Capture errno before any other library call can change it. */
            const int nErr = errno;

            if (nErr != EILSEQ)
            {
                if (nErr == E2BIG)
                    fprintf(stderr, "E2BIG for %d\n", i);
                else
                    fprintf(stderr, "other error for %d\n", i);
                iconv_close(sConv);
                return 1;
            }
            /* EILSEQ: the byte has no mapping; nUnicode stays at -1. */
        }
        else
        {
            int len;

            /* iconv() advanced pszDstBuf past the bytes it wrote. Use it as
             * the end of data rather than strlen(): all 5 bytes of szDstBuf
             * are offered to iconv, so a terminating NUL is not guaranteed. */
            nUnicode = (int)utf8decode(szDstBuf, pszDstBuf, &len);
            if (nUnicode == 0xfffd)
                nUnicode = -1;
        }

        /* A non-identity mapping ends the pending identity run. */
        if (nLastIdentical >= 0 && i != nUnicode)
        {
            EmitIdenticalRun(nLastIdentical, i);
            nLastIdentical = -1;
        }

        if (nUnicode < 0)
        {
            printf("info->map[0x%02X] = -1;\n", i);
        }
        else if (nUnicode <= 0xFF)
        {
            if (i == nUnicode)
            {
                /* Identity mapping: start a run, or extend the current one. */
                if (nLastIdentical < 0)
                    nLastIdentical = i;
            }
            else
            {
                printf("info->map[0x%02X] = 0x%02X;\n", i, nUnicode);
            }
        }
        else if (nUnicode <= 0xFFFF)
        {
            printf("info->map[0x%02X] = 0x%04X;\n", i, nUnicode);
        }
        else if (nUnicode <= 0xFFFFFF)
        {
            printf("info->map[0x%02X] = 0x%06X;\n", i, nUnicode);
        }
        else
        {
            printf("info->map[0x%02X] = 0x%08X;\n", i, nUnicode);
        }
    }

    /* Flush a run that extends to the end of the table (i is 256 here). */
    if (nLastIdentical >= 0)
        EmitIdenticalRun(nLastIdentical, i);

    iconv_close(sConv);
    return 0;
}