542 lines
12 KiB
C
542 lines
12 KiB
C
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
*/
|
|
|
|
#include "port.h"
|
|
|
|
#define UNICODE_INTERNAL
|
|
#include "libs/unicode.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include "libs/log.h"
|
|
#include "libs/misc.h"
|
|
|
|
|
|
// Resynchronise (skip everything starting with 0x10xxxxxx):
|
|
static inline void
|
|
resyncUTF8(const unsigned char **ptr) {
|
|
while ((**ptr & 0xc0) == 0x80)
|
|
(*ptr)++;
|
|
}
|
|
|
|
// Get one character from a UTF-8 encoded string.
|
|
// *ptr will point to the start of the next character.
|
|
// Returns 0 if the encoding is bad. This can be distinguished from the
|
|
// '\0' character by checking whether **ptr == '\0' before calling this
|
|
// function.
|
|
UniChar
|
|
getCharFromString(const unsigned char **ptr) {
|
|
UniChar result;
|
|
|
|
if (**ptr < 0x80) {
|
|
// 0xxxxxxx, regular ASCII
|
|
result = **ptr;
|
|
(*ptr)++;
|
|
|
|
return result;
|
|
}
|
|
|
|
if ((**ptr & 0xe0) == 0xc0) {
|
|
// 110xxxxx; 10xxxxxx must follow
|
|
// Value between 0x00000080 and 0x000007ff (inclusive)
|
|
result = **ptr & 0x1f;
|
|
(*ptr)++;
|
|
|
|
if ((**ptr & 0xc0) != 0x80)
|
|
goto err;
|
|
result = (result << 6) | ((**ptr) & 0x3f);
|
|
(*ptr)++;
|
|
|
|
if (result < 0x00000080) {
|
|
// invalid encoding - must reject
|
|
goto err;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
if ((**ptr & 0xf0) == 0xe0) {
|
|
// 1110xxxx; 10xxxxxx 10xxxxxx must follow
|
|
// Value between 0x00000800 and 0x0000ffff (inclusive)
|
|
result = **ptr & 0x0f;
|
|
(*ptr)++;
|
|
|
|
if ((**ptr & 0xc0) != 0x80)
|
|
goto err;
|
|
result = (result << 6) | ((**ptr) & 0x3f);
|
|
(*ptr)++;
|
|
|
|
if ((**ptr & 0xc0) != 0x80)
|
|
goto err;
|
|
result = (result << 6) | ((**ptr) & 0x3f);
|
|
(*ptr)++;
|
|
|
|
if (result < 0x00000800) {
|
|
// invalid encoding - must reject
|
|
goto err;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
if ((**ptr & 0xf8) == 0xf0) {
|
|
// 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow
|
|
// Value between 0x00010000 and 0x0010ffff (inclusive)
|
|
result = **ptr & 0x07;
|
|
(*ptr)++;
|
|
|
|
if ((**ptr & 0xc0) != 0x80)
|
|
goto err;
|
|
result = (result << 6) | ((**ptr) & 0x3f);
|
|
(*ptr)++;
|
|
|
|
if ((**ptr & 0xc0) != 0x80)
|
|
goto err;
|
|
result = (result << 6) | ((**ptr) & 0x3f);
|
|
(*ptr)++;
|
|
|
|
if ((**ptr & 0xc0) != 0x80)
|
|
goto err;
|
|
result = (result << 6) | ((**ptr) & 0x3f);
|
|
(*ptr)++;
|
|
|
|
if (result < 0x00010000) {
|
|
// invalid encoding - must reject
|
|
goto err;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
err:
|
|
log_add(log_Warning, "Warning: Invalid UTF8 sequence.");
|
|
|
|
// Resynchronise (skip everything starting with 0x10xxxxxx):
|
|
resyncUTF8(ptr);
|
|
|
|
return 0;
|
|
}
|
|
|
|
UniChar
|
|
getCharFromStringN(const unsigned char **ptr, const unsigned char *end) {
|
|
size_t numBytes;
|
|
|
|
if (*ptr == end)
|
|
goto err;
|
|
|
|
if (**ptr < 0x80) {
|
|
numBytes = 1;
|
|
} else if ((**ptr & 0xe0) == 0xc0) {
|
|
numBytes = 2;
|
|
} else if ((**ptr & 0xf0) == 0xe0) {
|
|
numBytes = 3;
|
|
} else if ((**ptr & 0xf8) == 0xf0) {
|
|
numBytes = 4;
|
|
} else
|
|
goto err;
|
|
|
|
if (*ptr + numBytes > end)
|
|
goto err;
|
|
|
|
return getCharFromString(ptr);
|
|
|
|
err:
|
|
*ptr = end;
|
|
return 0;
|
|
}
|
|
|
|
// Get one line from a string.
|
|
// A line is terminated with either CRLF (DOS/Windows),
|
|
// LF (Unix, MacOS X), or CR (old MacOS).
|
|
// The end of the string is reached when **startNext == '\0'.
|
|
// NULL is returned if the string is not valid UTF8. In this case
|
|
// *end points to the first invalid character (or the character before if
|
|
// it was a LF), and *startNext to the start of the next (possibly invalid
|
|
// too) character.
|
|
unsigned char *
|
|
getLineFromString(const unsigned char *start, const unsigned char **end,
|
|
const unsigned char **startNext) {
|
|
const unsigned char *ptr = start;
|
|
const unsigned char *lastPtr;
|
|
UniChar ch;
|
|
|
|
// Search for the first newline.
|
|
for (;;) {
|
|
if (*ptr == '\0') {
|
|
*end = ptr;
|
|
*startNext = ptr;
|
|
return (unsigned char *) unconst(start);
|
|
}
|
|
lastPtr = ptr;
|
|
ch = getCharFromString(&ptr);
|
|
if (ch == '\0') {
|
|
// Bad string
|
|
*end = lastPtr;
|
|
*startNext = ptr;
|
|
return NULL;
|
|
}
|
|
if (ch == '\n') {
|
|
*end = lastPtr;
|
|
if (*ptr == '\0'){
|
|
// LF at the end of the string.
|
|
*startNext = ptr;
|
|
return (unsigned char *) unconst(start);
|
|
}
|
|
ch = getCharFromString(&ptr);
|
|
if (ch == '\0') {
|
|
// Bad string
|
|
return NULL;
|
|
}
|
|
if (ch == '\r') {
|
|
// LFCR
|
|
*startNext = ptr;
|
|
} else {
|
|
// LF
|
|
*startNext = *end;
|
|
}
|
|
return (unsigned char *) unconst(start);
|
|
} else if (ch == '\r') {
|
|
*end = lastPtr;
|
|
*startNext = ptr;
|
|
return (unsigned char *) unconst(start);
|
|
} // else: a normal character
|
|
}
|
|
}
|
|
|
|
size_t
|
|
utf8StringCount(const unsigned char *start) {
|
|
size_t count = 0;
|
|
UniChar ch;
|
|
|
|
for (;;) {
|
|
ch = getCharFromString(&start);
|
|
if (ch == '\0')
|
|
return count;
|
|
count++;
|
|
}
|
|
}
|
|
|
|
size_t
|
|
utf8StringCountN(const unsigned char *start, const unsigned char *end) {
|
|
size_t count = 0;
|
|
UniChar ch;
|
|
|
|
for (;;) {
|
|
ch = getCharFromStringN(&start, end);
|
|
if (ch == '\0')
|
|
return count;
|
|
count++;
|
|
}
|
|
}
|
|
|
|
// Locates a unicode character (ch) in a UTF-8 string (pStr)
|
|
// returns the char positions when found
|
|
// -1 when not found
|
|
int
|
|
utf8StringPos (const unsigned char *pStr, UniChar ch)
|
|
{
|
|
int pos;
|
|
|
|
for (pos = 0; *pStr != '\0'; ++pos)
|
|
{
|
|
if (getCharFromString (&pStr) == ch)
|
|
return pos;
|
|
}
|
|
|
|
if (ch == '\0' && *pStr == '\0')
|
|
return pos;
|
|
|
|
return -1;
|
|
}
|
|
|
|
// Safe version of strcpy(), somewhat analogous to strncpy()
|
|
// except it guarantees a 0-term when size > 0
|
|
// when size == 0, returns NULL
|
|
// BUG: this may result in the last character being only partially in the
|
|
// buffer
|
|
unsigned char *
|
|
utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src)
|
|
{
|
|
if (size == 0)
|
|
return 0;
|
|
|
|
strncpy ((char *) dst, (const char *) src, size);
|
|
dst[size - 1] = '\0';
|
|
|
|
return dst;
|
|
}
|
|
|
|
// TODO: this is not implemented with respect to collating order
|
|
int
|
|
utf8StringCompare (const unsigned char *str1, const unsigned char *str2)
|
|
{
|
|
#if 0
|
|
// UniChar comparing version
|
|
UniChar ch1;
|
|
UniChar ch2;
|
|
|
|
for (;;)
|
|
{
|
|
int cmp;
|
|
|
|
ch1 = getCharFromString(&str1);
|
|
ch2 = getCharFromString(&str2);
|
|
if (ch1 == '\0' || ch2 == '\0')
|
|
break;
|
|
|
|
cmp = utf8CompareChar (ch1, ch2);
|
|
if (cmp != 0)
|
|
return cmp;
|
|
}
|
|
|
|
if (ch1 != '\0')
|
|
{
|
|
// ch2 == '\0'
|
|
// str2 ends, str1 continues
|
|
return 1;
|
|
}
|
|
|
|
if (ch2 != '\0')
|
|
{
|
|
// ch1 == '\0'
|
|
// str1 ends, str2 continues
|
|
return -1;
|
|
}
|
|
|
|
// ch1 == '\0' && ch2 == '\0'.
|
|
// Strings match completely.
|
|
return 0;
|
|
#else
|
|
// this will do for now
|
|
return strcmp ((const char *) str1, (const char *) str2);
|
|
#endif
|
|
}
|
|
|
|
unsigned char *
|
|
skipUTF8Chars(const unsigned char *ptr, size_t num) {
|
|
UniChar ch;
|
|
const unsigned char *oldPtr;
|
|
|
|
while (num--) {
|
|
oldPtr = ptr;
|
|
ch = getCharFromString(&ptr);
|
|
if (ch == '\0')
|
|
return (unsigned char *) unconst(oldPtr);
|
|
}
|
|
return (unsigned char *) unconst(ptr);
|
|
}
|
|
|
|
// Decodes a UTF-8 string (start) into a unicode character string (wstr)
|
|
// returns number of chars decoded and stored, not counting 0-term
|
|
// any chars that do not fit are truncated
|
|
// wide string term 0 is always appended, unless the destination
|
|
// buffer is 0 chars long
|
|
size_t
|
|
getUniCharFromStringN(UniChar *wstr, size_t maxcount,
|
|
const unsigned char *start, const unsigned char *end)
|
|
{
|
|
UniChar *next;
|
|
|
|
if (maxcount == 0)
|
|
return 0;
|
|
|
|
// always leave room for 0-term
|
|
--maxcount;
|
|
|
|
for (next = wstr; maxcount > 0; ++next, --maxcount)
|
|
{
|
|
*next = getCharFromStringN(&start, end);
|
|
if (*next == 0)
|
|
break;
|
|
}
|
|
|
|
*next = 0; // term
|
|
|
|
return next - wstr;
|
|
}
|
|
|
|
// See getStringFromWideN() for functionality
|
|
// the only difference is that the source string (start) length is
|
|
// calculated by searching for 0-term
|
|
size_t
|
|
getUniCharFromString(UniChar *wstr, size_t maxcount,
|
|
const unsigned char *start)
|
|
{
|
|
UniChar *next;
|
|
|
|
if (maxcount == 0)
|
|
return 0;
|
|
|
|
// always leave room for 0-term
|
|
--maxcount;
|
|
|
|
for (next = wstr; maxcount > 0; ++next, --maxcount)
|
|
{
|
|
*next = getCharFromString(&start);
|
|
if (*next == 0)
|
|
break;
|
|
}
|
|
|
|
*next = 0; // term
|
|
|
|
return next - wstr;
|
|
}
|
|
|
|
// Encode one wide character into UTF-8
|
|
// returns number of bytes used in the buffer,
|
|
// 0 : invalid or unsupported char
|
|
// <0 : negative of bytes needed if buffer too small
|
|
// string term '\0' is *not* appended or counted
|
|
int
|
|
getStringFromChar(unsigned char *ptr, size_t size, UniChar ch)
|
|
{
|
|
int i;
|
|
static const struct range_def
|
|
{
|
|
UniChar lim;
|
|
int marker;
|
|
int mask;
|
|
}
|
|
ranges[] =
|
|
{
|
|
{0x0000007f, 0x00, 0x7f},
|
|
{0x000007ff, 0xc0, 0x1f},
|
|
{0x0000ffff, 0xe0, 0x0f},
|
|
{0x001fffff, 0xf0, 0x07},
|
|
{0x03ffffff, 0xf8, 0x03},
|
|
{0x7fffffff, 0xfc, 0x01},
|
|
{0x00000000, 0x00, 0x00} // term
|
|
};
|
|
const struct range_def *def;
|
|
|
|
// lookup the range
|
|
for (i = 0, def = ranges; ch > def->lim && def->mask != 0; ++i, ++def)
|
|
;
|
|
if (def->mask == 0)
|
|
{ // invalid or unsupported char
|
|
log_add(log_Warning, "Warning: Invalid or unsupported unicode "
|
|
"char (%lu)", (unsigned long) ch);
|
|
return 0;
|
|
}
|
|
|
|
if ((size_t)i + 1 > size)
|
|
return -(i + 1);
|
|
|
|
// unrolled for speed
|
|
switch (i)
|
|
{
|
|
case 5: ptr[5] = (ch & 0x3f) | 0x80;
|
|
ch >>= 6;
|
|
case 4: ptr[4] = (ch & 0x3f) | 0x80;
|
|
ch >>= 6;
|
|
case 3: ptr[3] = (ch & 0x3f) | 0x80;
|
|
ch >>= 6;
|
|
case 2: ptr[2] = (ch & 0x3f) | 0x80;
|
|
ch >>= 6;
|
|
case 1: ptr[1] = (ch & 0x3f) | 0x80;
|
|
ch >>= 6;
|
|
case 0: ptr[0] = (ch & def->mask) | def->marker;
|
|
}
|
|
|
|
return i + 1;
|
|
}
|
|
|
|
// Encode a wide char string (wstr) into a UTF-8 string (ptr)
|
|
// returns number of bytes used in the buffer (includes 0-term)
|
|
// any chars that do not fit are truncated
|
|
// string term '\0' is always appended, unless the destination
|
|
// buffer is 0 bytes long
|
|
size_t
|
|
getStringFromWideN(unsigned char *ptr, size_t size,
|
|
const UniChar *wstr, size_t count)
|
|
{
|
|
unsigned char *next;
|
|
int used;
|
|
|
|
if (size == 0)
|
|
return 0;
|
|
|
|
// always leave room for 0-term
|
|
--size;
|
|
|
|
for (next = ptr; size > 0 && count > 0;
|
|
size -= used, next += used, --count, ++wstr)
|
|
{
|
|
used = getStringFromChar(next, size, *wstr);
|
|
if (used < 0)
|
|
break; // not enough room
|
|
if (used == 0)
|
|
{ // bad char?
|
|
*next = '?';
|
|
used = 1;
|
|
}
|
|
}
|
|
|
|
*next = '\0'; // term
|
|
|
|
return next - ptr + 1;
|
|
}
|
|
|
|
// See getStringFromWideN() for functionality
|
|
// the only difference is that the source string (wstr) length is
|
|
// calculated by searching for 0-term
|
|
size_t
|
|
getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr)
|
|
{
|
|
const UniChar *end;
|
|
|
|
for (end = wstr; *end != 0; ++end)
|
|
;
|
|
|
|
return getStringFromWideN(ptr, size, wstr, (end - wstr));
|
|
}
|
|
|
|
int
|
|
UniChar_isGraph(UniChar ch)
|
|
{ // this is not technically sufficient, but close enough for us
|
|
// we'll consider all non-control (CO and C1) chars in 'graph' class
|
|
// except for the "Private Use Area" (0xE000 - 0xF8FF)
|
|
|
|
// TODO: The private use area is really only glommed by OS X,
|
|
// and even there, not all of it. (Delete and Backspace both
|
|
// end up producing characters there -- see bug #942 for the
|
|
// gory details.)
|
|
return (ch > 0xa0 && (ch < 0xE000 || ch > 0xF8FF)) ||
|
|
(ch > 0x20 && ch < 0x7f);
|
|
}
|
|
|
|
int
|
|
UniChar_isPrint(UniChar ch)
|
|
{ // this is not technically sufficient, but close enough for us
|
|
// chars in 'print' class are 'graph' + 'space' classes
|
|
// the only space we currently have defined is 0x20
|
|
return (ch == 0x20) || UniChar_isGraph(ch);
|
|
}
|
|
|
|
UniChar
|
|
UniChar_toUpper(UniChar ch)
|
|
{ // this is a very basic Latin-1 implementation
|
|
// just to get things going
|
|
return (ch < 0x100) ? (UniChar) toupper((int) ch) : ch;
|
|
}
|
|
|
|
UniChar
|
|
UniChar_toLower(UniChar ch)
|
|
{ // this is a very basic Latin-1 implementation
|
|
// just to get things going
|
|
return (ch < 0x100) ? (UniChar) tolower((int) ch) : ch;
|
|
}
|
|
|