Update to 1.11.2
This commit is contained in:
@@ -192,19 +192,35 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
|
||||
|
||||
while (str <= last && *str != '\0') {
|
||||
size_t len = Utf8EncodedCharLen(*str);
|
||||
/* If the character is unknown, i.e. encoded length is 0
|
||||
* we assume worst case for the length check.
|
||||
* The length check is needed to prevent Utf8Decode from reading
|
||||
* over the terminating '\0' if that happens to be placed
|
||||
* within the encoding of a UTF8 character. */
|
||||
if ((len == 0 && str + 4 > last) || str + len > last) break;
|
||||
|
||||
WChar c;
|
||||
len = Utf8Decode(&c, str);
|
||||
/* It's possible to encode the string termination character
|
||||
* into multiple bytes. This prevents those termination
|
||||
* characters from being skipped */
|
||||
if (c == '\0') break;
|
||||
/* If the first byte does not look like the first byte of an encoded
|
||||
* character, i.e. encoded length is 0, then this byte is definitely bad
|
||||
* and it should be skipped.
|
||||
* When the first byte looks like the first byte of an encoded character,
|
||||
* then the remaining bytes in the string are checked whether the whole
|
||||
* encoded character can be there. If that is not the case, this byte is
|
||||
* skipped.
|
||||
* Finally we attempt to decode the encoded character, which does certain
|
||||
* extra validations to see whether the correct number of bytes were used
|
||||
* to encode the character. If that is not the case, the byte is probably
|
||||
* invalid and it is skipped. We could emit a question mark, but then the
|
||||
* logic below cannot just copy bytes, it would need to re-encode the
|
||||
* decoded characters as the length in bytes may have changed.
|
||||
*
|
||||
* The goal here is to get as many valid Utf8 encoded characters from the
|
||||
* source string to the destination string.
|
||||
*
|
||||
* Note: a multi-byte encoded termination ('\0') will trigger the encoded
|
||||
* char length and the decoded length to differ, so it will be ignored as
|
||||
* invalid character data. If it were to reach the termination, then we
|
||||
* would also reach the "last" byte of the string and a normal '\0'
|
||||
* termination will be placed after it.
|
||||
*/
|
||||
if (len == 0 || str + len > last || len != Utf8Decode(&c, str)) {
|
||||
/* Maybe the next byte is still a valid character? */
|
||||
str++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
|
||||
/* Copy the character back. Even if dst is currently the same as str
|
||||
@@ -225,6 +241,8 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
|
||||
if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
|
||||
}
|
||||
}
|
||||
|
||||
/* String termination, if needed, is left to the caller of this function. */
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user