Update to 1.11.2

This commit is contained in:
dP
2021-05-03 22:10:57 +03:00
parent 5881c752f5
commit ac7d3eba75
103 changed files with 1631 additions and 942 deletions

View File

@@ -192,19 +192,35 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
while (str <= last && *str != '\0') {
size_t len = Utf8EncodedCharLen(*str);
/* If the character is unknown, i.e. encoded length is 0
* we assume worst case for the length check.
* The length check is needed to prevent Utf8Decode to read
* over the terminating '\0' if that happens to be placed
* within the encoding of an UTF8 character. */
if ((len == 0 && str + 4 > last) || str + len > last) break;
WChar c;
len = Utf8Decode(&c, str);
/* It's possible to encode the string termination character
* into a multiple bytes. This prevents those termination
* characters to be skipped */
if (c == '\0') break;
/* If the first byte does not look like the first byte of an encoded
* character, i.e. encoded length is 0, then this byte is definitely bad
* and it should be skipped.
* When the first byte looks like the first byte of an encoded character,
* then the remaining bytes in the string are checked whether the whole
* encoded character can be there. If that is not the case, this byte is
* skipped.
* Finally we attempt to decode the encoded character, which does certain
* extra validations to see whether the correct number of bytes were used
* to encode the character. If that is not the case, the byte is probably
* invalid and it is skipped. We could emit a question mark, but then the
* logic below cannot just copy bytes, it would need to re-encode the
* decoded characters as the length in bytes may have changed.
*
* The goals here is to get as much valid Utf8 encoded characters from the
* source string to the destination string.
*
* Note: a multi-byte encoded termination ('\0') will trigger the encoded
* char length and the decoded length to differ, so it will be ignored as
* invalid character data. If it were to reach the termination, then we
* would also reach the "last" byte of the string and a normal '\0'
* termination will be placed after it.
*/
if (len == 0 || str + len > last || len != Utf8Decode(&c, str)) {
/* Maybe the next byte is still a valid character? */
str++;
continue;
}
if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
/* Copy the character back. Even if dst is current the same as str
@@ -225,6 +241,8 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
}
}
/* String termination, if needed, is left to the caller of this function. */
}
/**