Do not interpret nor emit invalid Unicode encoding forms
Surrogates are invalid in both UTF-32 and UTF-8. See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875 and https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G31703
master
parent
5fbffaf035
commit
183761ac24
|
@ -41,6 +41,8 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
|
#define NO_KEYSYM_UNICODE_CONVERSION 0
|
||||||
|
|
||||||
/* We don't use the uint32_t types here, to save some space. */
|
/* We don't use the uint32_t types here, to save some space. */
|
||||||
struct codepair {
|
struct codepair {
|
||||||
uint16_t keysym;
|
uint16_t keysym;
|
||||||
|
@ -847,7 +849,7 @@ bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* no matching Unicode value found in table */
|
/* no matching Unicode value found in table */
|
||||||
return 0;
|
return NO_KEYSYM_UNICODE_CONVERSION;
|
||||||
}
|
}
|
||||||
|
|
||||||
XKB_EXPORT uint32_t
|
XKB_EXPORT uint32_t
|
||||||
|
@ -871,6 +873,13 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
|
||||||
return keysym & 0x7f;
|
return keysym & 0x7f;
|
||||||
|
|
||||||
/* also check for directly encoded Unicode codepoints */
|
/* also check for directly encoded Unicode codepoints */
|
||||||
|
|
||||||
|
/* Exclude surrogates: they are invalid in UTF-32.
|
||||||
|
* See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
|
||||||
|
* for further details.
|
||||||
|
*/
|
||||||
|
if (0x0100d800 <= keysym && keysym <= 0x0100dfff)
|
||||||
|
return NO_KEYSYM_UNICODE_CONVERSION;
|
||||||
/*
|
/*
|
||||||
* In theory, this is supposed to start from 0x100100, such that the ASCII
|
* In theory, this is supposed to start from 0x100100, such that the ASCII
|
||||||
* range, which is already covered by 0x00-0xff, can't be encoded in two
|
* range, which is already covered by 0x00-0xff, can't be encoded in two
|
||||||
|
@ -900,7 +909,8 @@ xkb_utf32_to_keysym(uint32_t ucs)
|
||||||
return XKB_KEY_Delete;
|
return XKB_KEY_Delete;
|
||||||
|
|
||||||
/* Unicode non-symbols and code points outside Unicode planes */
|
/* Unicode non-symbols and code points outside Unicode planes */
|
||||||
if ((ucs >= 0xfdd0 && ucs <= 0xfdef) ||
|
if ((ucs >= 0xd800 && ucs <= 0xdfff) ||
|
||||||
|
(ucs >= 0xfdd0 && ucs <= 0xfdef) ||
|
||||||
ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
|
ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
|
||||||
return XKB_KEY_NoSymbol;
|
return XKB_KEY_NoSymbol;
|
||||||
|
|
||||||
|
@ -948,7 +958,7 @@ xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
|
||||||
|
|
||||||
codepoint = xkb_keysym_to_utf32(keysym);
|
codepoint = xkb_keysym_to_utf32(keysym);
|
||||||
|
|
||||||
if (codepoint == 0)
|
if (codepoint == NO_KEYSYM_UNICODE_CONVERSION)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
return utf32_to_utf8(codepoint, buffer);
|
return utf32_to_utf8(codepoint, buffer);
|
||||||
|
|
16
src/utf8.c
16
src/utf8.c
|
@ -32,6 +32,11 @@
|
||||||
|
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
|
/* Conformant encoding form conversion from UTF-32 to UTF-8.
|
||||||
|
*
|
||||||
|
* See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
|
||||||
|
* for further details.
|
||||||
|
*/
|
||||||
int
|
int
|
||||||
utf32_to_utf8(uint32_t unichar, char *buffer)
|
utf32_to_utf8(uint32_t unichar, char *buffer)
|
||||||
{
|
{
|
||||||
|
@ -47,6 +52,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
|
||||||
length = 2;
|
length = 2;
|
||||||
head = 0xc0;
|
head = 0xc0;
|
||||||
}
|
}
|
||||||
|
/* Handle surrogates */
|
||||||
|
else if (0xd800 <= unichar && unichar <= 0xdfff) {
|
||||||
|
goto ill_formed_code_unit_subsequence;
|
||||||
|
}
|
||||||
else if (unichar <= 0xffff) {
|
else if (unichar <= 0xffff) {
|
||||||
length = 3;
|
length = 3;
|
||||||
head = 0xe0;
|
head = 0xe0;
|
||||||
|
@ -56,8 +65,7 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
|
||||||
head = 0xf0;
|
head = 0xf0;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
buffer[0] = '\0';
|
goto ill_formed_code_unit_subsequence;
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
|
for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
|
||||||
|
@ -67,6 +75,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
|
||||||
buffer[length] = '\0';
|
buffer[length] = '\0';
|
||||||
|
|
||||||
return length + 1;
|
return length + 1;
|
||||||
|
|
||||||
|
ill_formed_code_unit_subsequence:
|
||||||
|
buffer[0] = '\0';
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
|
|
|
@ -222,6 +222,8 @@ main(void)
|
||||||
|
|
||||||
assert(test_utf8(0x10005d0, "א"));
|
assert(test_utf8(0x10005d0, "א"));
|
||||||
assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf"));
|
assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf"));
|
||||||
|
assert(test_utf8(0x0100d800, NULL) == 0); // Unicode surrogates
|
||||||
|
assert(test_utf8(0x0100dfff, NULL) == 0); // Unicode surrogates
|
||||||
assert(test_utf8(0x1110000, NULL) == 0);
|
assert(test_utf8(0x1110000, NULL) == 0);
|
||||||
|
|
||||||
assert(test_utf32_to_keysym('y', XKB_KEY_y));
|
assert(test_utf32_to_keysym('y', XKB_KEY_y));
|
||||||
|
@ -255,6 +257,8 @@ main(void)
|
||||||
assert(test_utf32_to_keysym(0x20ac, XKB_KEY_EuroSign));
|
assert(test_utf32_to_keysym(0x20ac, XKB_KEY_EuroSign));
|
||||||
|
|
||||||
// Unicode non-characters
|
// Unicode non-characters
|
||||||
|
assert(test_utf32_to_keysym(0xd800, XKB_KEY_NoSymbol)); // Unicode surrogates
|
||||||
|
assert(test_utf32_to_keysym(0xdfff, XKB_KEY_NoSymbol)); // Unicode surrogates
|
||||||
assert(test_utf32_to_keysym(0xfdd0, XKB_KEY_NoSymbol));
|
assert(test_utf32_to_keysym(0xfdd0, XKB_KEY_NoSymbol));
|
||||||
assert(test_utf32_to_keysym(0xfdef, XKB_KEY_NoSymbol));
|
assert(test_utf32_to_keysym(0xfdef, XKB_KEY_NoSymbol));
|
||||||
assert(test_utf32_to_keysym(0xfffe, XKB_KEY_NoSymbol));
|
assert(test_utf32_to_keysym(0xfffe, XKB_KEY_NoSymbol));
|
||||||
|
|
|
@ -170,6 +170,8 @@ test_utf32_to_utf8(void)
|
||||||
check_utf32_to_utf8(0x40, 2, "\x40");
|
check_utf32_to_utf8(0x40, 2, "\x40");
|
||||||
check_utf32_to_utf8(0xA1, 3, "\xc2\xa1");
|
check_utf32_to_utf8(0xA1, 3, "\xc2\xa1");
|
||||||
check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81");
|
check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81");
|
||||||
|
check_utf32_to_utf8(0xd800, 0, ""); // Unicode surrogates
|
||||||
|
check_utf32_to_utf8(0xdfff, 0, ""); // Unicode surrogates
|
||||||
check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84");
|
check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84");
|
||||||
check_utf32_to_utf8(0x110000, 0, "");
|
check_utf32_to_utf8(0x110000, 0, "");
|
||||||
check_utf32_to_utf8(0xffffffff, 0, "");
|
check_utf32_to_utf8(0xffffffff, 0, "");
|
||||||
|
|
Loading…
Reference in New Issue