From a2da57aba19b829825d54fa14cd004703ee33e56 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 30 Oct 2023 14:50:00 +0100 Subject: [PATCH] Compose: early detection of invalid encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also move “unrecognized token” error message before skipping the line, in order to fix token position. --- src/compose/parser.c | 17 ++++++---- src/scanner-utils.h | 29 +++++++++++++++++ src/utils.h | 6 ++++ test/compose.c | 77 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 122 insertions(+), 7 deletions(-) diff --git a/src/compose/parser.c b/src/compose/parser.c index 4112baa..ac11446 100644 --- a/src/compose/parser.c +++ b/src/compose/parser.c @@ -57,8 +57,8 @@ OR PERFORMANCE OF THIS SOFTWARE. #include #include "utils.h" -#include "scanner-utils.h" #include "table.h" +#include "scanner-utils.h" #include "paths.h" #include "utf8.h" #include "parser.h" @@ -244,10 +244,9 @@ skip_more_whitespace_and_comments: return TOK_IDENT; } + scanner_err(s, "unrecognized token"); /* Discard rest of line. */ scanner_skip_to_eol(s); - - scanner_err(s, "unrecognized token"); return TOK_ERROR; } @@ -527,9 +526,15 @@ parse(struct xkb_compose_table *table, struct scanner *s, enum { MAX_ERRORS = 10 }; int num_errors = 0; - /* Skip UTF-8 encoded BOM (U+FEFF) */ - /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */ - scanner_str(s, "\xef\xbb\xbf", 3); + /* Basic detection of wrong character encoding. + The first character relevant to the grammar must be ASCII: + whitespace, include, modifier list, keysym, comment */ + if (!scanner_check_supported_char_encoding(s)) { + scanner_err(s, + "This could be a file encoding issue. 
" + "Supported file encodings are ASCII and UTF-8."); + goto fail; + } initial: production.len = 0; diff --git a/src/scanner-utils.h b/src/scanner-utils.h index 674ecaa..d9d2b42 100644 --- a/src/scanner-utils.h +++ b/src/scanner-utils.h @@ -212,4 +212,33 @@ scanner_hex(struct scanner *s, uint8_t *out) return i > 0; } +/* Basic detection of wrong character encoding based on the first bytes */ +static inline bool +scanner_check_supported_char_encoding(struct scanner *scanner) +{ + /* Skip UTF-8 encoded BOM (U+FEFF) + * See: https://www.unicode.org/faq/utf_bom.html#bom5 */ + if (scanner_str(scanner, "\xef\xbb\xbf", 3) || scanner->len < 2) { + /* Assume UTF-8 encoding or trivial short input */ + return true; + } + + /* Early detection of wrong file encoding, e.g. UTF-16 or UTF-32 */ + if (scanner->s[0] == '\0' || scanner->s[1] == '\0') { + if (scanner->s[0] != '\0') + scanner->token_column++; + scanner_err(scanner, "unexpected NULL character."); + return false; + } + /* Enforce the first character to be ASCII. + See the note before the use of this function, that explains the relevant + parts of the grammars of rules, keymap components and Compose. */ + if (!is_ascii(scanner->s[0])) { + scanner_err(scanner, "unexpected non-ASCII character."); + return false; + } + + return true; +} + #endif diff --git a/src/utils.h b/src/utils.h index aa7969c..d6efa51 100644 --- a/src/utils.h +++ b/src/utils.h @@ -173,6 +173,12 @@ strndup(const char *s, size_t n) #endif /* ctype.h is locale-dependent and has other oddities. */ +static inline bool +is_ascii(char ch) +{ + return (ch & ~0x7f) == 0; +} + static inline bool is_space(char ch) { diff --git a/test/compose.c b/test/compose.c index d7192f6..56bd889 100644 --- a/test/compose.c +++ b/test/compose.c @@ -175,12 +175,86 @@ test_compose_seq_buffer(struct xkb_context *ctx, const char *buffer, ...) 
static void test_compose_utf8_bom(struct xkb_context *ctx) { - const char *buffer = "\xef\xbb\xbf : X"; + const char buffer[] = "\xef\xbb\xbf : X"; assert(test_compose_seq_buffer(ctx, buffer, XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "X", XKB_KEY_X, XKB_KEY_NoSymbol)); } +static void +test_invalid_encodings(struct xkb_context *ctx) +{ + struct xkb_compose_table *table; + + /* ISO 8859-1 (latin1) */ + const char iso_8859_1[] = " : \"\xe1\" acute"; + assert(!test_compose_seq_buffer(ctx, iso_8859_1, + XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\xc3\xa1", XKB_KEY_acute, + XKB_KEY_NoSymbol)); + + /* UTF-16LE */ + const char utf_16_le[] = + "<\0A\0>\0 \0:\0 \0X\0\n\0" + "<\0B\0>\0 \0:\0 \0Y\0"; + table = xkb_compose_table_new_from_buffer(ctx, + utf_16_le, sizeof(utf_16_le), "", + XKB_COMPOSE_FORMAT_TEXT_V1, + XKB_COMPOSE_COMPILE_NO_FLAGS); + assert(!table); + + /* UTF-16BE */ + const char utf_16_be[] = + "\0<\0A\0>\0 \0:\0 \0X\0\n" + "\0<\0B\0>\0 \0:\0 \0Y"; + table = xkb_compose_table_new_from_buffer(ctx, + utf_16_be, sizeof(utf_16_be), "", + XKB_COMPOSE_FORMAT_TEXT_V1, + XKB_COMPOSE_COMPILE_NO_FLAGS); + assert(!table); + + /* UTF-16BE with BOM */ + const char utf_16_be_bom[] = + "\xfe\xff" + "\0<\0A\0>\0 \0:\0 \0X\0\n" + "\0<\0B\0>\0 \0:\0 \0Y"; + table = xkb_compose_table_new_from_buffer(ctx, + utf_16_be_bom, sizeof(utf_16_be_bom), "", + XKB_COMPOSE_FORMAT_TEXT_V1, + XKB_COMPOSE_COMPILE_NO_FLAGS); + assert(!table); + + /* UTF-32LE */ + const char utf_32_le[] = + "<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0" + "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0"; + table = xkb_compose_table_new_from_buffer(ctx, + utf_32_le, sizeof(utf_32_le), "", + XKB_COMPOSE_FORMAT_TEXT_V1, + XKB_COMPOSE_COMPILE_NO_FLAGS); + assert(!table); + + /* UTF-32LE with BOM */ + const char utf_32_le_bom[] = + "\xff\xfe\0\0" + "<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0" + "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0"; + 
table = xkb_compose_table_new_from_buffer(ctx, + utf_32_le_bom, sizeof(utf_32_le_bom), "", + XKB_COMPOSE_FORMAT_TEXT_V1, + XKB_COMPOSE_COMPILE_NO_FLAGS); + assert(!table); + + /* UTF-32BE */ + const char utf_32_be[] = + "\0\0\0<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0" + "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y"; + table = xkb_compose_table_new_from_buffer(ctx, + utf_32_be, sizeof(utf_32_be), "", + XKB_COMPOSE_FORMAT_TEXT_V1, + XKB_COMPOSE_COMPILE_NO_FLAGS); + assert(!table); +} + static void test_seqs(struct xkb_context *ctx) @@ -734,6 +808,7 @@ main(int argc, char *argv[]) #endif test_compose_utf8_bom(ctx); + test_invalid_encodings(ctx); test_seqs(ctx); test_conflicting(ctx); test_XCOMPOSEFILE(ctx);