Compose: early detection of invalid encoding

Also move the “unrecognized token” error message before skipping the
line, in order to fix the token position.
master
Pierre Le Marre 2023-10-30 14:50:00 +01:00 committed by Wismill
parent 6c54681fe7
commit a2da57aba1
4 changed files with 122 additions and 7 deletions

View File

@ -57,8 +57,8 @@ OR PERFORMANCE OF THIS SOFTWARE.
#include <errno.h>
#include "utils.h"
#include "scanner-utils.h"
#include "table.h"
#include "scanner-utils.h"
#include "paths.h"
#include "utf8.h"
#include "parser.h"
@ -244,10 +244,9 @@ skip_more_whitespace_and_comments:
return TOK_IDENT;
}
scanner_err(s, "unrecognized token");
/* Discard rest of line. */
scanner_skip_to_eol(s);
scanner_err(s, "unrecognized token");
return TOK_ERROR;
}
@ -527,9 +526,15 @@ parse(struct xkb_compose_table *table, struct scanner *s,
enum { MAX_ERRORS = 10 };
int num_errors = 0;
/* Skip UTF-8 encoded BOM (U+FEFF) */
/* See: https://www.unicode.org/faq/utf_bom.html#bom5 */
scanner_str(s, "\xef\xbb\xbf", 3);
/* Basic detection of wrong character encoding.
The first character relevant to the grammar must be ASCII:
whitespace, include, modifier list, keysym, comment */
if (!scanner_check_supported_char_encoding(s)) {
scanner_err(s,
"This could be a file encoding issue. "
"Supported file encodings are ASCII and UTF-8.");
goto fail;
}
initial:
production.len = 0;

View File

@ -212,4 +212,33 @@ scanner_hex(struct scanner *s, uint8_t *out)
return i > 0;
}
/*
 * Cheap sanity check of the input encoding, looking only at the first bytes.
 *
 * Returns true when the input starts with a UTF-8 BOM (which is consumed),
 * when it is shorter than two bytes, or when its first two bytes are non-NUL
 * and the first one is ASCII. Otherwise an error is logged through
 * scanner_err() and false is returned.
 */
static inline bool
scanner_check_supported_char_encoding(struct scanner *scanner)
{
    /* A UTF-8 encoded BOM (U+FEFF) is taken as proof of UTF-8.
     * See: https://www.unicode.org/faq/utf_bom.html#bom5 */
    if (scanner_str(scanner, "\xef\xbb\xbf", 3))
        return true;

    /* Input too short to tell anything: accept it. */
    if (scanner->len < 2)
        return true;

    /* A NUL among the first two bytes hints at a multi-byte encoding such
     * as UTF-16 or UTF-32. Both bytes are readable since len >= 2 here. */
    const bool nul_at_first = (scanner->s[0] == '\0');
    const bool nul_at_second = (scanner->s[1] == '\0');
    if (nul_at_first || nul_at_second) {
        /* Presumably moves the reported position onto the second byte when
         * that is where the NUL sits — confirm against scanner_err(). */
        if (!nul_at_first)
            scanner->token_column++;
        scanner_err(scanner, "unexpected NULL character.");
        return false;
    }

    /* From here on, the only remaining requirement is that the very first
     * byte is plain ASCII. */
    if (is_ascii(scanner->s[0]))
        return true;

    scanner_err(scanner, "unexpected non-ASCII character.");
    return false;
}
#endif

View File

@ -173,6 +173,12 @@ strndup(const char *s, size_t n)
#endif
/* ctype.h is locale-dependent and has other oddities. */
static inline bool
is_ascii(char ch)
{
    /* Compare as unsigned char so the answer does not depend on whether
     * plain char is signed: only byte values 0x00..0x7f are ASCII. */
    const unsigned char byte = (unsigned char) ch;
    return byte <= 0x7f;
}
static inline bool
is_space(char ch)
{

View File

@ -175,12 +175,86 @@ test_compose_seq_buffer(struct xkb_context *ctx, const char *buffer, ...)
/* Check that a Compose buffer starting with the UTF-8 encoded BOM
 * ("\xef\xbb\xbf") is accepted: the assert requires
 * test_compose_seq_buffer() to succeed for the buffer "<A> : X" prefixed
 * by that BOM, fed with keysym A and expecting the string "X" / keysym X. */
static void
test_compose_utf8_bom(struct xkb_context *ctx)
{
/* NOTE(review): `buffer` is declared twice below (pointer form, then array
 * form) — presumably the removed and the added line of a diff hunk; confirm
 * which one is current. */
const char *buffer = "\xef\xbb\xbf<A> : X";
const char buffer[] = "\xef\xbb\xbf<A> : X";
assert(test_compose_seq_buffer(ctx, buffer,
XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "X", XKB_KEY_X,
XKB_KEY_NoSymbol));
}
/*
 * Check that Compose input in an unsupported character encoding is rejected.
 *
 * The ISO 8859-1 sample goes through test_compose_seq_buffer(), which is
 * expected to fail. The UTF-16 and UTF-32 samples contain embedded NUL bytes,
 * so they are handed to xkb_compose_table_new_from_buffer() with an explicit
 * size (sizeof of the array, i.e. including the terminating NUL), and no
 * table must be returned.
 */
static void
test_invalid_encodings(struct xkb_context *ctx)
{
    /* ISO 8859-1 (latin1) */
    const char iso_8859_1[] = "<A> : \"\xe1\" acute";
    assert(!test_compose_seq_buffer(ctx, iso_8859_1,
        XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\xc3\xa1", XKB_KEY_acute,
        XKB_KEY_NoSymbol));

    /* UTF-16LE */
    const char utf_16_le[] =
        "<\0A\0>\0 \0:\0 \0X\0\n\0"
        "<\0B\0>\0 \0:\0 \0Y\0";

    /* UTF-16BE */
    const char utf_16_be[] =
        "\0<\0A\0>\0 \0:\0 \0X\0\n"
        "\0<\0B\0>\0 \0:\0 \0Y";

    /* UTF-16BE with BOM */
    const char utf_16_be_bom[] =
        "\xfe\xff"
        "\0<\0A\0>\0 \0:\0 \0X\0\n"
        "\0<\0B\0>\0 \0:\0 \0Y";

    /* UTF-32LE */
    const char utf_32_le[] =
        "<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
        "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0";

    /* UTF-32LE with BOM */
    const char utf_32_le_bom[] =
        "\xff\xfe\0\0"
        "<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
        "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0";

    /* UTF-32BE */
    const char utf_32_be[] =
        "\0\0\0<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
        "<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y";

    /* Samples are tried in this order; each one must fail to compile. */
    const struct {
        const char *bytes;
        size_t size;
    } rejected[] = {
        { utf_16_le,     sizeof(utf_16_le)     },
        { utf_16_be,     sizeof(utf_16_be)     },
        { utf_16_be_bom, sizeof(utf_16_be_bom) },
        { utf_32_le,     sizeof(utf_32_le)     },
        { utf_32_le_bom, sizeof(utf_32_le_bom) },
        { utf_32_be,     sizeof(utf_32_be)     },
    };

    for (size_t i = 0; i < sizeof(rejected) / sizeof(rejected[0]); i++) {
        struct xkb_compose_table *table =
            xkb_compose_table_new_from_buffer(ctx,
                                              rejected[i].bytes,
                                              rejected[i].size, "",
                                              XKB_COMPOSE_FORMAT_TEXT_V1,
                                              XKB_COMPOSE_COMPILE_NO_FLAGS);
        assert(!table);
    }
}
static void
test_seqs(struct xkb_context *ctx)
@ -734,6 +808,7 @@ main(int argc, char *argv[])
#endif
test_compose_utf8_bom(ctx);
test_invalid_encodings(ctx);
test_seqs(ctx);
test_conflicting(ctx);
test_XCOMPOSEFILE(ctx);