Compose: early detection of invalid encoding
Also move the “unrecognized token” error message before skipping the line, in order to fix the reported token position.
master
parent
6c54681fe7
commit
a2da57aba1
|
@ -57,8 +57,8 @@ OR PERFORMANCE OF THIS SOFTWARE.
|
|||
#include <errno.h>
|
||||
|
||||
#include "utils.h"
|
||||
#include "scanner-utils.h"
|
||||
#include "table.h"
|
||||
#include "scanner-utils.h"
|
||||
#include "paths.h"
|
||||
#include "utf8.h"
|
||||
#include "parser.h"
|
||||
|
@ -244,10 +244,9 @@ skip_more_whitespace_and_comments:
|
|||
return TOK_IDENT;
|
||||
}
|
||||
|
||||
scanner_err(s, "unrecognized token");
|
||||
/* Discard rest of line. */
|
||||
scanner_skip_to_eol(s);
|
||||
|
||||
scanner_err(s, "unrecognized token");
|
||||
return TOK_ERROR;
|
||||
}
|
||||
|
||||
|
@ -527,9 +526,15 @@ parse(struct xkb_compose_table *table, struct scanner *s,
|
|||
enum { MAX_ERRORS = 10 };
|
||||
int num_errors = 0;
|
||||
|
||||
/* Skip UTF-8 encoded BOM (U+FEFF) */
|
||||
/* See: https://www.unicode.org/faq/utf_bom.html#bom5 */
|
||||
scanner_str(s, "\xef\xbb\xbf", 3);
|
||||
/* Basic detection of wrong character encoding.
|
||||
The first character relevant to the grammar must be ASCII:
|
||||
whitespace, include, modifier list, keysym, comment */
|
||||
if (!scanner_check_supported_char_encoding(s)) {
|
||||
scanner_err(s,
|
||||
"This could be a file encoding issue. "
|
||||
"Supported file encodings are ASCII and UTF-8.");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
initial:
|
||||
production.len = 0;
|
||||
|
|
|
@ -212,4 +212,33 @@ scanner_hex(struct scanner *s, uint8_t *out)
|
|||
return i > 0;
|
||||
}
|
||||
|
||||
/* Basic detection of wrong character encoding based on the first bytes */
|
||||
static inline bool
|
||||
scanner_check_supported_char_encoding(struct scanner *scanner)
|
||||
{
|
||||
/* Skip UTF-8 encoded BOM (U+FEFF)
|
||||
* See: https://www.unicode.org/faq/utf_bom.html#bom5 */
|
||||
if (scanner_str(scanner, "\xef\xbb\xbf", 3) || scanner->len < 2) {
|
||||
/* Assume UTF-8 encoding or trivial short input */
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Early detection of wrong file encoding, e.g. UTF-16 or UTF-32 */
|
||||
if (scanner->s[0] == '\0' || scanner->s[1] == '\0') {
|
||||
if (scanner->s[0] != '\0')
|
||||
scanner->token_column++;
|
||||
scanner_err(scanner, "unexpected NULL character.");
|
||||
return false;
|
||||
}
|
||||
/* Enforce the first character to be ASCII.
|
||||
See the note before the use of this function, that explains the relevant
|
||||
parts of the grammars of rules, keymap components and Compose. */
|
||||
if (!is_ascii(scanner->s[0])) {
|
||||
scanner_err(scanner, "unexpected non-ASCII character.");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -173,6 +173,12 @@ strndup(const char *s, size_t n)
|
|||
#endif
|
||||
|
||||
/* ctype.h is locale-dependent and has other oddities. */
|
||||
/**
 * Tell whether a byte is in the ASCII range (0x00..0x7f).
 *
 * Works whether plain `char` is signed or unsigned: a byte with the high
 * bit set keeps at least one bit outside the low seven after integer
 * promotion, so the masked value is non-zero.
 *
 * @param ch  The byte to test.
 *
 * @returns true if no bit other than the low seven is set in @p ch.
 */
static inline bool
is_ascii(char ch)
{
    return (ch & ~0x7f) == 0;
}
|
||||
|
||||
static inline bool
|
||||
is_space(char ch)
|
||||
{
|
||||
|
|
|
@ -175,12 +175,86 @@ test_compose_seq_buffer(struct xkb_context *ctx, const char *buffer, ...)
|
|||
static void
|
||||
test_compose_utf8_bom(struct xkb_context *ctx)
|
||||
{
|
||||
const char *buffer = "\xef\xbb\xbf<A> : X";
|
||||
const char buffer[] = "\xef\xbb\xbf<A> : X";
|
||||
assert(test_compose_seq_buffer(ctx, buffer,
|
||||
XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "X", XKB_KEY_X,
|
||||
XKB_KEY_NoSymbol));
|
||||
}
|
||||
|
||||
static void
|
||||
test_invalid_encodings(struct xkb_context *ctx)
|
||||
{
|
||||
struct xkb_compose_table *table;
|
||||
|
||||
/* ISO 8859-1 (latin1) */
|
||||
const char iso_8859_1[] = "<A> : \"\xe1\" acute";
|
||||
assert(!test_compose_seq_buffer(ctx, iso_8859_1,
|
||||
XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\xc3\xa1", XKB_KEY_acute,
|
||||
XKB_KEY_NoSymbol));
|
||||
|
||||
/* UTF-16LE */
|
||||
const char utf_16_le[] =
|
||||
"<\0A\0>\0 \0:\0 \0X\0\n\0"
|
||||
"<\0B\0>\0 \0:\0 \0Y\0";
|
||||
table = xkb_compose_table_new_from_buffer(ctx,
|
||||
utf_16_le, sizeof(utf_16_le), "",
|
||||
XKB_COMPOSE_FORMAT_TEXT_V1,
|
||||
XKB_COMPOSE_COMPILE_NO_FLAGS);
|
||||
assert(!table);
|
||||
|
||||
/* UTF-16BE */
|
||||
const char utf_16_be[] =
|
||||
"\0<\0A\0>\0 \0:\0 \0X\0\n"
|
||||
"\0<\0B\0>\0 \0:\0 \0Y";
|
||||
table = xkb_compose_table_new_from_buffer(ctx,
|
||||
utf_16_be, sizeof(utf_16_be), "",
|
||||
XKB_COMPOSE_FORMAT_TEXT_V1,
|
||||
XKB_COMPOSE_COMPILE_NO_FLAGS);
|
||||
assert(!table);
|
||||
|
||||
/* UTF-16BE with BOM */
|
||||
const char utf_16_be_bom[] =
|
||||
"\xfe\xff"
|
||||
"\0<\0A\0>\0 \0:\0 \0X\0\n"
|
||||
"\0<\0B\0>\0 \0:\0 \0Y";
|
||||
table = xkb_compose_table_new_from_buffer(ctx,
|
||||
utf_16_be_bom, sizeof(utf_16_be_bom), "",
|
||||
XKB_COMPOSE_FORMAT_TEXT_V1,
|
||||
XKB_COMPOSE_COMPILE_NO_FLAGS);
|
||||
assert(!table);
|
||||
|
||||
/* UTF-32LE */
|
||||
const char utf_32_le[] =
|
||||
"<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
|
||||
"<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0";
|
||||
table = xkb_compose_table_new_from_buffer(ctx,
|
||||
utf_32_le, sizeof(utf_32_le), "",
|
||||
XKB_COMPOSE_FORMAT_TEXT_V1,
|
||||
XKB_COMPOSE_COMPILE_NO_FLAGS);
|
||||
assert(!table);
|
||||
|
||||
/* UTF-32LE with BOM */
|
||||
const char utf_32_le_bom[] =
|
||||
"\xff\xfe\0\0"
|
||||
"<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
|
||||
"<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y\0\0\0";
|
||||
table = xkb_compose_table_new_from_buffer(ctx,
|
||||
utf_32_le_bom, sizeof(utf_32_le_bom), "",
|
||||
XKB_COMPOSE_FORMAT_TEXT_V1,
|
||||
XKB_COMPOSE_COMPILE_NO_FLAGS);
|
||||
assert(!table);
|
||||
|
||||
/* UTF-32BE */
|
||||
const char utf_32_be[] =
|
||||
"\0\0\0<\0\0\0A\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0X\0\0\0\n\0\0\0"
|
||||
"<\0\0\0B\0\0\0>\0\0\0 \0\0\0:\0\0\0 \0\0\0Y";
|
||||
table = xkb_compose_table_new_from_buffer(ctx,
|
||||
utf_32_be, sizeof(utf_32_be), "",
|
||||
XKB_COMPOSE_FORMAT_TEXT_V1,
|
||||
XKB_COMPOSE_COMPILE_NO_FLAGS);
|
||||
assert(!table);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_seqs(struct xkb_context *ctx)
|
||||
|
@ -734,6 +808,7 @@ main(int argc, char *argv[])
|
|||
#endif
|
||||
|
||||
test_compose_utf8_bom(ctx);
|
||||
test_invalid_encodings(ctx);
|
||||
test_seqs(ctx);
|
||||
test_conflicting(ctx);
|
||||
test_XCOMPOSEFILE(ctx);
|
||||
|
|
Loading…
Reference in New Issue