rules: early detection of invalid encoding

2023-10-30 15:51:34 +01:00 · 2023-10-30 15:51:34 +01:00 · 3aaa4e2a53
parent 82e9293e12
commit 3aaa4e2a53
5 changed files with 52 additions and 9 deletions
--- a/src/xkbcomp/rules.c
+++ b/src/xkbcomp/rules.c
@@ -1084,29 +1084,36 @@ read_rules_file(struct xkb_context *ctx,
                FILE *file,
                const char *path)
 {
-    bool ret = false;
+    bool ret;
    char *string;
    size_t size;
    struct scanner scanner;
-    ret = map_file(file, &string, &size);
+    if (!map_file(file, &string, &size)) {
    if (!ret) {
        log_err(ctx, XKB_LOG_MESSAGE_NO_ID,
                "Couldn't read rules file \"%s\": %s\n",
                path, strerror(errno));
-        goto out;
+        return false;
    }
    scanner_init(&scanner, matcher->ctx, string, size, path, NULL);
-    /* Skip UTF-8 encoded BOM (U+FEFF) */
+    /* Basic detection of wrong character encoding.
-    /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */
+       The first character relevant to the grammar must be ASCII:
-    scanner_str(&scanner, "\xef\xbb\xbf", 3);
+       whitespace, !, / (for comment) */
    if (!scanner_check_supported_char_encoding(&scanner)) {
        scanner_err(&scanner,
            "This could be a file encoding issue. "
            "Supported encodings must be backward compatible with ASCII.");
        scanner_err(&scanner,
            "E.g. ISO/CEI 8859 and UTF-8 are supported "
            "but UTF-16, UTF-32 and CP1026 are not.");
        unmap_file(string, size);
        return false;
    }
    ret = matcher_match(matcher, &scanner, include_depth, string, size, path);
    unmap_file(string, size);
 out:
    return ret;
 }
--- a/test/data/rules/utf-16be_with_bom
+++ b/test/data/rules/utf-16be_with_bom
--- a/test/data/rules/utf-16le_with_bom
+++ b/test/data/rules/utf-16le_with_bom
--- a/test/data/rules/utf-32be
+++ b/test/data/rules/utf-32be
--- a/test/rules-file.c
+++ b/test/rules-file.c
@@ -106,6 +106,42 @@ main(int argc, char *argv[])
    };
    assert(test_rules(ctx, &test_utf_8_with_bom));
    struct test_data test_utf_16le_with_bom = {
        .rules = "utf-16le_with_bom",
        .model = "my_model", .layout = "my_layout", .variant = "my_variant",
        .options = "my_option",
        .keycodes = "my_keycodes", .types = "my_types",
        .compat = "my_compat|some:compat",
        .symbols = "my_symbols+extra_variant",
    };
    assert(!test_rules(ctx, &test_utf_16le_with_bom));
    struct test_data test_utf_16be_with_bom = {
        .rules = "utf-16be_with_bom",
        .model = "my_model", .layout = "my_layout", .variant = "my_variant",
        .options = "my_option",
        .keycodes = "my_keycodes", .types = "my_types",
        .compat = "my_compat|some:compat",
        .symbols = "my_symbols+extra_variant",
    };
    assert(!test_rules(ctx, &test_utf_16be_with_bom));
    struct test_data test_utf_32be = {
        .rules = "utf-32be",
        .model = "my_model", .layout = "my_layout", .variant = "my_variant",
        .options = "my_option",
        .keycodes = "my_keycodes", .types = "my_types",
        .compat = "my_compat|some:compat",
        .symbols = "my_symbols+extra_variant",
    };
    assert(!test_rules(ctx, &test_utf_32be));
    struct test_data test1 = {
        .rules = "simple",