diff --git a/src/compose/dump.h b/src/compose/dump.h
new file mode 100644
index 0000000..2dbe135
--- /dev/null
+++ b/src/compose/dump.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright © 2023 Pierre Le Marre
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef COMPOSE_DUMP_H
+#define COMPOSE_DUMP_H
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "src/utils.h"
+
+/* Ad-hoc escaping for UTF-8 string
+ *
+ * Note that it only escapes the strict minimum to get a valid Compose file.
+ * It also escapes hexadecimal digits after a hexadecimal escape. This is not
+ * strictly needed by the current implementation: "\x0abcg" parses as "␊bcg",
+ * but better be cautious than sorry and produce "\x0a\x62\x63g" instead.
+ * In the latter string there is no ambiguity and no need to know the maximum
+ * number of digits supported by the escape sequence.
+ *
+ * `from` must be a NUL-terminated string, expected to be encoded in UTF-8.
+ * Returns a newly allocated NUL-terminated string that the caller must
+ * free(), or NULL if the allocation failed.
+ */
+static inline char *
+escape_utf8_string_literal(const char *from)
+{
+    const size_t length = strlen(from);
+    /* Worst case: every byte becomes a 4-byte "\xNN" escape, plus the final
+     * NUL. The element size is one char, i.e. sizeof(*to), not a pointer. */
+    char *to = calloc(4 * length + 1, sizeof(*to));
+    if (!to)
+        return NULL;
+
+    size_t t = 0;
+    bool previous_is_hex_escape = false;
+    for (size_t f = 0; f < length;) {
+        const unsigned char byte = (unsigned char) from[f];
+        if (byte < 0x80) {
+            /* ASCII */
+            if (byte <= 0x10 || byte == 0x7f ||
+                (is_xdigit(from[f]) && previous_is_hex_escape))
+            {
+                /* Control character (U+0001..U+0010 or DEL), or
+                   hexadecimal digit following a hexadecimal escape */
+                snprintf_safe(&to[t], 5, "\\x%02x", (unsigned int) byte);
+                t += 4;
+                previous_is_hex_escape = true;
+            } else if (byte == '"' || byte == '\\') {
+                /* Quote and backslash */
+                snprintf_safe(&to[t], 3, "\\%c", byte);
+                t += 2;
+                previous_is_hex_escape = false;
+            } else {
+                /* Other characters are copied verbatim */
+                to[t++] = from[f];
+                previous_is_hex_escape = false;
+            }
+            f++;
+            continue;
+        }
+        /* Non-ASCII: the lead byte gives the length of the UTF-8 sequence */
+        size_t nbytes = (byte < 0xe0) ? 2 : (byte < 0xf0) ? 3 : 4;
+        /* Do not read past the end of a truncated (malformed) sequence */
+        if (nbytes > length - f)
+            nbytes = length - f;
+        memcpy(&to[t], &from[f], nbytes);
+        t += nbytes;
+        f += nbytes;
+        previous_is_hex_escape = false;
+    }
+    to[t++] = '\0';
+    /* Shrink to fit. If shrinking fails the original, larger buffer is still
+     * valid, so return it rather than leaking it. */
+    char *shrunk = realloc(to, t);
+    return shrunk ? shrunk : to;
+}
+
+#endif
diff --git a/src/compose/parser.c b/src/compose/parser.c
index ac11446..a34d10b 100644
--- a/src/compose/parser.c
+++ b/src/compose/parser.c
@@ -468,6 +468,24 @@ resolve_modifier(const char *name)
     return XKB_MOD_INVALID;
 }
 
+/* Parse a string literal ("...") and return the corresponding unescaped string,
+ * or NULL if it fails. The returned string must be freed by the caller.
+ * This is aimed only for testing (un)escaping characters. */
+char *
+parse_string_literal(struct xkb_context *ctx, const char *string)
+{
+    struct scanner s;
+    union lvalue val;
+    scanner_init(&s, ctx, string, strlen(string), "(unnamed)", NULL);
+    switch (lex(&s, &val)) {
+    case TOK_STRING:
+        return strdup(val.string.str);
+    default:
+        fprintf(stderr, "ERROR: %s\n", s.s);
+        return NULL;
+    }
+}
+
 static bool
 parse(struct xkb_compose_table *table, struct scanner *s,
       unsigned include_depth);
diff --git a/src/compose/parser.h b/src/compose/parser.h
index 487f1a9..8651ee6 100644
--- a/src/compose/parser.h
+++ b/src/compose/parser.h
@@ -27,6 +27,9 @@
 #define MAX_LHS_LEN 10
 #define MAX_INCLUDE_DEPTH 5
 
+char *
+parse_string_literal(struct xkb_context *ctx, const char *string);
+
 bool
 parse_string(struct xkb_compose_table *table,
              const char *string, size_t len,
diff --git a/test/compose.c b/test/compose.c
index 56bd889..56e2e2c 100644
--- a/test/compose.c
+++ b/test/compose.c
@@ -22,10 +22,14 @@
  */
 
 #include "config.h"
+#include <time.h>
 
 #include "xkbcommon/xkbcommon-compose.h"
 
 #include "test.h"
+#include "src/utf8.h"
+#include "src/compose/parser.h"
+#include "src/compose/dump.h"
 
 static const char *
 compose_status_string(enum xkb_compose_status status)
@@ -769,18 +773,122 @@ test_traverse(struct xkb_context *ctx)
 }
 
 static void
-test_escape_sequences(struct xkb_context *ctx)
+test_decode_escape_sequences(struct xkb_context *ctx)
 {
     /* The following escape sequences should be ignored:
      * • \401 overflows
      * • \0 and \x0 produce NULL
      */
-    const char *table_string = "<o> <e> : \"\\401f\\x0o\\0o\" X\n";
+    const char table_string_1[] = "<o> <e> : \"\\401f\\x0o\\0o\" X\n";
 
-    assert(test_compose_seq_buffer(ctx, table_string,
+    assert(test_compose_seq_buffer(ctx, table_string_1,
         XKB_KEY_o, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSING, "", XKB_KEY_NoSymbol,
         XKB_KEY_e, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "foo", XKB_KEY_X,
         XKB_KEY_NoSymbol));
+
+    /* Test various cases */
+    const char table_string_2[] =
+        "<a> : \"\\x0abcg\\\"x\" A\n" /* hexadecimal sequence has max 2 chars */
+        "<b> : \"éxyz\" B\n"          /* non-ASCII (2 bytes) */
+        "<c> : \"€xyz\" C\n"          /* non-ASCII (3 bytes) */
+        "<d> : \"😀xyz\" D\n"         /* non-ASCII (4 bytes) */
+        "<e> : \"✨\\x0aé\\x0a€x\\\"\" E\n"
+        "<f> : \"\" F\n";
+
+    assert(test_compose_seq_buffer(ctx, table_string_2,
+        XKB_KEY_a, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\x0a""bcg\"x", XKB_KEY_A,
+        XKB_KEY_b, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "éxyz", XKB_KEY_B,
+        XKB_KEY_c, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "€xyz", XKB_KEY_C,
+        XKB_KEY_d, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "😀xyz", XKB_KEY_D,
+        XKB_KEY_e, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "✨\x0aé\x0a€x\"", XKB_KEY_E,
+        XKB_KEY_f, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "", XKB_KEY_F,
+        XKB_KEY_NoSymbol));
+}
+
+/* Return a random non-NUL code point: within U+0001..U+007F if `ascii` is
+ * set, otherwise drawn from the 1- to 4-byte UTF-8 ranges. The result may be
+ * a surrogate (U+D800..U+DFFF); the caller rejects those and retries. */
+static uint32_t
+random_non_null_unicode_char(bool ascii)
+{
+    if (ascii)
+        return 0x01 + (rand() % 0x7f);
+    switch (rand() % 5) {
+    case 0:
+        /* U+0080..U+07FF: 2 bytes in UTF-8 */
+        return 0x80 + (rand() % 0x780);
+    case 1:
+        /* U+0800..U+FFFF: 3 bytes in UTF-8 */
+        return 0x800 + (rand() % 0xf800);
+    case 2:
+        /* U+10000..U+10FFFF: 4 bytes in UTF-8 */
+        return 0x10000 + (rand() % 0x100000);
+    default:
+        /* NOTE: Higher probability for ASCII */
+        /* U+0001..U+007F: 1 byte in UTF-8 */
+        return 0x01 + (rand() % 0x7f);
+    }
+}
+
+static void
+test_encode_escape_sequences(struct xkb_context *ctx)
+{
+    char *escaped;
+
+    /* Test empty string */
+    escaped = escape_utf8_string_literal("");
+    assert_streq_not_null("Empty string", "", escaped);
+    free(escaped);
+
+    /* Test specific ASCII characters: ", \ */
+    escaped = escape_utf8_string_literal("\"\\");
+    assert_streq_not_null("Quote and backslash", "\\\"\\\\", escaped);
+    free(escaped);
+
+    /* Test round-trip of random strings */
+#   define SAMPLE_SIZE 1000
+#   define MAX_CODE_POINTS_COUNT 15
+    char buf[1 + MAX_CODE_POINTS_COUNT * 4];
+    for (int ascii = 1; ascii >= 0; ascii--) {
+        for (size_t s = 0; s < SAMPLE_SIZE; s++) {
+            /* Create the string */
+            size_t length = 1 + (rand() % MAX_CODE_POINTS_COUNT);
+            size_t c = 0;
+            for (size_t idx = 0; idx < length; idx++) {
+                int nbytes;
+                /* Get a random Unicode code point and encode it in UTF-8 */
+                do {
+                    const uint32_t cp = random_non_null_unicode_char(ascii);
+                    nbytes = utf32_to_utf8(cp, &buf[c]);
+                } while (!nbytes); /* Handle invalid code point in UTF-8 */
+                c += nbytes - 1;
+                assert(c <= sizeof(buf) - 1);
+            }
+            assert_printf(buf[c] == '\0', "NULL-terminated string\n");
+            assert_printf(strlen(buf) == c, "Contains no NULL char\n");
+            assert_printf(is_valid_utf8(buf, c),
+                          "Invalid input UTF-8 string: \"%s\"\n", buf);
+            /* Escape the string */
+            escaped = escape_utf8_string_literal(buf);
+            if (!escaped)
+                break;
+            assert_printf(is_valid_utf8(escaped, strlen(escaped)),
+                          "Invalid escaped UTF-8 string: %s\n", escaped);
+            char *string_literal = asprintf_safe("\"%s\"", escaped);
+            if (!string_literal) {
+                free(escaped);
+                break;
+            }
+            /* Unescape the string */
+            char *unescaped = parse_string_literal(ctx, string_literal);
+            assert_streq_not_null("Escaped string", buf, unescaped);
+            free(unescaped);
+            free(string_literal);
+            free(escaped);
+        }
+    }
+#   undef SAMPLE_SIZE
+#   undef MAX_CODE_POINTS_COUNT
 }
 
 int
@@ -791,10 +899,20 @@ main(int argc, char *argv[])
     ctx = test_get_context(CONTEXT_NO_FLAG);
     assert(ctx);
 
+    /* Initialize pseudo-random generator with program arg or current time */
+    int seed;
+    if (argc == 2) {
+        seed = atoi(argv[1]);
+    } else {
+        seed = (int) time(NULL);
+    }
+    fprintf(stderr, "Seed for the pseudo-random generator: %d\n", seed);
+    srand((unsigned int) seed);
+
     /*
      * Ensure no environment variables but “top_srcdir” is set. This ensures
      * that user Compose file paths are unset before the tests and set
-     * explicitely when necessary.
+     * explicitly when necessary.
      */
 #ifdef __linux__
     const char *srcdir = getenv("top_srcdir");
@@ -818,7 +936,8 @@ main(int argc, char *argv[])
     test_include(ctx);
     test_override(ctx);
     test_traverse(ctx);
-    test_escape_sequences(ctx);
+    test_decode_escape_sequences(ctx);
+    test_encode_escape_sequences(ctx);
 
     xkb_context_unref(ctx);
     return 0;
diff --git a/test/test.h b/test/test.h
index afeec78..3968606 100644
--- a/test/test.h
+++ b/test/test.h
@@ -34,6 +34,20 @@
 /* Automake test exit code to signify SKIP (à la PASS, FAIL, etc). */
 #define SKIP_TEST 77
 
+/* Like assert(), but print a printf-style message first on failure.
+ * NOTE: `cond` is evaluated twice on failure; keep it free of side effects. */
+#define assert_printf(cond, ...) \
+    do { \
+        if (!(cond)) { \
+            fprintf(stderr, "Assertion failure: " __VA_ARGS__); \
+            assert(cond); \
+        } \
+    } while (0)
+
+#define assert_streq_not_null(test_name, expected, got) \
+    assert_printf(streq_not_null(expected, got), \
+                  test_name ". Expected \"%s\", got: \"%s\"\n", expected, got)
+
 /* The offset between KEY_* numbering, and keycodes in the XKB evdev
  * dataset. */
 #define EVDEV_OFFSET 8
diff --git a/tools/compile-compose.c b/tools/compile-compose.c
index e9b7716..a0b83ed 100644
--- a/tools/compile-compose.c
+++ b/tools/compile-compose.c
@@ -25,12 +25,12 @@
 
 #include <getopt.h>
 #include <locale.h>
-#include <stdio.h>
-#include <stdlib.h>
+#include <stdbool.h>
 
 #include "xkbcommon/xkbcommon.h"
 #include "xkbcommon/xkbcommon-keysyms.h"
 #include "xkbcommon/xkbcommon-compose.h"
+#include "src/compose/dump.h"
 
 static void
 usage(FILE *fp, char *progname)
@@ -56,7 +56,7 @@ usage(FILE *fp, char *progname)
     );
 }
 
-static void
+static bool
 print_compose_table_entry(struct xkb_compose_table_entry *entry)
 {
     size_t nsyms;
@@ -69,10 +69,17 @@ print_compose_table_entry(struct xkb_compose_table_entry *entry)
             printf(" ");
         }
     }
-    printf(":");
+    printf(" : ");
     const char *utf8 = xkb_compose_table_entry_utf8(entry);
     if (*utf8 != '\0') {
-        printf(" \"%s\"", utf8);
+        char *escaped = escape_utf8_string_literal(utf8);
+        if (!escaped) {
+            fprintf(stderr, "ERROR: Cannot escape the string: allocation error\n");
+            return false;
+        } else {
+            printf(" \"%s\"", escaped);
+            free(escaped);
+        }
     }
     const xkb_keysym_t keysym = xkb_compose_table_entry_keysym(entry);
     if (keysym != XKB_KEY_NoSymbol) {
@@ -80,6 +87,7 @@ print_compose_table_entry(struct xkb_compose_table_entry *entry)
         printf(" %s", buf);
     }
     printf("\n");
+    return true;
 }
 
 int
@@ -182,10 +190,15 @@ main(int argc, char *argv[])
     struct xkb_compose_table_iterator *iter = xkb_compose_table_iterator_new(compose_table);
     struct xkb_compose_table_entry *entry;
     while ((entry = xkb_compose_table_iterator_next(iter))) {
-        print_compose_table_entry(entry);
+        if (!print_compose_table_entry(entry)) {
+            ret = EXIT_FAILURE;
+            goto entry_error;
+        }
     }
-    xkb_compose_table_iterator_free(iter);
 
+    ret = EXIT_SUCCESS;
+entry_error:
+    xkb_compose_table_iterator_free(iter);
 out:
     xkb_compose_table_unref(compose_table);
 file_error: