xkbcli-compile-compose: Fix string result escaping

Currently the result string is not escaped and may produce invalid
results.

Fixed by introducing an ad-hoc escape function and related tests.
master
Pierre Le Marre 2023-11-07 12:58:41 +01:00 committed by Wismill
parent d826d70b9b
commit 0a577a0998
6 changed files with 269 additions and 12 deletions

95
src/compose/dump.h Normal file
View File

@ -0,0 +1,95 @@
/*
* Copyright © 2023 Pierre Le Marre <dev@wismill.eu>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef COMPOSE_DUMP_H
#define COMPOSE_DUMP_H
#include "config.h"
#include <stdlib.h>
#include "src/utils.h"
/* Ad-hoc escaping of a UTF-8 string so that it can be written between double
 * quotes as a Compose string literal.
 *
 * Note that it only escapes the strict minimum to get a valid Compose file:
 *   - bytes 0x00..0x10 and 0x7f are written as "\xNN";
 *   - '"' and '\' are prefixed with a backslash;
 *   - multi-byte UTF-8 sequences are copied verbatim.
 * It also escapes hexadecimal digits after a hexadecimal escape. This is not
 * strictly needed by the current implementation: "\x0abcg" parses as "␊bcg",
 * but better be cautious than sorry and produce "\x0a\x62\x63g" instead.
 * In the latter string there is no ambiguity and no need to know the maximum
 * number of digits supported by the escape sequence.
 *
 * `from` must be a NUL-terminated string and is expected to be valid UTF-8.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if the allocation fails.
 */
static inline char*
escape_utf8_string_literal(const char *from)
{
    const size_t length = strlen(from);

    /* Longest escape is converting ASCII character to "\xNN": at most 4
     * output bytes per input byte, plus the final NUL. Reject lengths for
     * which that size computation would wrap around. */
    if (length > (SIZE_MAX - 1) / 4)
        return NULL;
    char *to = calloc(4 * length + 1, sizeof(*to));
    if (!to)
        return NULL;

    size_t t = 0;
    bool previous_is_hex_escape = false;
    for (size_t f = 0; f < length;) {
        const unsigned char c = (unsigned char) from[f];

        if (c < 0x80) {
            /* ASCII */
            /* NOTE(review): the threshold is 0x10, so 0x11..0x1f are emitted
             * unescaped; confirm whether 0x1f was intended. */
            if (c <= 0x10 || c == 0x7f ||
                (is_xdigit(from[f]) && previous_is_hex_escape))
            {
                /* Control character or
                   hexadecimal digit following a hexadecimal escape */
                snprintf_safe(&to[t], 5, "\\x%02x", c);
                t += 4;
                previous_is_hex_escape = true;
            } else if (c == '"' || c == '\\') {
                /* Quote and backslash */
                snprintf_safe(&to[t], 3, "\\%c", c);
                t += 2;
                previous_is_hex_escape = false;
            } else {
                /* Other characters */
                to[t++] = from[f];
                previous_is_hex_escape = false;
            }
            f++;
            continue;
        }

        /* Deduce the byte count of the code point from its lead byte */
        size_t nbytes;
        if (c < 0xe0)
            nbytes = 2;
        else if (c < 0xf0)
            nbytes = 3;
        else
            nbytes = 4;
        /* A truncated sequence at the end of a malformed input must not make
         * us read beyond the terminating NUL of `from`. */
        if (nbytes > length - f)
            nbytes = length - f;
        memcpy(&to[t], &from[f], nbytes);
        t += nbytes;
        f += nbytes;
        previous_is_hex_escape = false;
    }
    to[t++] = '\0';

    /* Shrink to the size actually used. If the shrink fails, the original
     * (larger) buffer is still valid, so return it rather than leak it. */
    char *shrunk = realloc(to, t);
    return shrunk ? shrunk : to;
}
#endif

View File

@ -468,6 +468,24 @@ resolve_modifier(const char *name)
return XKB_MOD_INVALID;
}
/* Parse a string literal ("...") and return the corresponding unescaped string,
* or NULL if it fails.
* This is aimed only for testing (un)escaping characters. */
char *
parse_string_literal(struct xkb_context *ctx, const char *string)
{
struct scanner s;
union lvalue val;
scanner_init(&s, ctx, string, strlen(string), "(unamed)", NULL);
switch (lex(&s, &val)) {
case TOK_STRING:
return strdup(val.string.str);
default:
fprintf(stderr, "ERROR: %s\n", s.s);
return NULL;
}
}
static bool
parse(struct xkb_compose_table *table, struct scanner *s,
unsigned include_depth);

View File

@ -27,6 +27,9 @@
#define MAX_LHS_LEN 10
#define MAX_INCLUDE_DEPTH 5
/* Parse a string literal ("...") and return a newly allocated copy of the
 * corresponding unescaped string, or NULL if it fails.
 * Intended only for testing (un)escaping of characters. */
char *
parse_string_literal(struct xkb_context *ctx, const char *string);
bool
parse_string(struct xkb_compose_table *table,
const char *string, size_t len,

View File

@ -22,10 +22,14 @@
*/
#include "config.h"
#include <time.h>
#include "xkbcommon/xkbcommon-compose.h"
#include "test.h"
#include "src/utf8.h"
#include "src/compose/parser.h"
#include "src/compose/dump.h"
static const char *
compose_status_string(enum xkb_compose_status status)
@ -769,18 +773,121 @@ test_traverse(struct xkb_context *ctx)
}
static void
test_escape_sequences(struct xkb_context *ctx)
test_decode_escape_sequences(struct xkb_context *ctx)
{
/* The following escape sequences should be ignored:
* \401 overflows
* \0 and \x0 produce NULL
*/
const char *table_string = "<o> <e> : \"\\401f\\x0o\\0o\" X\n";
const char table_string_1[] = "<o> <e> : \"\\401f\\x0o\\0o\" X\n";
assert(test_compose_seq_buffer(ctx, table_string,
assert(test_compose_seq_buffer(ctx, table_string_1,
XKB_KEY_o, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSING, "", XKB_KEY_NoSymbol,
XKB_KEY_e, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "foo", XKB_KEY_X,
XKB_KEY_NoSymbol));
/* Test various cases */
const char table_string_2[] =
"<a> : \"\\x0abcg\\\"x\" A\n" /* hexadecimal sequence has max 2 chars */
"<b> : \"éxyz\" B\n" /* non-ASCII (2 bytes) */
"<c> : \"€xyz\" C\n" /* non-ASCII (3 bytes) */
"<d> : \"✨xyz\" D\n" /* non-ASCII (4 bytes) */
"<e> : \"\\x0aé\\x0a€x\\\"\" E\n"
"<f> : \"\" F\n";
assert(test_compose_seq_buffer(ctx, table_string_2,
XKB_KEY_a, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\x0a""bcg\"x", XKB_KEY_A,
XKB_KEY_b, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "éxyz", XKB_KEY_B,
XKB_KEY_c, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "€xyz", XKB_KEY_C,
XKB_KEY_d, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "✨xyz", XKB_KEY_D,
XKB_KEY_e, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\x0aé\x0a€x\"", XKB_KEY_E,
XKB_KEY_f, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "", XKB_KEY_F,
XKB_KEY_NoSymbol));
}
/* Return a pseudo-random, non-null Unicode code point drawn with rand().
 *
 * If `ascii` is true the result is always in U+0001..U+007F. Otherwise one
 * of four UTF-8 length classes is picked, with a higher probability for
 * ASCII, and a code point is drawn from that class.
 *
 * The 3-byte class may still yield a surrogate (U+D800..U+DFFF); the caller
 * is expected to retry when such a code point cannot be encoded.
 */
static uint32_t
random_non_null_unicode_char(bool ascii)
{
    /* Each modulus is the size of the documented range, so the upper bound
     * of the range is never exceeded. */
    if (ascii)
        return 0x01 + (rand() % 0x7f);
    switch (rand() % 5) {
    case 0:
        /* U+0080..U+07FF: 2 bytes in UTF-8 */
        return 0x80 + (rand() % (0x800 - 0x80));
    case 1:
        /* U+0800..U+FFFF: 3 bytes in UTF-8 */
        return 0x800 + (rand() % (0x10000 - 0x800));
    case 2:
        /* U+10000..U+10FFFF: 4 bytes in UTF-8 */
        return 0x10000 + (rand() % (0x110000 - 0x10000));
    default:
        /* NOTE: Higher probability for ASCII */
        /* U+0001..U+007F: 1 byte in UTF-8 */
        return 0x01 + (rand() % 0x7f);
    }
}
/* Check escape_utf8_string_literal() on fixed inputs, then check that random
 * strings survive a round-trip: escape, wrap in double quotes, and parse back
 * with parse_string_literal(). The random strings are drawn first from ASCII
 * only, then from all UTF-8 length classes. */
static void
test_encode_escape_sequences(struct xkb_context *ctx)
{
    char *escaped;

    /* Test empty string */
    escaped = escape_utf8_string_literal("");
    assert_streq_not_null("Empty string", "", escaped);
    free(escaped);

    /* Test specific ASCII characters: ", \ */
    escaped = escape_utf8_string_literal("\"\\");
    assert_streq_not_null("Quote and backslash", "\\\"\\\\", escaped);
    free(escaped);

    /* Test round-trip of random strings */
#   define SAMPLE_SIZE 1000
#   define MAX_CODE_POINTS_COUNT 15
    /* Room for the longest string: 4 bytes per code point plus the NUL */
    char buf[1 + MAX_CODE_POINTS_COUNT * 4];
    for (int ascii = 1; ascii >= 0; ascii--) {
        for (size_t s = 0; s < SAMPLE_SIZE; s++) {
            /* Create the string */
            size_t length = 1 + (rand() % MAX_CODE_POINTS_COUNT);
            size_t c = 0;
            for (size_t idx = 0; idx < length; idx++) {
                int nbytes;
                /* Get a random Unicode code point and encode it in UTF-8 */
                do {
                    const uint32_t cp = random_non_null_unicode_char(ascii);
                    nbytes = utf32_to_utf8(cp, &buf[c]);
                } while (!nbytes); /* Handle invalid code point in UTF-8 */
                /* Step back by one so the next code point overwrites the
                 * last byte just written, which is checked to be NUL below */
                c += nbytes - 1;
                assert(c <= sizeof(buf) - 1);
            }
            assert_printf(buf[c] == '\0', "NULL-terminated string\n");
            assert_printf(strlen(buf) == c, "Contains no NULL char\n");
            assert_printf(is_valid_utf8(buf, c),
                          "Invalid input UTF-8 string: \"%s\"\n", buf);

            /* Escape the string */
            escaped = escape_utf8_string_literal(buf);
            if (!escaped)
                break;
            assert_printf(is_valid_utf8(escaped, strlen(escaped)),
                          "Invalid escaped UTF-8 string: %s\n", escaped);
            char *string_literal = asprintf_safe("\"%s\"", escaped);
            if (!string_literal) {
                free(escaped);
                break;
            }

            /* Unescape the string */
            char *unescaped = parse_string_literal(ctx, string_literal);
            assert_streq_not_null("Escaped string", buf, unescaped);
            free(unescaped);
            free(string_literal);
            free(escaped);
        }
    }
#   undef SAMPLE_SIZE
#   undef MAX_CODE_POINTS_COUNT
}
int
@ -791,10 +898,20 @@ main(int argc, char *argv[])
ctx = test_get_context(CONTEXT_NO_FLAG);
assert(ctx);
/* Initialize pseudo-random generator with program arg or current time */
int seed;
if (argc == 2) {
seed = atoi(argv[1]);
} else {
seed = time(NULL);
}
fprintf(stderr, "Seed for the pseudo-random generator: %d\n", seed);
srand(seed);
/*
* Ensure no environment variables but top_srcdir is set. This ensures
* that user Compose file paths are unset before the tests and set
* explicitely when necessary.
* explicitly when necessary.
*/
#ifdef __linux__
const char *srcdir = getenv("top_srcdir");
@ -818,7 +935,8 @@ main(int argc, char *argv[])
test_include(ctx);
test_override(ctx);
test_traverse(ctx);
test_escape_sequences(ctx);
test_decode_escape_sequences(ctx);
test_encode_escape_sequences(ctx);
xkb_context_unref(ctx);
return 0;

View File

@ -34,6 +34,16 @@
/* Automake test exit code to signify SKIP (à la PASS, FAIL, etc). */
#define SKIP_TEST 77
/* Like assert(cond), but when the condition is false, first print
 * "Assertion failure: " followed by the printf-style message on stderr.
 * The first variadic argument must be a string literal.
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * including as the body of an unbraced if/else. */
#define assert_printf(cond, ...) \
    do { \
        if (!(cond)) { \
            fprintf(stderr, "Assertion failure: " __VA_ARGS__); \
            assert(cond); \
        } \
    } while (0)

/* Assert that streq_not_null(expected, got) holds; on failure, report both
 * strings, prefixed by `test_name` (a string literal).
 * `got` is guarded so that a NULL pointer is never passed to "%s". */
#define assert_streq_not_null(test_name, expected, got) \
    assert_printf(streq_not_null(expected, got), \
                  test_name ". Expected \"%s\", got: \"%s\"\n", \
                  expected, (got) ? (got) : "(null)")
/* The offset between KEY_* numbering, and keycodes in the XKB evdev
* dataset. */
#define EVDEV_OFFSET 8

View File

@ -25,12 +25,12 @@
#include <getopt.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include "xkbcommon/xkbcommon.h"
#include "xkbcommon/xkbcommon-keysyms.h"
#include "xkbcommon/xkbcommon-compose.h"
#include "src/compose/dump.h"
static void
usage(FILE *fp, char *progname)
@ -56,7 +56,7 @@ usage(FILE *fp, char *progname)
);
}
static void
static bool
print_compose_table_entry(struct xkb_compose_table_entry *entry)
{
size_t nsyms;
@ -69,10 +69,17 @@ print_compose_table_entry(struct xkb_compose_table_entry *entry)
printf(" ");
}
}
printf(":");
printf(" : ");
const char *utf8 = xkb_compose_table_entry_utf8(entry);
if (*utf8 != '\0') {
printf(" \"%s\"", utf8);
char *escaped = escape_utf8_string_literal(utf8);
if (!escaped) {
fprintf(stderr, "ERROR: Cannot escape the string: allocation error\n");
return false;
} else {
printf(" \"%s\"", escaped);
free(escaped);
}
}
const xkb_keysym_t keysym = xkb_compose_table_entry_keysym(entry);
if (keysym != XKB_KEY_NoSymbol) {
@ -80,6 +87,7 @@ print_compose_table_entry(struct xkb_compose_table_entry *entry)
printf(" %s", buf);
}
printf("\n");
return true;
}
int
@ -182,10 +190,15 @@ main(int argc, char *argv[])
struct xkb_compose_table_iterator *iter = xkb_compose_table_iterator_new(compose_table);
struct xkb_compose_table_entry *entry;
while ((entry = xkb_compose_table_iterator_next(iter))) {
print_compose_table_entry(entry);
if (!print_compose_table_entry(entry)) {
ret = EXIT_FAILURE;
goto entry_error;
}
xkb_compose_table_iterator_free(iter);
}
ret = EXIT_SUCCESS;
entry_error:
xkb_compose_table_iterator_free(iter);
out:
xkb_compose_table_unref(compose_table);
file_error: