feat: reliabletxt and wsv

2024-01-27 20:15:14 -07:00 · 2024-01-27 20:15:14 -07:00 · 151470e5d3
commit 151470e5d3
11 changed files with 791 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 zig-out/
 zig-cache/
--- a/build.zig
+++ b/build.zig
@ -0,0 +1,91 @@
 const std = @import("std");
 // Although this function looks imperative, note that its job is to
 // declaratively construct a build graph that will be executed by an external
 // runner.
 /// Build-script entry point.
 ///
 /// Declares a static library rooted at `src/root.zig` and an executable rooted
 /// at `src/main.zig` (both named `stenway-formats`), installs both, and adds
 /// two named steps: `run` (runs the installed executable, forwarding any
 /// arguments given after `--`) and `test` (runs the unit tests of both roots).
 pub fn build(b: *std.Build) void {
    // Standard target options allows the person running `zig build` to choose
    // what target to build for. Here we do not override the defaults, which
    // means any target is allowed, and the default is native. Other options
    // for restricting supported target set are available.
    const target = b.standardTargetOptions(.{});
    // Standard optimization options allow the person running `zig build` to select
    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
    // set a preferred release mode, allowing the user to decide how to optimize.
    const optimize = b.standardOptimizeOption(.{});
    const lib = b.addStaticLibrary(.{
        .name = "stenway-formats",
        // In this case the main source file is merely a path, however, in more
        // complicated build scripts, this could be a generated file.
        .root_source_file = .{ .path = "src/root.zig" },
        .target = target,
        .optimize = optimize,
    });
    // This declares intent for the library to be installed into the standard
    // location when the user invokes the "install" step (the default step when
    // running `zig build`).
    b.installArtifact(lib);
    const exe = b.addExecutable(.{
        .name = "stenway-formats",
        .root_source_file = .{ .path = "src/main.zig" },
        .target = target,
        .optimize = optimize,
    });
    // This declares intent for the executable to be installed into the
    // standard location when the user invokes the "install" step (the default
    // step when running `zig build`).
    b.installArtifact(exe);
    // This *creates* a Run step in the build graph, to be executed when another
    // step is evaluated that depends on it. The next line below will establish
    // such a dependency.
    const run_cmd = b.addRunArtifact(exe);
    // By making the run step depend on the install step, it will be run from the
    // installation directory rather than directly from within the cache directory.
    // This is not necessary, however, if the application depends on other installed
    // files, this ensures they will be present and in the expected location.
    run_cmd.step.dependOn(b.getInstallStep());
    // This allows the user to pass arguments to the application in the build
    // command itself, like this: `zig build run -- arg1 arg2 etc`
    if (b.args) |args| {
        run_cmd.addArgs(args);
    }
    // This creates a build step. It will be visible in the `zig build --help` menu,
    // and can be selected like this: `zig build run`
    // This will evaluate the `run` step rather than the default, which is "install".
    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);
    // Creates a step for unit testing. This only builds the test executable
    // but does not run it.
    const lib_unit_tests = b.addTest(.{
        .root_source_file = .{ .path = "src/root.zig" },
        .target = target,
        .optimize = optimize,
    });
    // Running the library test binary is a separate artifact-run step.
    const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);
    // Same pair of steps for the tests declared in the executable's root file.
    const exe_unit_tests = b.addTest(.{
        .root_source_file = .{ .path = "src/main.zig" },
        .target = target,
        .optimize = optimize,
    });
    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
    // Similar to creating the run step earlier, this exposes a `test` step to
    // the `zig build --help` menu, providing a way for the user to request
    // running the unit tests.
    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&run_lib_unit_tests.step);
    test_step.dependOn(&run_exe_unit_tests.step);
 }
--- a/build.zig.zon
+++ b/build.zig.zon
@ -0,0 +1,62 @@
 .{
    .name = "stenway-formats",
    // This is a [Semantic Version](https://semver.org/).
    // In a future version of Zig it will be used for package deduplication.
    .version = "0.0.0",
    // This field is optional.
    // This is currently advisory only; Zig does not yet do anything
    // with this value.
    //.minimum_zig_version = "0.11.0",
    // This field is optional.
    // Each dependency must either provide a `url` and `hash`, or a `path`.
    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
    // Once all dependencies are fetched, `zig build` no longer requires
    // internet connectivity.
    .dependencies = .{
        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
        //.example = .{
        //    // When updating this field to a new URL, be sure to delete the corresponding
        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
        //    // the new URL.
        //    .url = "https://example.com/foo.tar.gz",
        //
        //    // This is computed from the file contents of the directory of files that is
        //    // obtained after fetching `url` and applying the inclusion rules given by
        //    // `paths`.
        //    //
        //    // This field is the source of truth; packages do not come from a `url`; they
        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
        //    // obtain a package matching this `hash`.
        //    //
        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
        //    .hash = "...",
        //
        //    // When this is provided, the package is found in a directory relative to the
        //    // build root. In this case the package's hash is irrelevant and therefore not
        //    // computed. This field and `url` are mutually exclusive.
        //    .path = "foo",
        //},
    },
    // Specifies the set of files and directories that are included in this package.
    // Only files and directories listed here are included in the `hash` that
    // is computed for this package.
    // Paths are relative to the build root. Use the empty string (`""`) to refer to
    // the build root itself.
    // A directory listed here means that all files within, recursively, are included.
    .paths = .{
        // This makes *all* files, recursively, included in this package. It is generally
        // better to explicitly list the files and directories instead, to ensure that
        // fetching from tarballs, file system paths, and version control all result
        // in the same contents hash.
        "",
        // For example...
        //"build.zig",
        //"build.zig.zon",
        //"src",
        //"LICENSE",
        //"README.md",
    },
 }
--- a/src/main.zig
+++ b/src/main.zig
@ -0,0 +1,24 @@
 const std = @import("std");
 /// Program entry point: writes a banner to stderr and a hint to stdout.
 ///
 /// Returns any error raised while writing to or flushing stdout.
 pub fn main() !void {
    // Diagnostics go to stderr (`std.debug.print` is a shortcut based on
    // `std.io.getStdErr()`), so they never mix with the program's real output.
    std.debug.print("All your {s} are belong to us.\n", .{"codebase"});
    // stdout carries the application's actual output; it is wrapped in a
    // buffered writer so the message is handed to the OS in one go.
    var buffered_stdout = std.io.bufferedWriter(std.io.getStdOut().writer());
    const out = buffered_stdout.writer();
    try out.print("Run `zig build test` to run the tests.\n", .{});
    // Nothing reaches stdout until the buffer is flushed.
    try buffered_stdout.flush();
 }
 // Smoke test: append one element to a list and pop it back out.
 test "simple test" {
    // `std.testing.allocator` is used so that a missing `deinit` is reported.
    var list = std.ArrayList(i32).init(std.testing.allocator);
    defer list.deinit(); // try commenting this out and see if zig detects the memory leak!
    try list.append(42);
    try std.testing.expectEqual(@as(i32, 42), list.pop());
 }
--- a/src/reliabletxt.zig
+++ b/src/reliabletxt.zig
@ -0,0 +1,70 @@
 //! https://dev.stenway.com/ReliableTXT/Specification.html
 /// Encodings that `detectEncoding` can report, one per recognized preamble.
 pub const Encoding = enum {
    /// Reported for contents starting with the bytes EF BB BF.
    utf8,
    /// Big Endian
    /// Reported for contents starting with the bytes FE FF.
    utf16,
    /// Little Endian
    /// Reported for contents starting with the bytes FF FE.
    utf16_reverse,
    /// Big Endian
    /// Reported for contents starting with the bytes 00 00 FE FF.
    utf32,
 };
 /// Identifies the encoding of `contents` from its leading preamble bytes.
 ///
 /// The known preambles are tried in a fixed order and the first one that
 /// `contents` starts with wins. Returns `error.InvalidEncoding` when none match
 /// (including when `contents` is too short to hold any preamble).
 pub fn detectEncoding(contents: []const u8) !Encoding {
    const Preamble = struct { bytes: []const u8, encoding: Encoding };
    // Order is significant only in that it mirrors the order of the checks this
    // table replaces; no preamble here is a prefix of another.
    const known_preambles = [_]Preamble{
        .{ .bytes = "\xEF\xBB\xBF", .encoding = Encoding.utf8 },
        .{ .bytes = "\xFE\xFF", .encoding = Encoding.utf16 },
        .{ .bytes = "\xFF\xFE", .encoding = Encoding.utf16_reverse },
        .{ .bytes = "\x00\x00\xFE\xFF", .encoding = Encoding.utf32 },
    };
    for (known_preambles) |preamble| {
        if (std.mem.startsWith(u8, contents, preamble.bytes)) return preamble.encoding;
    }
    return error.InvalidEncoding;
 }
 // Checks that each recognized preamble maps to the expected `Encoding` tag.
 test detectEncoding {
    // UTF-8: the three preamble bytes followed by plain ASCII text.
    try testing.expectEqual(Encoding.utf8, detectEncoding("\xEF\xBB\xBFaaa!"));
    // 0xFEFF stored little-endian puts the bytes FF FE first in memory, which
    // is the preamble of the "reverse" UTF-16 variant.
    try testing.expectEqual(Encoding.utf16_reverse, detectEncoding(std.mem.sliceAsBytes(&[_]u16{
        std.mem.nativeToLittle(u16, 0xFE_FF),
        std.mem.nativeToLittle(u16, 'a'),
        std.mem.nativeToLittle(u16, 'a'),
        std.mem.nativeToLittle(u16, 'a'),
        std.mem.nativeToLittle(u16, '!'),
    })));
    // 0xFEFF stored big-endian puts the bytes FE FF first in memory.
    try testing.expectEqual(Encoding.utf16, detectEncoding(std.mem.sliceAsBytes(&[_]u16{
        std.mem.nativeToBig(u16, 0xFE_FF),
        std.mem.nativeToBig(u16, 'a'),
        std.mem.nativeToBig(u16, 'a'),
        std.mem.nativeToBig(u16, 'a'),
        std.mem.nativeToBig(u16, '!'),
    })));
    // 0x0000FEFF stored big-endian puts the bytes 00 00 FE FF first in memory.
    try testing.expectEqual(Encoding.utf32, detectEncoding(std.mem.sliceAsBytes(&[_]u32{
        std.mem.nativeToBig(u32, 0x00_00_FE_FF),
        std.mem.nativeToBig(u32, 'a'),
        std.mem.nativeToBig(u32, 'a'),
        std.mem.nativeToBig(u32, 'a'),
        std.mem.nativeToBig(u32, '!'),
    })));
 }
 /// Result of `parse`: the document's code units, tagged with the detected
 /// encoding. Each payload is a slice of code units of the width that encoding
 /// uses (`u8`, `u16` or `u32`).
 pub const File = union(Encoding) {
    utf8: []const u8,
    utf16: []const u16,
    utf16_reverse: []const u16,
    utf32: []const u32,
 };
 /// Detects the encoding of `contents`, strips the preamble and returns the
 /// remaining bytes viewed as code units of the detected width.
 ///
 /// The returned slices alias `contents`; nothing is copied and no byte-order
 /// conversion is performed, so the code units are exactly as stored.
 ///
 /// Errors:
 /// - whatever `detectEncoding` returns for an unrecognized preamble,
 /// - `error.UnalignedContents` when a UTF-16/UTF-32 payload does not start at
 ///   an address aligned for its code unit type,
 /// - `error.IncompleteCodeUnit` when a UTF-16/UTF-32 payload length is not a
 ///   multiple of the code unit size.
 pub fn parse(contents: []const u8) !File {
    switch (try detectEncoding(contents)) {
        .utf8 => return .{ .utf8 = contents[3..] },
        .utf16 => return .{ .utf16 = try codeUnits(u16, contents[2..]) },
        .utf16_reverse => return .{ .utf16_reverse = try codeUnits(u16, contents[2..]) },
        .utf32 => return .{ .utf32 = try codeUnits(u32, contents[4..]) },
    }
 }
 /// Reinterprets `payload` as a slice of `T` without copying.
 ///
 /// Alignment and length are verified first: casting a misaligned pointer with
 /// `@alignCast` is a safety panic (undefined behavior in unchecked builds), and
 /// a length that is not a multiple of `@sizeOf(T)` would silently lose bytes.
 fn codeUnits(comptime T: type, payload: []const u8) ![]const T {
    if (!std.mem.isAligned(@intFromPtr(payload.ptr), @alignOf(T))) return error.UnalignedContents;
    if (payload.len % @sizeOf(T) != 0) return error.IncompleteCodeUnit;
    const units: [*]const T = @ptrCast(@alignCast(payload.ptr));
    return units[0 .. payload.len / @sizeOf(T)];
 }
 const testing = std.testing;
 const std = @import("std");
--- a/src/root.zig
+++ b/src/root.zig
@ -0,0 +1,7 @@
 pub const reliabletxt = @import("./reliabletxt.zig");
 pub const wsv = @import("./wsv.zig");
 // References both imported files from a test declaration; presumably this is
 // what pulls their own `test` blocks into `zig build test` for this root.
 test {
    _ = reliabletxt;
    _ = wsv;
 }
--- a/src/testdata/Example01_Table_UTF16.txt
+++ b/src/testdata/Example01_Table_UTF16.txt
--- a/src/testdata/Example01_Table_UTF16R.txt
+++ b/src/testdata/Example01_Table_UTF16R.txt
--- a/src/testdata/Example01_Table_UTF32.txt
+++ b/src/testdata/Example01_Table_UTF32.txt
--- a/src/testdata/Example01_Table_UTF8.txt
+++ b/src/testdata/Example01_Table_UTF8.txt
@ -0,0 +1,14 @@
 a 	U+0061    61            0061        "Latin Small Letter A"
 ~ 	U+007E    7E            007E        Tilde
 ¥ 	U+00A5    C2_A5         00A5        "Yen Sign"
 » 	U+00BB    C2_BB         00BB        "Right-Pointing Double Angle Quotation Mark"
 ½ 	U+00BD    C2_BD         00BD        "Vulgar Fraction One Half"
 ¿ 	U+00BF    C2_BF         00BF        "Inverted Question Mark"
 ß 	U+00DF    C3_9F         00DF        "Latin Small Letter Sharp S"
 ä 	U+00E4    C3_A4         00E4        "Latin Small Letter A with Diaeresis"
 ï 	U+00EF    C3_AF         00EF        "Latin Small Letter I with Diaeresis"
 œ 	U+0153    C5_93         0153        "Latin Small Ligature Oe"
 € 	U+20AC    E2_82_AC      20AC        "Euro Sign"
 東 	U+6771    E6_9D_B1      6771        "CJK Unified Ideograph-6771"
 𝄞 	U+1D11E   F0_9D_84_9E   D834_DD1E   "Musical Symbol G Clef"
 𠀇 	U+20007   F0_A0_80_87   D840_DC07   "CJK Unified Ideograph-20007"
--- a/src/wsv.zig
+++ b/src/wsv.zig
@ -0,0 +1,521 @@
 /// A parsed WSV table: rows of optional values, tagged with the encoding of
 /// the document it came from. Every level (table, row, value) is a separately
 /// allocated slice.
 pub const Table = union(reliabletxt.Encoding) {
    utf8: [][]?[]u8,
    utf16: [][]?[]u16,
    utf16_reverse: [][]?[]u16,
    utf32: [][]?[]u32,
    /// Releases every value, every row and the table itself.
    ///
    /// `gpa` must be the allocator that the table's slices were allocated with.
    pub fn free(this: @This(), gpa: std.mem.Allocator) void {
        switch (this) {
            // All variants share the same rows -> values -> code units shape,
            // so a single body instantiated per variant frees any of them.
            inline else => |table| {
                for (table) |row| {
                    for (row) |value_opt| {
                        if (value_opt) |value| gpa.free(value);
                    }
                    gpa.free(row);
                }
                gpa.free(table);
            },
        }
    }
 };
 /// States of the per-codepoint state machine used by `parseAlloc`.
 const ParseState = enum { default, string, string_double_quote, string_line_break_escape, comment };
 /// Parses a WSV document into an owned table of rows and values.
 ///
 /// `contents_any` must start with an encoding preamble. Only UTF-8 documents
 /// are handled; any other detected encoding returns `error.Unimplemented`.
 ///
 /// On success the caller owns the table and releases it with `Table.free`,
 /// passing the same `gpa`. On failure every row and value built so far is
 /// released before the error is returned.
 ///
 /// Errors: anything reported by `reliabletxt.parse` or by UTF-8 validation,
 /// allocation failure, `error.StringNotClosed` (a line break or the end of the
 /// input inside a quoted string) and `error.InvalidStringLineBreak` (`"/` not
 /// followed by `"`).
 ///
 /// NOTE(review): every produced cell is non-null; nothing here maps a value to
 /// `null` even though the table type allows it.
 pub fn parseAlloc(gpa: std.mem.Allocator, contents_any: []const u8) !Table {
    switch (try reliabletxt.parse(contents_any)) {
        .utf8 => |contents_utf8| {
            // Each list owns whatever is still inside it when the scope exits.
            // After a successful `toOwnedSlice` a list is empty, so on the
            // success path these blocks free nothing that was handed out.
            var table = std.ArrayList([]?[]u8).init(gpa);
            defer {
                for (table.items) |row| {
                    freeValues(gpa, row);
                    gpa.free(row);
                }
                table.deinit();
            }
            var line_buf = std.ArrayList(?[]u8).init(gpa);
            defer {
                freeValues(gpa, line_buf.items);
                line_buf.deinit();
            }
            var value_buf = std.ArrayList(u8).init(gpa);
            defer value_buf.deinit();

            const utf8_view = try std.unicode.Utf8View.init(contents_utf8);
            var utf8_iter = utf8_view.iterator();
            var state = ParseState.default;
            while (utf8_iter.nextCodepoint()) |codepoint| {
                switch (state) {
                    .default => switch (codepoint) {
                        '\n' => {
                            if (value_buf.items.len > 0) try finishValue(&line_buf, &value_buf);
                            try finishLine(&table, &line_buf);
                        },
                        '"' => state = .string,
                        ' ', '\t' => {
                            if (value_buf.items.len > 0) try finishValue(&line_buf, &value_buf);
                        },
                        '#' => {
                            // The row is emitted here; the comment state only
                            // skips the rest of the line.
                            if (value_buf.items.len > 0) try finishValue(&line_buf, &value_buf);
                            try finishLine(&table, &line_buf);
                            state = .comment;
                        },
                        else => try appendCodepoint(&value_buf, codepoint),
                    },
                    .string => switch (codepoint) {
                        '\n' => {
                            // TODO: diagnostic: string not closed
                            return error.StringNotClosed;
                        },
                        '"' => state = .string_double_quote,
                        else => try appendCodepoint(&value_buf, codepoint),
                    },
                    // A `"` was seen inside a string: it either closes the
                    // string or starts an escape (`""` or `"/"`).
                    .string_double_quote => switch (codepoint) {
                        '"' => {
                            try value_buf.append('"');
                            state = .string;
                        },
                        '/' => state = .string_line_break_escape,
                        '\n' => {
                            // The string is closed, so it is a value even when
                            // it is empty (`""`).
                            try finishValue(&line_buf, &value_buf);
                            try finishLine(&table, &line_buf);
                            // The next line must start outside of any string.
                            state = .default;
                        },
                        '#' => {
                            try finishValue(&line_buf, &value_buf);
                            try finishLine(&table, &line_buf);
                            state = .comment;
                        },
                        ' ', '\t' => {
                            try finishValue(&line_buf, &value_buf);
                            state = .default;
                        },
                        // NOTE(review): any other character directly after a
                        // closing quote is appended to the same value.
                        else => try appendCodepoint(&value_buf, codepoint),
                    },
                    .string_line_break_escape => switch (codepoint) {
                        '"' => {
                            try value_buf.append('\n');
                            state = .string;
                        },
                        else => {
                            // TODO: diagnostic: invalid string line break
                            return error.InvalidStringLineBreak;
                        },
                    },
                    .comment => switch (codepoint) {
                        '\n' => state = .default,
                        else => {},
                    },
                }
            }
            // End of input: close whatever the last line left open.
            switch (state) {
                .default => {
                    if (value_buf.items.len > 0) try finishValue(&line_buf, &value_buf);
                    try finishLine(&table, &line_buf);
                },
                .string_double_quote => {
                    try finishValue(&line_buf, &value_buf);
                    try finishLine(&table, &line_buf);
                },
                // The row was already emitted when the `#` was seen.
                .comment => {},
                .string => return error.StringNotClosed,
                .string_line_break_escape => return error.InvalidStringLineBreak,
            }
            const utf8_table = try table.toOwnedSlice();
            return .{ .utf8 = utf8_table };
        },
        else => return error.Unimplemented,
    }
 }
 /// Frees every non-null value in `row`; the row slice itself is left alone.
 fn freeValues(gpa: std.mem.Allocator, row: []const ?[]u8) void {
    for (row) |value_opt| {
        if (value_opt) |value| gpa.free(value);
    }
 }
 /// Moves the bytes collected in `value_buf` into a new cell at the end of
 /// `line_buf`, leaving `value_buf` empty.
 fn finishValue(line_buf: *std.ArrayList(?[]u8), value_buf: *std.ArrayList(u8)) !void {
    // Reserve the slot first so the owned slice can never be stranded.
    try line_buf.ensureUnusedCapacity(1);
    const value = try value_buf.toOwnedSlice();
    line_buf.appendAssumeCapacity(value);
 }
 /// Moves the cells collected in `line_buf` into a new row at the end of
 /// `table`, leaving `line_buf` empty.
 fn finishLine(table: *std.ArrayList([]?[]u8), line_buf: *std.ArrayList(?[]u8)) !void {
    // Reserve the slot first so the owned slice can never be stranded.
    try table.ensureUnusedCapacity(1);
    const line = try line_buf.toOwnedSlice();
    table.appendAssumeCapacity(line);
 }
 /// Appends `codepoint` to `value_buf` as UTF-8.
 fn appendCodepoint(value_buf: *std.ArrayList(u8), codepoint: u21) !void {
    var encoded: [4]u8 = undefined;
    const encoded_len = try std.unicode.utf8Encode(codepoint, &encoded);
    try value_buf.appendSlice(encoded[0..encoded_len]);
 }
 /// Test helper: compares two UTF-8 tables cell by cell.
 ///
 /// Every difference found is printed to stderr. Returns
 /// `error.TestExpectedEqual` when the row counts differ (checked first, nothing
 /// else is compared in that case), when any row differs in length, or when any
 /// cell differs. Two cells are equal when both are null or both hold the same
 /// bytes.
 fn expectEqualUTF8Tables(expected_table: []const []const ?[]const u8, actual_table: []const []const ?[]const u8) !void {
    var is_errors = false;
    if (expected_table.len != actual_table.len) {
        std.debug.print("Expected table to have {} rows, found {} rows\n", .{ expected_table.len, actual_table.len });
        return error.TestExpectedEqual;
    }
    for (expected_table, actual_table, 0..) |expected_row, actual_row, row| {
        // A multi-object `for` requires equal lengths, so report a row length
        // mismatch here instead of letting the loop below trip on it.
        if (expected_row.len != actual_row.len) {
            std.debug.print("at row {}\n    expected {} values\n       found {} values\n", .{ row, expected_row.len, actual_row.len });
            is_errors = true;
            continue;
        }
        for (expected_row, actual_row, 0..) |expected_value, actual_value, col| {
            if (expected_value) |expected| {
                if (actual_value) |actual| {
                    if (!std.mem.eql(u8, expected, actual)) {
                        std.debug.print(
                            \\at row {}, column {}
                            \\    expected "{}"
                            \\       found "{}"
                            \\
                        , .{ row, col, std.zig.fmtEscapes(expected), std.zig.fmtEscapes(actual) });
                        is_errors = true;
                    }
                } else {
                    std.debug.print(
                        \\at row {}, column {}
                        \\    expected "{}"
                        \\       found null
                        \\
                    , .{ row, col, std.zig.fmtEscapes(expected) });
                    is_errors = true;
                }
            } else if (actual_value) |actual| {
                std.debug.print(
                    \\at row {}, column {}
                    \\    expected null
                    \\       found "{}"
                    \\
                , .{ row, col, std.zig.fmtEscapes(actual) });
                is_errors = true;
            }
            // Both null: equal, nothing to report.
        }
    }
    if (is_errors) {
        return error.TestExpectedEqual;
    }
 }
 // Parses the embedded UTF-8 example table and compares it with the expected
 // rows written out by hand below.
 test parseAlloc {
    const table = try parseAlloc(testing.allocator, @embedFile("./testdata/Example01_Table_UTF8.txt"));
    defer table.free(testing.allocator);
    // The active tag must be checked before the `.utf8` payload is accessed.
    try testing.expectEqual(reliabletxt.Encoding.utf8, @as(reliabletxt.Encoding, table));
    const utf8_table = table.utf8;
    try expectEqualUTF8Tables(
        &.{
            &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" },
            &.{ "~", "U+007E", "7E", "007E", "Tilde" },
            &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" },
            &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" },
            &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" },
            &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" },
            &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" },
            &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" },
            &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" },
            &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" },
            &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" },
            &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" },
            &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" },
            &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" },
        },
        utf8_table,
    );
 }
 /// Decodes a quoted WSV string into `buffer` and returns the written prefix.
 ///
 /// `encoded_string` must include its surrounding double quotes. Inside them,
 /// `""` decodes to a single `"` and `"/"` decodes to a line feed; every other
 /// byte is copied unchanged.
 ///
 /// Errors:
 /// - `error.InvalidFormat` when the surrounding quotes are missing, when a raw
 ///   line feed appears, when a `"` starts anything other than the two escapes
 ///   above, or when the contents end in the middle of an escape.
 /// - `error.OutOfMemory` when `buffer` is too small for the decoded bytes.
 pub fn decodeString(encoded_string: []const u8, buffer: []u8) ![]const u8 {
    const State = enum {
        default,
        double_quote,
        double_quote_slash,
    };
    // Two bytes are the minimum: an opening and a closing quote. A lone `"`
    // would otherwise count as both and make the slice below invalid.
    if (encoded_string.len < 2 or encoded_string[0] != '"' or encoded_string[encoded_string.len - 1] != '"') return error.InvalidFormat;
    var state = State.default;
    var write_pos: usize = 0;
    for (encoded_string[1 .. encoded_string.len - 1]) |encoded_character| {
        // Each iteration either advances the escape state (and continues) or
        // yields exactly one decoded byte to store.
        const decoded_character: u8 = switch (state) {
            .default => switch (encoded_character) {
                '\n' => return error.InvalidFormat,
                '"' => {
                    state = .double_quote;
                    continue;
                },
                else => encoded_character,
            },
            .double_quote => switch (encoded_character) {
                '"' => '"',
                '/' => {
                    state = .double_quote_slash;
                    continue;
                },
                else => return error.InvalidFormat,
            },
            .double_quote_slash => switch (encoded_character) {
                '"' => '\n',
                else => return error.InvalidFormat,
            },
        };
        if (write_pos >= buffer.len) return error.OutOfMemory;
        buffer[write_pos] = decoded_character;
        write_pos += 1;
        state = .default;
    }
    // Ending anywhere but the default state means an escape was cut short.
    if (state != .default) return error.InvalidFormat;
    return buffer[0..write_pos];
 }
 // Covers the empty string, plain text, the `""` escape and the `"/"` escape.
 test decodeString {
    var buffer: [128]u8 = undefined;
    try testing.expectEqualStrings("", try decodeString("\"\"", &buffer));
    try testing.expectEqualStrings("Latin Small Letter A", try decodeString("\"Latin Small Letter A\"", &buffer));
    // Doubled quotes inside the string decode to single quotes.
    try testing.expectEqualStrings("See these \"quotes\" I'm making with my claw hands? It means I don't belive you.", try decodeString("\"See these \"\"quotes\"\" I'm making with my claw hands? It means I don't belive you.\"", &buffer));
    // `"/"` inside the string decodes to a line feed.
    try testing.expectEqualStrings("Line 1\nLine 2", try decodeString("\"Line 1\"/\"Line 2\"", &buffer));
 }
 /// Creates a non-allocating iterator over the items of a WSV document.
 ///
 /// `contents_any` is first handed to `reliabletxt.parse`; only a UTF-8 result
 /// is supported, any other encoding returns `error.Unimplemented`. Errors from
 /// `reliabletxt.parse` and from UTF-8 validation are passed through.
 pub fn parseIter(contents_any: []const u8) !Iterator {
    const file = try reliabletxt.parse(contents_any);
    switch (file) {
        .utf8 => |text| {
            // Validate once up front; the iterator then walks the checked bytes.
            const view = try std.unicode.Utf8View.init(text);
            const items = Utf8Iterator{ .utf8_iter = view.iterator() };
            return Iterator{ .utf8 = items };
        },
        else => return error.Unimplemented,
    }
 }
 /// Result of `parseIter`: an item iterator for the detected encoding.
 /// `parseIter` only ever produces the `utf8` variant.
 pub const Iterator = union(enum) {
    utf8: Utf8Iterator,
    // NOTE(review): `_` is the non-exhaustive marker for enums; confirm that it
    // is intended and accepted as a member of a union.
    _,
 };
 /// Pull-style tokenizer over a UTF-8 WSV document.
 ///
 /// Each call to `next` scans forward from the current position of `utf8_iter`
 /// and yields one `Item`. Yielded slices point into the iterated bytes; nothing
 /// is allocated or decoded (see `decodeString` for quoted strings).
 pub const Utf8Iterator = struct {
    utf8_iter: std.unicode.Utf8Iterator,
    pub const Item = union(enum) {
        /// End of a line. Also yielded for the line feed that ends a comment.
        newline,
        /// A value not surrounded by quotes. Can't include any whitespace.
        value: []const u8,
        /// A value surrounded by quotes. May include escaped double quotes or escaped newlines.
        /// The slice still contains the quotes and the escapes.
        string: []const u8,
        /// NOTE(review): never yielded by `next` as written.
        null,
    };
    const ParseState = enum { default, value, string, string_double_quote, string_line_break_escape, comment };
    /// Returns the next item, or `null` once the input is exhausted.
    ///
    /// Errors: `error.DoubleQuoteInValue` (a `"` inside an unquoted value),
    /// `error.StringNotClosed` (a line feed or the end of the input inside a
    /// quoted string) and `error.InvalidStringLineBreak` (`"/` not followed by
    /// `"`).
    pub fn next(this: *@This()) !?Item {
        var state = Utf8Iterator.ParseState.default;
        var value_start: usize = this.utf8_iter.i;
        while (this.utf8_iter.nextCodepoint()) |codepoint| {
            switch (state) {
                .default => switch (codepoint) {
                    '\n' => return Item.newline,
                    // `value_start` already points at this quote.
                    '"' => state = .string,
                    // Skip leading whitespace: the item starts after it.
                    ' ', '\t' => value_start = this.utf8_iter.i,
                    '#' => state = .comment,
                    else => state = .value,
                },
                .value => switch (codepoint) {
                    // TODO: Add other whitespace characters
                    '\n', '#', ' ', '\t' => {
                        // Un-read the terminator (always one byte) so the next
                        // call sees it and can report the newline or comment.
                        this.utf8_iter.i -= 1;
                        return Item{ .value = this.utf8_iter.bytes[value_start..this.utf8_iter.i] };
                    },
                    '"' => return error.DoubleQuoteInValue,
                    else => {},
                },
                .string => switch (codepoint) {
                    '\n' => {
                        // TODO: diagnostic: string not closed
                        return error.StringNotClosed;
                    },
                    '"' => state = .string_double_quote,
                    else => {},
                },
                .string_double_quote => switch (codepoint) {
                    '"' => state = .string,
                    '/' => state = .string_line_break_escape,
                    // TODO: Add other whitespace characters
                    '\n', '#', ' ', '\t' => {
                        // we roll back here so it can be handled in the next iteration of the loop
                        this.utf8_iter.i -= 1;
                        return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] };
                    },
                    else => {},
                },
                .string_line_break_escape => switch (codepoint) {
                    '"' => state = .string,
                    else => {
                        // TODO: diagnostic: invalid string line break
                        return error.InvalidStringLineBreak;
                    },
                },
                .comment => switch (codepoint) {
                    // The comment runs to the end of the line, and that line
                    // end still has to be reported to the caller.
                    '\n' => return Item.newline,
                    else => {},
                },
            }
        }
        // End of input: hand out an item that was still being scanned instead
        // of dropping it. The following call starts at the end and gets `null`.
        switch (state) {
            .default, .comment => return null,
            .value => return Item{ .value = this.utf8_iter.bytes[value_start..this.utf8_iter.i] },
            .string_double_quote => return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] },
            .string => return error.StringNotClosed,
            .string_line_break_escape => return error.InvalidStringLineBreak,
        }
    }
 };
 /// Test helper: drains `actual_table` and compares the items it yields with
 /// `expected_table`, position by position.
 ///
 /// Values are compared as yielded, quoted strings are first decoded with
 /// `decodeString` (into a 128 byte buffer), and a null item matches a null
 /// expectation. Every difference is printed to stderr and the function then
 /// returns `error.TestExpectedEqual`. Errors from the iterator and from
 /// `decodeString` are passed through.
 ///
 /// A row is checked for missing values when its newline arrives; a final row
 /// that is not terminated by a newline is only checked value by value.
 fn expectEqualUTF8TablesIter(expected_table: []const []const ?[]const u8, actual_table: Utf8Iterator) !void {
    var actual_table_iter = actual_table;
    var is_errors = false;
    var expected_row_index: usize = 0;
    var expected_value_index: usize = 0;
    while (try actual_table_iter.next()) |actual_parse_event| {
        // `>=`: an index equal to the length is already past the last row.
        if (expected_row_index >= expected_table.len) {
            std.debug.print("Expected table to have at most {} rows, found more rows\n", .{expected_table.len});
            is_errors = true;
            break;
        }
        const expected_row = expected_table[expected_row_index];
        var decode_buf: [128]u8 = undefined;
        // Reduce every kind of item to an optional byte slice so that a single
        // comparison below covers values, strings and nulls alike.
        const actual_value: ?[]const u8 = switch (actual_parse_event) {
            .newline => {
                if (expected_value_index < expected_row.len) {
                    std.debug.print("at row {}\n    expected {} values\n       found {} values\n", .{ expected_row_index, expected_row.len, expected_value_index });
                    is_errors = true;
                }
                expected_row_index += 1;
                expected_value_index = 0;
                continue;
            },
            .value => |actual_value_str| actual_value_str,
            .string => |actual_string_encoded| try decodeString(actual_string_encoded, &decode_buf),
            .null => null,
        };
        const column = expected_value_index;
        // Every non-newline item occupies one column, null items included.
        expected_value_index += 1;
        if (column >= expected_row.len) {
            std.debug.print("at row {}, column {}\n    expected end of row\n       found an extra value\n", .{ expected_row_index, column });
            is_errors = true;
            continue;
        }
        if (expected_row[column]) |expected| {
            if (actual_value) |actual| {
                if (!std.mem.eql(u8, expected, actual)) {
                    std.debug.print(
                        \\at row {}, column {}
                        \\    expected "{}"
                        \\       found "{}"
                        \\
                    , .{ expected_row_index, column, std.zig.fmtEscapes(expected), std.zig.fmtEscapes(actual) });
                    is_errors = true;
                }
            } else {
                std.debug.print(
                    \\at row {}, column {}
                    \\    expected "{}"
                    \\       found null
                    \\
                , .{ expected_row_index, column, std.zig.fmtEscapes(expected) });
                is_errors = true;
            }
        } else if (actual_value) |actual| {
            std.debug.print(
                \\at row {}, column {}
                \\    expected null
                \\       found "{}"
                \\
            , .{ expected_row_index, column, std.zig.fmtEscapes(actual) });
            is_errors = true;
        }
    }
    if (is_errors) {
        return error.TestExpectedEqual;
    }
 }
 // Iterates the embedded UTF-8 example table and compares the yielded items
 // with the expected rows written out by hand below.
 test parseIter {
    try expectEqualUTF8TablesIter(
        &.{
            &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" },
            &.{ "~", "U+007E", "7E", "007E", "Tilde" },
            &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" },
            &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" },
            &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" },
            &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" },
            &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" },
            &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" },
            &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" },
            &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" },
            &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" },
            &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" },
            &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" },
            &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" },
        },
        // `parseIter` returns a tagged union; the UTF-8 iterator is its payload.
        (try parseIter(@embedFile("./testdata/Example01_Table_UTF8.txt"))).utf8,
    );
 }
 const reliabletxt = @import("./reliabletxt.zig");
 const testing = std.testing;
 const std = @import("std");