feat: reliabletxt and wsv

2024-01-27 20:15:14 -07:00 · 2024-01-27 20:15:14 -07:00 · 151470e5d3
commit 151470e5d3
11 changed files with 791 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+zig-out/
+zig-cache/
--- a/build.zig
+++ b/build.zig
@ -0,0 +1,91 @@
+const std = @import("std");
+
+// Although this function looks imperative, note that its job is to
+// declaratively construct a build graph that will be executed by an external
+// runner.
+/// Declares the build graph for this package: a static library and an
+/// executable (both installed by default), a `run` step for the executable,
+/// and a `test` step that runs the unit tests of both root source files.
+pub fn build(b: *std.Build) void {
+    // Target and optimization mode are taken from the `zig build` command
+    // line; nothing is restricted or preselected here.
+    const build_target = b.standardTargetOptions(.{});
+    const build_mode = b.standardOptimizeOption(.{});
+
+    // Static library rooted at src/root.zig.
+    const static_lib = b.addStaticLibrary(.{
+        .name = "stenway-formats",
+        .root_source_file = .{ .path = "src/root.zig" },
+        .target = build_target,
+        .optimize = build_mode,
+    });
+    b.installArtifact(static_lib);
+
+    // Executable rooted at src/main.zig.
+    const app = b.addExecutable(.{
+        .name = "stenway-formats",
+        .root_source_file = .{ .path = "src/main.zig" },
+        .target = build_target,
+        .optimize = build_mode,
+    });
+    b.installArtifact(app);
+
+    // `zig build run [-- args...]`: run the executable after the install
+    // step, forwarding any extra command-line arguments to it.
+    const run_app = b.addRunArtifact(app);
+    run_app.step.dependOn(b.getInstallStep());
+    if (b.args) |forwarded_args| run_app.addArgs(forwarded_args);
+
+    const run_top_level = b.step("run", "Run the app");
+    run_top_level.dependOn(&run_app.step);
+
+    // `zig build test`: build and run the unit tests of the library root and
+    // of the executable root.
+    const lib_tests = b.addTest(.{
+        .root_source_file = .{ .path = "src/root.zig" },
+        .target = build_target,
+        .optimize = build_mode,
+    });
+    const run_lib_tests = b.addRunArtifact(lib_tests);
+
+    const app_tests = b.addTest(.{
+        .root_source_file = .{ .path = "src/main.zig" },
+        .target = build_target,
+        .optimize = build_mode,
+    });
+    const run_app_tests = b.addRunArtifact(app_tests);
+
+    const test_top_level = b.step("test", "Run unit tests");
+    test_top_level.dependOn(&run_lib_tests.step);
+    test_top_level.dependOn(&run_app_tests.step);
+}
--- a/build.zig.zon
+++ b/build.zig.zon
@ -0,0 +1,62 @@
+.{
+    .name = "stenway-formats",
+    // This is a [Semantic Version](https://semver.org/).
+    // In a future version of Zig it will be used for package deduplication.
+    .version = "0.0.0",
+
+    // This field is optional.
+    // This is currently advisory only; Zig does not yet do anything
+    // with this value.
+    //.minimum_zig_version = "0.11.0",
+
+    // This field is optional.
+    // Each dependency must either provide a `url` and `hash`, or a `path`.
+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
+    // Once all dependencies are fetched, `zig build` no longer requires
+    // internet connectivity.
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+        //},
+    },
+
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package.
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
+    .paths = .{
+        // This makes *all* files, recursively, included in this package. It is generally
+        // better to explicitly list the files and directories instead, to ensure that
+        // fetching from tarballs, file system paths, and version control all result
+        // in the same contents hash.
+        "",
+        // For example...
+        //"build.zig",
+        //"build.zig.zon",
+        //"src",
+        //"LICENSE",
+        //"README.md",
+    },
+}
--- a/src/main.zig
+++ b/src/main.zig
@ -0,0 +1,24 @@
+const std = @import("std");
+
+/// Program entry point: writes a greeting to stderr and a hint to stdout.
+pub fn main() !void {
+    // Diagnostic output goes to stderr via `std.debug.print`.
+    std.debug.print("All your {s} are belong to us.\n", .{"codebase"});
+
+    // Regular program output goes to stdout through a buffered writer, which
+    // must be flushed explicitly before returning.
+    var buffered_out = std.io.bufferedWriter(std.io.getStdOut().writer());
+    const out = buffered_out.writer();
+
+    try out.print("Run `zig build test` to run the tests.\n", .{});
+    try buffered_out.flush();
+}
+
+test "simple test" {
+    // The testing allocator reports a leak if `deinit` is ever skipped.
+    var numbers = std.ArrayList(i32).init(std.testing.allocator);
+    defer numbers.deinit();
+
+    try numbers.append(42);
+    const popped = numbers.pop();
+    try std.testing.expectEqual(@as(i32, 42), popped);
+}
--- a/src/reliabletxt.zig
+++ b/src/reliabletxt.zig
@ -0,0 +1,70 @@
+//! https://dev.stenway.com/ReliableTXT/Specification.html
+
+/// The text encodings recognized by `detectEncoding`, each identified by the
+/// byte-order mark at the start of the input.
+pub const Encoding = enum {
+    /// Input starts with the bytes EF BB BF.
+    utf8,
+    /// Big Endian; input starts with the bytes FE FF.
+    utf16,
+    /// Little Endian; input starts with the bytes FF FE.
+    utf16_reverse,
+    /// Big Endian; input starts with the bytes 00 00 FE FF.
+    utf32,
+};
+
+/// Identifies the encoding of `contents` from its leading byte-order mark.
+///
+/// Returns `error.InvalidEncoding` when `contents` does not start with one of
+/// the recognized marks (including when it is too short to hold one).
+pub fn detectEncoding(contents: []const u8) !Encoding {
+    const Signature = struct { mark: []const u8, encoding: Encoding };
+    // Checked in order; the first matching prefix wins.
+    const signatures = [_]Signature{
+        .{ .mark = "\xEF\xBB\xBF", .encoding = Encoding.utf8 },
+        .{ .mark = "\xFE\xFF", .encoding = Encoding.utf16 },
+        .{ .mark = "\xFF\xFE", .encoding = Encoding.utf16_reverse },
+        .{ .mark = "\x00\x00\xFE\xFF", .encoding = Encoding.utf32 },
+    };
+    for (signatures) |signature| {
+        if (std.mem.startsWith(u8, contents, signature.mark)) return signature.encoding;
+    }
+    return error.InvalidEncoding;
+}
+
+test detectEncoding {
+    // Every sample is the byte-order mark of one encoding followed by the
+    // text "aaa!" in that encoding, spelled out byte by byte.
+    const utf8_sample = [_]u8{ 0xEF, 0xBB, 0xBF, 'a', 'a', 'a', '!' };
+    const utf16_little_sample = [_]u8{ 0xFF, 0xFE, 'a', 0, 'a', 0, 'a', 0, '!', 0 };
+    const utf16_big_sample = [_]u8{ 0xFE, 0xFF, 0, 'a', 0, 'a', 0, 'a', 0, '!' };
+    const utf32_big_sample = [_]u8{
+        0, 0, 0xFE, 0xFF,
+        0, 0, 0,    'a',
+        0, 0, 0,    'a',
+        0, 0, 0,    'a',
+        0, 0, 0,    '!',
+    };
+
+    try testing.expectEqual(Encoding.utf8, detectEncoding(&utf8_sample));
+    try testing.expectEqual(Encoding.utf16_reverse, detectEncoding(&utf16_little_sample));
+    try testing.expectEqual(Encoding.utf16, detectEncoding(&utf16_big_sample));
+    try testing.expectEqual(Encoding.utf32, detectEncoding(&utf32_big_sample));
+}
+
+/// The payload of a ReliableTXT input, tagged with its detected encoding.
+///
+/// As produced by `parse`, each payload is a view of the input bytes that
+/// follow the byte-order mark, reinterpreted as code units of the encoding.
+/// `parse` performs no byte-order conversion, so code units hold the bytes
+/// exactly as they are laid out in the input.
+pub const File = union(Encoding) {
+    utf8: []const u8,
+    utf16: []const u16,
+    utf16_reverse: []const u16,
+    utf32: []const u32,
+};
+
+/// Detects the encoding of `contents`, strips the byte-order mark, and returns
+/// the remaining bytes viewed as code units of that encoding.
+///
+/// The result borrows from `contents`; nothing is copied or byte-swapped.
+///
+/// Errors:
+/// - `error.InvalidEncoding` (from `detectEncoding`) when no known byte-order
+///   mark is present.
+/// - `error.InvalidLength` when the payload of a UTF-16/UTF-32 input is not a
+///   whole number of code units (a trailing partial unit used to be dropped
+///   silently).
+/// - `error.UnalignedInput` when the payload of a UTF-16/UTF-32 input does not
+///   start at an address aligned for its code unit type. This used to trip the
+///   `@alignCast` safety check, or be undefined behavior in unchecked builds.
+pub fn parse(contents: []const u8) !File {
+    switch (try detectEncoding(contents)) {
+        .utf8 => return .{ .utf8 = contents[3..] },
+        .utf16 => return .{ .utf16 = try codeUnits(u16, contents[2..]) },
+        .utf16_reverse => return .{ .utf16_reverse = try codeUnits(u16, contents[2..]) },
+        .utf32 => return .{ .utf32 = try codeUnits(u32, contents[4..]) },
+    }
+}
+
+/// Reinterprets `bytes` as a slice of `T` without copying, after verifying
+/// that the length and the start address permit it.
+fn codeUnits(comptime T: type, bytes: []const u8) error{ InvalidLength, UnalignedInput }![]const T {
+    if (bytes.len % @sizeOf(T) != 0) return error.InvalidLength;
+    // An empty payload needs no backing memory, so its address is irrelevant.
+    if (bytes.len == 0) return &[_]T{};
+    if (!std.mem.isAligned(@intFromPtr(bytes.ptr), @alignOf(T))) return error.UnalignedInput;
+    // Alignment was verified above, so the cast cannot fail its safety check.
+    const units: [*]const T = @ptrCast(@alignCast(bytes.ptr));
+    return units[0 .. bytes.len / @sizeOf(T)];
+}
+
+const testing = std.testing;
+const std = @import("std");
--- a/src/root.zig
+++ b/src/root.zig
@ -0,0 +1,7 @@
+pub const reliabletxt = @import("./reliabletxt.zig");
+pub const wsv = @import("./wsv.zig");
+
+// Reference both modules from the root test block so that their `test`
+// declarations are picked up when this file is the test root.
+test {
+    _ = reliabletxt;
+    _ = wsv;
+}
--- a/src/testdata/Example01_Table_UTF16.txt
+++ b/src/testdata/Example01_Table_UTF16.txt
--- a/src/testdata/Example01_Table_UTF16R.txt
+++ b/src/testdata/Example01_Table_UTF16R.txt
--- a/src/testdata/Example01_Table_UTF32.txt
+++ b/src/testdata/Example01_Table_UTF32.txt
--- a/src/testdata/Example01_Table_UTF8.txt
+++ b/src/testdata/Example01_Table_UTF8.txt
@ -0,0 +1,14 @@
+a 	U+0061    61            0061        "Latin Small Letter A"
+~ 	U+007E    7E            007E        Tilde
+¥ 	U+00A5    C2_A5         00A5        "Yen Sign"
+» 	U+00BB    C2_BB         00BB        "Right-Pointing Double Angle Quotation Mark"
+½ 	U+00BD    C2_BD         00BD        "Vulgar Fraction One Half"
+¿ 	U+00BF    C2_BF         00BF        "Inverted Question Mark"
+ß 	U+00DF    C3_9F         00DF        "Latin Small Letter Sharp S"
+ä 	U+00E4    C3_A4         00E4        "Latin Small Letter A with Diaeresis"
+ï 	U+00EF    C3_AF         00EF        "Latin Small Letter I with Diaeresis"
+œ 	U+0153    C5_93         0153        "Latin Small Ligature Oe"
+€ 	U+20AC    E2_82_AC      20AC        "Euro Sign"
+東 	U+6771    E6_9D_B1      6771        "CJK Unified Ideograph-6771"
+𝄞 	U+1D11E   F0_9D_84_9E   D834_DD1E   "Musical Symbol G Clef"
+𠀇 	U+20007   F0_A0_80_87   D840_DC07   "CJK Unified Ideograph-20007"
--- a/src/wsv.zig
+++ b/src/wsv.zig
@ -0,0 +1,521 @@
+/// A parsed WSV table, tagged with the encoding of its values.
+///
+/// A table is a slice of rows, a row is a slice of values, and a value is
+/// either null or a slice of code units. Every slice is individually
+/// heap-allocated; `free` releases all of them.
+const Table = union(reliabletxt.Encoding) {
+    utf8: [][]?[]u8,
+    utf16: [][]?[]u16,
+    utf16_reverse: [][]?[]u16,
+    utf32: [][]?[]u32,
+
+    /// Frees every value, every row, and the table itself.
+    ///
+    /// `gpa` must be the allocator the table was allocated with. All encodings
+    /// are handled; previously any variant other than `utf8` panicked.
+    pub fn free(this: @This(), gpa: std.mem.Allocator) void {
+        switch (this) {
+            // The layout is the same for every encoding apart from the code
+            // unit type, so one body is instantiated per variant.
+            inline else => |table| {
+                for (table) |row| {
+                    for (row) |value_opt| {
+                        if (value_opt) |value| gpa.free(value);
+                    }
+                    gpa.free(row);
+                }
+                gpa.free(table);
+            },
+        }
+    }
+};
+
+/// States of the code-point state machine in `parseAlloc`:
+/// - `default`: outside any string or comment.
+/// - `string`: inside a double-quoted string.
+/// - `string_double_quote`: just after a `"` seen while inside a string.
+/// - `string_line_break_escape`: just after `"/` seen while inside a string.
+/// - `comment`: after `#`, until the next line feed.
+const ParseState = enum { default, string, string_double_quote, string_line_break_escape, comment };
+
+/// Parses a ReliableTXT-encoded WSV document into an owned `Table`.
+///
+/// Only UTF-8 input is handled; every other encoding returns
+/// `error.Unimplemented`. On success the caller owns the table and releases it
+/// with `Table.free` using the same allocator. On failure everything that was
+/// allocated along the way is freed before returning.
+///
+/// Syntax handled:
+/// - Rows are separated by '\n'; values within a row by ' ' or '\t'.
+/// - `#` outside a string starts a comment that runs to the end of the line.
+/// - A double-quoted string may contain `""` (a literal quote) and `"/"`
+///   (a line feed). An empty string `""` is kept as an empty value.
+/// - An unquoted `-` is stored as null.
+/// - The text after the last '\n' always forms a row, so the result has at
+///   least one row and a trailing '\n' yields a final empty row.
+///
+/// Errors, besides allocation failure and those of `reliabletxt.parse` and
+/// `std.unicode.Utf8View.init`:
+/// - `error.StringNotClosed`: a string runs into a line feed or the end of input.
+/// - `error.InvalidStringLineBreak`: `"/` is not followed by `"`.
+/// - `error.InvalidCharacterAfterString`: a closing quote is followed by
+///   something other than whitespace, `#`, a line feed or the end of input.
+/// - `error.DoubleQuoteInValue`: a `"` appears inside an unquoted value.
+pub fn parseAlloc(gpa: std.mem.Allocator, contents_any: []const u8) !Table {
+    switch (try reliabletxt.parse(contents_any)) {
+        .utf8 => |contents_utf8| {
+            const utf8_view = try std.unicode.Utf8View.init(contents_utf8);
+            var utf8_iter = utf8_view.iterator();
+
+            var builder = Utf8TableBuilder.init(gpa);
+            // On success the builder has handed everything over and holds
+            // nothing; on error this releases every partial row and value.
+            defer builder.deinit();
+
+            var state = ParseState.default;
+            while (utf8_iter.nextCodepoint()) |codepoint| {
+                switch (state) {
+                    // In `default`, a non-empty value buffer means an unquoted
+                    // value is in progress.
+                    .default => switch (codepoint) {
+                        '\n' => {
+                            try builder.endUnquotedValue();
+                            try builder.endRow();
+                        },
+                        '"' => {
+                            if (builder.value.items.len > 0) return error.DoubleQuoteInValue;
+                            state = .string;
+                        },
+                        ' ', '\t' => try builder.endUnquotedValue(),
+                        '#' => {
+                            try builder.endUnquotedValue();
+                            state = .comment;
+                        },
+                        else => try builder.appendCodepoint(codepoint),
+                    },
+                    .string => switch (codepoint) {
+                        // TODO: diagnostic: string not closed
+                        '\n' => return error.StringNotClosed,
+                        '"' => state = .string_double_quote,
+                        else => try builder.appendCodepoint(codepoint),
+                    },
+                    // A quote was seen inside a string: it is either an escape
+                    // prefix or the closing quote.
+                    .string_double_quote => switch (codepoint) {
+                        '"' => {
+                            try builder.value.append('"');
+                            state = .string;
+                        },
+                        '/' => state = .string_line_break_escape,
+                        '\n' => {
+                            try builder.endQuotedValue();
+                            try builder.endRow();
+                            state = .default;
+                        },
+                        '#' => {
+                            try builder.endQuotedValue();
+                            state = .comment;
+                        },
+                        ' ', '\t' => {
+                            try builder.endQuotedValue();
+                            state = .default;
+                        },
+                        // TODO: diagnostic: invalid character after string
+                        else => return error.InvalidCharacterAfterString,
+                    },
+                    .string_line_break_escape => switch (codepoint) {
+                        '"' => {
+                            try builder.value.append('\n');
+                            state = .string;
+                        },
+                        // TODO: diagnostic: invalid string line break
+                        else => return error.InvalidStringLineBreak,
+                    },
+                    // The row is only closed by the line feed that ends the
+                    // comment, so a comment on the last line adds no extra row.
+                    .comment => switch (codepoint) {
+                        '\n' => {
+                            try builder.endRow();
+                            state = .default;
+                        },
+                        else => {},
+                    },
+                }
+            }
+
+            // End of input: finish whatever was in progress, then close the
+            // final row.
+            switch (state) {
+                .default => try builder.endUnquotedValue(),
+                .string => return error.StringNotClosed,
+                .string_double_quote => try builder.endQuotedValue(),
+                .string_line_break_escape => return error.InvalidStringLineBreak,
+                .comment => {},
+            }
+            try builder.endRow();
+
+            return .{ .utf8 = try builder.rows.toOwnedSlice() };
+        },
+        else => return error.Unimplemented,
+    }
+}
+
+/// Accumulates the rows, the current row and the current value of a UTF-8
+/// table while `parseAlloc` scans its input.
+const Utf8TableBuilder = struct {
+    gpa: std.mem.Allocator,
+    rows: std.ArrayList([]?[]u8),
+    row: std.ArrayList(?[]u8),
+    value: std.ArrayList(u8),
+
+    /// Creates an empty builder; nothing is allocated until the first append.
+    fn init(gpa: std.mem.Allocator) Utf8TableBuilder {
+        return .{
+            .gpa = gpa,
+            .rows = std.ArrayList([]?[]u8).init(gpa),
+            .row = std.ArrayList(?[]u8).init(gpa),
+            .value = std.ArrayList(u8).init(gpa),
+        };
+    }
+
+    /// Frees every value and row still held by the builder.
+    fn deinit(self: *Utf8TableBuilder) void {
+        for (self.rows.items) |finished_row| {
+            for (finished_row) |value_opt| {
+                if (value_opt) |bytes| self.gpa.free(bytes);
+            }
+            self.gpa.free(finished_row);
+        }
+        self.rows.deinit();
+        for (self.row.items) |value_opt| {
+            if (value_opt) |bytes| self.gpa.free(bytes);
+        }
+        self.row.deinit();
+        self.value.deinit();
+    }
+
+    /// Appends the UTF-8 encoding of `codepoint` to the current value.
+    fn appendCodepoint(self: *Utf8TableBuilder, codepoint: u21) !void {
+        var encoded: [4]u8 = undefined;
+        const encoded_len = try std.unicode.utf8Encode(codepoint, &encoded);
+        try self.value.appendSlice(encoded[0..encoded_len]);
+    }
+
+    /// Moves a pending unquoted value into the current row; a lone `-`
+    /// becomes null. Does nothing when no value is pending.
+    fn endUnquotedValue(self: *Utf8TableBuilder) !void {
+        if (self.value.items.len == 0) return;
+        // Reserve the slot first so the value cannot be orphaned by a failed
+        // append after ownership has left the value buffer.
+        try self.row.ensureUnusedCapacity(1);
+        if (std.mem.eql(u8, self.value.items, "-")) {
+            self.value.clearRetainingCapacity();
+            self.row.appendAssumeCapacity(null);
+            return;
+        }
+        const owned = try self.value.toOwnedSlice();
+        self.row.appendAssumeCapacity(owned);
+    }
+
+    /// Moves the pending string into the current row, even when it is empty.
+    fn endQuotedValue(self: *Utf8TableBuilder) !void {
+        try self.row.ensureUnusedCapacity(1);
+        const owned = try self.value.toOwnedSlice();
+        self.row.appendAssumeCapacity(owned);
+    }
+
+    /// Moves the current row (possibly empty) into the table.
+    fn endRow(self: *Utf8TableBuilder) !void {
+        try self.rows.ensureUnusedCapacity(1);
+        const owned = try self.row.toOwnedSlice();
+        self.rows.appendAssumeCapacity(owned);
+    }
+};
+
+/// Test helper: compares two UTF-8 tables value by value.
+///
+/// Prints one report to stderr for every differing row length and every
+/// differing value, then returns `error.TestExpectedEqual` if anything
+/// differed. A difference in the number of rows is reported and returned
+/// immediately. Two null values compare equal.
+fn expectEqualUTF8Tables(expected_table: []const []const ?[]const u8, actual_table: []const []const ?[]const u8) !void {
+    if (expected_table.len != actual_table.len) {
+        std.debug.print("Expected table to have {} rows, found {} rows\n", .{ expected_table.len, actual_table.len });
+        return error.TestExpectedEqual;
+    }
+
+    var is_errors = false;
+    for (expected_table, actual_table, 0..) |expected_row, actual_row, row| {
+        if (expected_row.len != actual_row.len) {
+            std.debug.print("at row {}: expected {} values, found {} values\n", .{ row, expected_row.len, actual_row.len });
+            is_errors = true;
+        }
+
+        // Only the columns present in both rows can be compared; iterating
+        // two slices of different length in lockstep is illegal.
+        const shared_len = @min(expected_row.len, actual_row.len);
+        for (expected_row[0..shared_len], actual_row[0..shared_len], 0..) |expected_value, actual_value, col| {
+            if (expected_value) |expected| {
+                if (actual_value) |actual| {
+                    if (!std.mem.eql(u8, expected, actual)) {
+                        std.debug.print(
+                            \\at row {}, column {}
+                            \\    expected "{}"
+                            \\       found "{}"
+                            \\
+                        , .{ row, col, std.zig.fmtEscapes(expected), std.zig.fmtEscapes(actual) });
+                        is_errors = true;
+                    }
+                } else {
+                    std.debug.print(
+                        \\at row {}, column {}
+                        \\    expected "{}"
+                        \\       found null
+                        \\
+                    , .{ row, col, std.zig.fmtEscapes(expected) });
+                    is_errors = true;
+                }
+            } else if (actual_value) |actual| {
+                std.debug.print(
+                    \\at row {}, column {}
+                    \\    expected null
+                    \\       found "{}"
+                    \\
+                , .{ row, col, std.zig.fmtEscapes(actual) });
+                is_errors = true;
+            }
+        }
+    }
+
+    if (is_errors) {
+        return error.TestExpectedEqual;
+    }
+}
+
+test parseAlloc {
+    // The rows expected from the embedded UTF-8 example table.
+    const expected_rows: []const []const ?[]const u8 = &.{
+        &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" },
+        &.{ "~", "U+007E", "7E", "007E", "Tilde" },
+        &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" },
+        &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" },
+        &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" },
+        &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" },
+        &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" },
+        &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" },
+        &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" },
+        &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" },
+        &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" },
+        &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" },
+        &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" },
+        &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" },
+    };
+
+    const source = @embedFile("./testdata/Example01_Table_UTF8.txt");
+    const parsed = try parseAlloc(testing.allocator, source);
+    defer parsed.free(testing.allocator);
+
+    // The result must carry the UTF-8 tag before its payload is inspected.
+    try testing.expectEqual(reliabletxt.Encoding.utf8, @as(reliabletxt.Encoding, parsed));
+    try expectEqualUTF8Tables(expected_rows, parsed.utf8);
+}
+
+/// Decodes a double-quoted WSV string into `buffer` and returns the decoded
+/// bytes as a slice of `buffer`.
+///
+/// `encoded_string` must include its surrounding quotes. Between them, `""`
+/// decodes to a single `"` and `"/"` decodes to a line feed; every other byte
+/// is copied unchanged.
+///
+/// Errors:
+/// - `error.InvalidFormat`: the input is shorter than two bytes, is not
+///   wrapped in quotes, contains a raw line feed, contains a `"` that does not
+///   start one of the two escapes, or ends in the middle of an escape.
+/// - `error.OutOfMemory`: `buffer` is too small for the decoded text.
+pub fn decodeString(encoded_string: []const u8, buffer: []u8) ![]const u8 {
+    const State = enum {
+        default,
+        double_quote,
+        double_quote_slash,
+    };
+    // Two bytes are the minimum: with a single `"` the opening and closing
+    // quote would be the same byte and the inner slice would be inverted.
+    if (encoded_string.len < 2 or encoded_string[0] != '"' or encoded_string[encoded_string.len - 1] != '"') return error.InvalidFormat;
+    var state = State.default;
+    var write_pos: usize = 0;
+    for (encoded_string[1 .. encoded_string.len - 1]) |encoded_character| {
+        switch (state) {
+            .default => switch (encoded_character) {
+                '\n' => return error.InvalidFormat,
+                '"' => state = .double_quote,
+                else => {
+                    if (write_pos >= buffer.len) return error.OutOfMemory;
+                    buffer[write_pos] = encoded_character;
+                    write_pos += 1;
+                },
+            },
+            .double_quote => switch (encoded_character) {
+                '"' => {
+                    if (write_pos >= buffer.len) return error.OutOfMemory;
+                    buffer[write_pos] = encoded_character;
+                    write_pos += 1;
+                    state = .default;
+                },
+                '/' => state = .double_quote_slash,
+                else => return error.InvalidFormat,
+            },
+            .double_quote_slash => switch (encoded_character) {
+                '"' => {
+                    if (write_pos >= buffer.len) return error.OutOfMemory;
+                    buffer[write_pos] = '\n';
+                    write_pos += 1;
+                    state = .default;
+                },
+                else => return error.InvalidFormat,
+            },
+        }
+    }
+    // An escape that was started but never completed is malformed input.
+    if (state != .default) return error.InvalidFormat;
+    return buffer[0..write_pos];
+}
+
+test decodeString {
+    const Case = struct { encoded: []const u8, decoded: []const u8 };
+    // Covers the empty string, plain text, the `""` escape and the `"/"` escape.
+    const cases = [_]Case{
+        .{ .encoded = "\"\"", .decoded = "" },
+        .{ .encoded = "\"Latin Small Letter A\"", .decoded = "Latin Small Letter A" },
+        .{
+            .encoded = "\"See these \"\"quotes\"\" I'm making with my claw hands? It means I don't belive you.\"",
+            .decoded = "See these \"quotes\" I'm making with my claw hands? It means I don't belive you.",
+        },
+        .{ .encoded = "\"Line 1\"/\"Line 2\"", .decoded = "Line 1\nLine 2" },
+    };
+
+    var scratch: [128]u8 = undefined;
+    for (cases) |case| {
+        try testing.expectEqualStrings(case.decoded, try decodeString(case.encoded, &scratch));
+    }
+}
+
+/// Creates a non-allocating iterator over the items of a ReliableTXT-encoded
+/// WSV document.
+///
+/// Only UTF-8 input is handled; every other encoding returns
+/// `error.Unimplemented`. Errors from `reliabletxt.parse` and from UTF-8
+/// validation are passed through.
+pub fn parseIter(contents_any: []const u8) !Iterator {
+    const file = try reliabletxt.parse(contents_any);
+    if (file != .utf8) return error.Unimplemented;
+
+    const view = try std.unicode.Utf8View.init(file.utf8);
+    return Iterator{ .utf8 = .{ .utf8_iter = view.iterator() } };
+}
+
+/// The iterator returned by `parseIter`, tagged by the encoding it walks.
+/// Only the `utf8` variant is ever constructed in this file.
+pub const Iterator = union(enum) {
+    utf8: Utf8Iterator,
+    // NOTE(review): `_` presumably stands in for the encodings that are not
+    // implemented yet; confirm what this member is meant to express.
+    _,
+};
+
+/// Pull-style tokenizer over the UTF-8 text of a WSV document.
+///
+/// Each call to `next` yields one `Item`. Value and string payloads are slices
+/// of the iterated text, so nothing is allocated.
+pub const Utf8Iterator = struct {
+    utf8_iter: std.unicode.Utf8Iterator,
+
+    pub const Item = union(enum) {
+        /// End of a row: a line feed, including the one that ends a comment.
+        newline,
+        /// A value not surrounded by quotes. Can't include any whitespace.
+        value: []const u8,
+        /// A value surrounded by quotes. May include escaped double quotes or escaped newlines.
+        /// The slice still includes the quotes and escapes; see `decodeString`.
+        string: []const u8,
+        /// An unquoted `-`.
+        null,
+    };
+
+    const ParseState = enum { default, value, string, string_double_quote, string_line_break_escape, comment };
+
+    /// Returns the next item, or null once the text is exhausted.
+    ///
+    /// Errors:
+    /// - `error.DoubleQuoteInValue`: a `"` appears inside an unquoted value.
+    /// - `error.StringNotClosed`: a string runs into a line feed or the end
+    ///   of the text.
+    /// - `error.InvalidStringLineBreak`: `"/` is not followed by `"`.
+    /// - `error.InvalidCharacterAfterString`: a closing quote is followed by
+    ///   something other than whitespace, `#`, a line feed or the end of text.
+    pub fn next(this: *@This()) !?Item {
+        var state = Utf8Iterator.ParseState.default;
+        var value_start: usize = this.utf8_iter.i;
+        while (this.utf8_iter.nextCodepoint()) |codepoint| {
+            switch (state) {
+                .default => switch (codepoint) {
+                    '\n' => return Item.newline,
+                    '"' => state = .string,
+
+                    // Skip leading whitespace: the item starts after it.
+                    ' ',
+                    '\t',
+                    => value_start = this.utf8_iter.i,
+
+                    '#' => state = .comment,
+                    else => state = .value,
+                },
+                .value => switch (codepoint) {
+                    // TODO: Add other whitespace characters
+                    '\n',
+                    '#',
+                    ' ',
+                    '\t',
+                    => {
+                        // Roll back so the terminator is handled by the next call.
+                        this.utf8_iter.i -= std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
+                        return unquotedItem(this.utf8_iter.bytes[value_start..this.utf8_iter.i]);
+                    },
+
+                    '"' => return error.DoubleQuoteInValue,
+
+                    else => {},
+                },
+                .string => switch (codepoint) {
+                    '\n' => {
+                        // TODO: diagnostic: string not closed
+                        return error.StringNotClosed;
+                    },
+                    '"' => state = .string_double_quote,
+                    else => {},
+                },
+                .string_double_quote => switch (codepoint) {
+                    '"' => state = .string,
+                    '/' => state = .string_line_break_escape,
+
+                    // TODO: Add other whitespace characters
+                    '\n',
+                    '#',
+                    ' ',
+                    '\t',
+                    => {
+                        // we roll back here so it can be handled in the next iteration of the loop
+                        this.utf8_iter.i -= std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
+                        return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] };
+                    },
+
+                    // TODO: diagnostic: invalid character after string
+                    else => return error.InvalidCharacterAfterString,
+                },
+                .string_line_break_escape => switch (codepoint) {
+                    '"' => state = .string,
+                    else => {
+                        // TODO: diagnostic: invalid string line break
+                        return error.InvalidStringLineBreak;
+                    },
+                },
+                .comment => switch (codepoint) {
+                    // The line feed that ends a comment also ends the row.
+                    '\n' => return Item.newline,
+                    else => {},
+                },
+            }
+        }
+
+        // End of text: emit the item that was still in progress, if any.
+        switch (state) {
+            .default, .comment => return null,
+            .value => return unquotedItem(this.utf8_iter.bytes[value_start..this.utf8_iter.i]),
+            .string => return error.StringNotClosed,
+            .string_double_quote => return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] },
+            .string_line_break_escape => return error.InvalidStringLineBreak,
+        }
+    }
+
+    /// Classifies unquoted text: a lone `-` is the null marker, anything else
+    /// is a plain value.
+    fn unquotedItem(text: []const u8) Item {
+        if (std.mem.eql(u8, text, "-")) return Item.null;
+        return Item{ .value = text };
+    }
+};
+
+/// Test helper: walks `actual_table` and compares every item against
+/// `expected_table`.
+///
+/// Prints one report to stderr per differing value and returns
+/// `error.TestExpectedEqual` if anything differed, if the iterator yields more
+/// rows or values than expected, or if it ends before every expected value
+/// was seen. A single line feed after the last expected row is tolerated.
+fn expectEqualUTF8TablesIter(expected_table: []const []const ?[]const u8, actual_table: Utf8Iterator) !void {
+    var actual_table_iter = actual_table;
+
+    var is_errors = false;
+    var expected_row_index: usize = 0;
+    var expected_value_index: usize = 0;
+    while (try actual_table_iter.next()) |actual_parse_event| {
+        if (actual_parse_event == .newline) {
+            expected_row_index += 1;
+            expected_value_index = 0;
+            continue;
+        }
+
+        // Everything else is a value of some kind and needs an expected slot.
+        if (expected_row_index >= expected_table.len) {
+            std.debug.print("Expected table to have at most {} rows, found more rows\n", .{expected_table.len});
+            return error.TestExpectedEqual;
+        }
+        const expected_row = expected_table[expected_row_index];
+        if (expected_value_index >= expected_row.len) {
+            std.debug.print("at row {}: expected {} values, found more values\n", .{ expected_row_index, expected_row.len });
+            return error.TestExpectedEqual;
+        }
+        const expected_value = expected_row[expected_value_index];
+
+        var decode_buf: [128]u8 = undefined;
+        const actual_value: ?[]const u8 = switch (actual_parse_event) {
+            .newline => unreachable, // handled above
+            .value => |actual_value_str| actual_value_str,
+            .string => |actual_string_encoded| try decodeString(actual_string_encoded, &decode_buf),
+            .null => null,
+        };
+
+        if (expected_value) |expected| {
+            if (actual_value) |actual| {
+                if (!std.mem.eql(u8, expected, actual)) {
+                    std.debug.print(
+                        \\at row {}, column {}
+                        \\    expected "{}"
+                        \\       found "{}"
+                        \\
+                    , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected), std.zig.fmtEscapes(actual) });
+                    is_errors = true;
+                }
+            } else {
+                std.debug.print(
+                    \\at row {}, column {}
+                    \\    expected "{}"
+                    \\       found null
+                    \\
+                , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected) });
+                is_errors = true;
+            }
+        } else if (actual_value) |actual| {
+            std.debug.print(
+                \\at row {}, column {}
+                \\    expected null
+                \\       found "{}"
+                \\
+            , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(actual) });
+            is_errors = true;
+        }
+        expected_value_index += 1;
+    }
+
+    // The iterator must have stopped either at the end of the last expected
+    // row or directly after a line feed that closed it.
+    const ended_in_last_row = expected_row_index + 1 == expected_table.len and
+        expected_value_index == expected_table[expected_row_index].len;
+    const ended_after_last_row = expected_row_index == expected_table.len and expected_value_index == 0;
+    if (!ended_in_last_row and !ended_after_last_row) {
+        std.debug.print("Expected table to have {} rows, iteration ended at row {}, column {}\n", .{ expected_table.len, expected_row_index, expected_value_index });
+        is_errors = true;
+    }
+
+    if (is_errors) {
+        return error.TestExpectedEqual;
+    }
+}
+
+test parseIter {
+    // The rows expected from the embedded UTF-8 example table.
+    const expected_rows: []const []const ?[]const u8 = &.{
+        &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" },
+        &.{ "~", "U+007E", "7E", "007E", "Tilde" },
+        &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" },
+        &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" },
+        &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" },
+        &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" },
+        &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" },
+        &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" },
+        &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" },
+        &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" },
+        &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" },
+        &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" },
+        &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" },
+        &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" },
+    };
+
+    const source = @embedFile("./testdata/Example01_Table_UTF8.txt");
+    const iterator = try parseIter(source);
+    try expectEqualUTF8TablesIter(expected_rows, iterator.utf8);
+}
+
+const reliabletxt = @import("./reliabletxt.zig");
+const testing = std.testing;
+const std = @import("std");