From 151470e5d30d03e5d706383863f9fa25d9e0e235 Mon Sep 17 00:00:00 2001 From: geemili Date: Sat, 27 Jan 2024 20:15:14 -0700 Subject: [PATCH] feat: reliabletxt and wsv --- .gitignore | 2 + build.zig | 91 +++++ build.zig.zon | 62 +++ src/main.zig | 24 ++ src/reliabletxt.zig | 70 ++++ src/root.zig | 7 + src/testdata/Example01_Table_UTF16.txt | Bin 0 -> 1822 bytes src/testdata/Example01_Table_UTF16R.txt | Bin 0 -> 1822 bytes src/testdata/Example01_Table_UTF32.txt | Bin 0 -> 3636 bytes src/testdata/Example01_Table_UTF8.txt | 14 + src/wsv.zig | 521 ++++++++++++++++++++++++ 11 files changed, 791 insertions(+) create mode 100644 .gitignore create mode 100644 build.zig create mode 100644 build.zig.zon create mode 100644 src/main.zig create mode 100644 src/reliabletxt.zig create mode 100644 src/root.zig create mode 100644 src/testdata/Example01_Table_UTF16.txt create mode 100644 src/testdata/Example01_Table_UTF16R.txt create mode 100644 src/testdata/Example01_Table_UTF32.txt create mode 100644 src/testdata/Example01_Table_UTF8.txt create mode 100644 src/wsv.zig diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ee7098f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +zig-out/ +zig-cache/ diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..444b647 --- /dev/null +++ b/build.zig @@ -0,0 +1,91 @@ +const std = @import("std"); + +// Although this function looks imperative, note that its job is to +// declaratively construct a build graph that will be executed by an external +// runner. +pub fn build(b: *std.Build) void { + // Standard target options allows the person running `zig build` to choose + // what target to build for. Here we do not override the defaults, which + // means any target is allowed, and the default is native. Other options + // for restricting supported target set are available. 
+ const target = b.standardTargetOptions(.{}); + + // Standard optimization options allow the person running `zig build` to select + // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not + // set a preferred release mode, allowing the user to decide how to optimize. + const optimize = b.standardOptimizeOption(.{}); + + const lib = b.addStaticLibrary(.{ + .name = "stenway-formats", + // In this case the main source file is merely a path, however, in more + // complicated build scripts, this could be a generated file. + .root_source_file = .{ .path = "src/root.zig" }, + .target = target, + .optimize = optimize, + }); + + // This declares intent for the library to be installed into the standard + // location when the user invokes the "install" step (the default step when + // running `zig build`). + b.installArtifact(lib); + + const exe = b.addExecutable(.{ + .name = "stenway-formats", + .root_source_file = .{ .path = "src/main.zig" }, + .target = target, + .optimize = optimize, + }); + + // This declares intent for the executable to be installed into the + // standard location when the user invokes the "install" step (the default + // step when running `zig build`). + b.installArtifact(exe); + + // This *creates* a Run step in the build graph, to be executed when another + // step is evaluated that depends on it. The next line below will establish + // such a dependency. + const run_cmd = b.addRunArtifact(exe); + + // By making the run step depend on the install step, it will be run from the + // installation directory rather than directly from within the cache directory. + // This is not necessary, however, if the application depends on other installed + // files, this ensures they will be present and in the expected location. 
+ run_cmd.step.dependOn(b.getInstallStep()); + + // This allows the user to pass arguments to the application in the build + // command itself, like this: `zig build run -- arg1 arg2 etc` + if (b.args) |args| { + run_cmd.addArgs(args); + } + + // This creates a build step. It will be visible in the `zig build --help` menu, + // and can be selected like this: `zig build run` + // This will evaluate the `run` step rather than the default, which is "install". + const run_step = b.step("run", "Run the app"); + run_step.dependOn(&run_cmd.step); + + // Creates a step for unit testing. This only builds the test executable + // but does not run it. + const lib_unit_tests = b.addTest(.{ + .root_source_file = .{ .path = "src/root.zig" }, + .target = target, + .optimize = optimize, + }); + + const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests); + + const exe_unit_tests = b.addTest(.{ + .root_source_file = .{ .path = "src/main.zig" }, + .target = target, + .optimize = optimize, + }); + + const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); + + // Similar to creating the run step earlier, this exposes a `test` step to + // the `zig build --help` menu, providing a way for the user to request + // running the unit tests. + const test_step = b.step("test", "Run unit tests"); + test_step.dependOn(&run_lib_unit_tests.step); + test_step.dependOn(&run_exe_unit_tests.step); +} diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..239e4cc --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,62 @@ +.{ + .name = "stenway-formats", + // This is a [Semantic Version](https://semver.org/). + // In a future version of Zig it will be used for package deduplication. + .version = "0.0.0", + + // This field is optional. + // This is currently advisory only; Zig does not yet do anything + // with this value. + //.minimum_zig_version = "0.11.0", + + // This field is optional. + // Each dependency must either provide a `url` and `hash`, or a `path`. 
+ // `zig build --fetch` can be used to fetch all dependencies of a package, recursively. + // Once all dependencies are fetched, `zig build` no longer requires + // internet connectivity. + .dependencies = .{ + // See `zig fetch --save ` for a command-line interface for adding dependencies. + //.example = .{ + // // When updating this field to a new URL, be sure to delete the corresponding + // // `hash`, otherwise you are communicating that you expect to find the old hash at + // // the new URL. + // .url = "https://example.com/foo.tar.gz", + // + // // This is computed from the file contents of the directory of files that is + // // obtained after fetching `url` and applying the inclusion rules given by + // // `paths`. + // // + // // This field is the source of truth; packages do not come from a `url`; they + // // come from a `hash`. `url` is just one of many possible mirrors for how to + // // obtain a package matching this `hash`. + // // + // // Uses the [multihash](https://multiformats.io/multihash/) format. + // .hash = "...", + // + // // When this is provided, the package is found in a directory relative to the + // // build root. In this case the package's hash is irrelevant and therefore not + // // computed. This field and `url` are mutually exclusive. + // .path = "foo", + //}, + }, + + // Specifies the set of files and directories that are included in this package. + // Only files and directories listed here are included in the `hash` that + // is computed for this package. + // Paths are relative to the build root. Use the empty string (`""`) to refer to + // the build root itself. + // A directory listed here means that all files within, recursively, are included. + .paths = .{ + // This makes *all* files, recursively, included in this package. 
It is generally + // better to explicitly list the files and directories instead, to insure that + // fetching from tarballs, file system paths, and version control all result + // in the same contents hash. + "", + // For example... + //"build.zig", + //"build.zig.zon", + //"src", + //"LICENSE", + //"README.md", + }, +} diff --git a/src/main.zig b/src/main.zig new file mode 100644 index 0000000..c8a3f67 --- /dev/null +++ b/src/main.zig @@ -0,0 +1,24 @@ +const std = @import("std"); + +pub fn main() !void { + // Prints to stderr (it's a shortcut based on `std.io.getStdErr()`) + std.debug.print("All your {s} are belong to us.\n", .{"codebase"}); + + // stdout is for the actual output of your application, for example if you + // are implementing gzip, then only the compressed bytes should be sent to + // stdout, not any debugging messages. + const stdout_file = std.io.getStdOut().writer(); + var bw = std.io.bufferedWriter(stdout_file); + const stdout = bw.writer(); + + try stdout.print("Run `zig build test` to run the tests.\n", .{}); + + try bw.flush(); // don't forget to flush! +} + +test "simple test" { + var list = std.ArrayList(i32).init(std.testing.allocator); + defer list.deinit(); // try commenting this out and see if zig detects the memory leak! + try list.append(42); + try std.testing.expectEqual(@as(i32, 42), list.pop()); +} diff --git a/src/reliabletxt.zig b/src/reliabletxt.zig new file mode 100644 index 0000000..456e5e5 --- /dev/null +++ b/src/reliabletxt.zig @@ -0,0 +1,70 @@ +//! 
https://dev.stenway.com/ReliableTXT/Specification.html + +pub const Encoding = enum { + utf8, + /// Big Endian + utf16, + /// Little Endian + utf16_reverse, + /// Big Endian + utf32, +}; + +pub fn detectEncoding(contents: []const u8) !Encoding { + if (std.mem.startsWith(u8, contents, "\xEF\xBB\xBF")) { + return Encoding.utf8; + } else if (std.mem.startsWith(u8, contents, "\xFE\xFF")) { + return Encoding.utf16; + } else if (std.mem.startsWith(u8, contents, "\xFF\xFE")) { + return Encoding.utf16_reverse; + } else if (std.mem.startsWith(u8, contents, "\x00\x00\xFE\xFF")) { + return Encoding.utf32; + } + return error.InvalidEncoding; +} + +test detectEncoding { + try testing.expectEqual(Encoding.utf8, detectEncoding("\xEF\xBB\xBFaaa!")); + try testing.expectEqual(Encoding.utf16_reverse, detectEncoding(std.mem.sliceAsBytes(&[_]u16{ + std.mem.nativeToLittle(u16, 0xFE_FF), + std.mem.nativeToLittle(u16, 'a'), + std.mem.nativeToLittle(u16, 'a'), + std.mem.nativeToLittle(u16, 'a'), + std.mem.nativeToLittle(u16, '!'), + }))); + + try testing.expectEqual(Encoding.utf16, detectEncoding(std.mem.sliceAsBytes(&[_]u16{ + std.mem.nativeToBig(u16, 0xFE_FF), + std.mem.nativeToBig(u16, 'a'), + std.mem.nativeToBig(u16, 'a'), + std.mem.nativeToBig(u16, 'a'), + std.mem.nativeToBig(u16, '!'), + }))); + + try testing.expectEqual(Encoding.utf32, detectEncoding(std.mem.sliceAsBytes(&[_]u32{ + std.mem.nativeToBig(u32, 0x00_00_FE_FF), + std.mem.nativeToBig(u32, 'a'), + std.mem.nativeToBig(u32, 'a'), + std.mem.nativeToBig(u32, 'a'), + std.mem.nativeToBig(u32, '!'), + }))); +} + +pub const File = union(Encoding) { + utf8: []const u8, + utf16: []const u16, + utf16_reverse: []const u16, + utf32: []const u32, +}; + +pub fn parse(contents: []const u8) !File { + switch (try detectEncoding(contents)) { + .utf8 => return .{ .utf8 = contents[3..] }, + .utf16 => return .{ .utf16 = @as([*]const u16, @ptrCast(@alignCast(contents[2..])))[0 .. 
contents[2..].len / @sizeOf(u16)] }, + .utf16_reverse => return .{ .utf16_reverse = @as([*]const u16, @ptrCast(@alignCast(contents[2..])))[0 .. contents[2..].len / @sizeOf(u16)] }, + .utf32 => return .{ .utf32 = @as([*]const u32, @ptrCast(@alignCast(contents[4..])))[0 .. contents[4..].len / @sizeOf(u32)] }, + } +} + +const testing = std.testing; +const std = @import("std"); diff --git a/src/root.zig b/src/root.zig new file mode 100644 index 0000000..2ad2137 --- /dev/null +++ b/src/root.zig @@ -0,0 +1,7 @@ +pub const reliabletxt = @import("./reliabletxt.zig"); +pub const wsv = @import("./wsv.zig"); + +test { + _ = reliabletxt; + _ = wsv; +} diff --git a/src/testdata/Example01_Table_UTF16.txt b/src/testdata/Example01_Table_UTF16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a18ce79f3df373e8b3e54adb553b574b839ea4f GIT binary patch literal 1822 zcma)7O>fjN5Ph6b`y)`~E2_k{+wO8Bo20PN3Q8yvC)AG+A{J7XEr<*Mh8zC?A^t`W zMO?V^4+?MWo!H5)U}eSeym96|kL}6T9~>dV8V+$2x7c>jqwQKaR!raBxr!U?G2#pr z-Y`mz*R&>V5%GffjQx~W41;xi;>dFkkkNK694n@8;a=u{Le>dR$*_*kt;jL9o5%?^ z@tph~(YTS#z%q})Ge&hQs!(zy>c6%)Mi_NCLZr>H%wzDF`b)fIjyCWJwauZ+WXZ@W zk#ih#g$1pc8p`?jfOyTXT;6M|bD!1}udL2*9i8cdPLFhTE_w`}l5awGVTiEH{s<@J zuc$(D^pL%1yhnV37gob}+J2`;{7RYJeuRj6+vpBo-9*)qD~o2vQXe%pd-w| z&lZ2$;@{$VxE&yE{&22(Rz2l8cT=)-M)ut1;a5vXhCXfA5{QmCK-A;%5YAQ4sybqF zeBl1fs7FpxMOBhJ$ydckPV;Yi{0?P7K2(|CGv!xhi=I`LjsLIgS{^9t@!oFHb}gYL zd*>1Lh_d!v_54ej+&{TR=QcNTk_*XA^rca-$+$6+MnPt)$#7m%JT7eAxn4nL8EVeS zng4k`Dc?3MJFIVmpb}@S%d*at{V2dCVt1}*NpK$zIOC9arQ$uTxEi@BV_PF@^2*IU zrXPRJ+!f*c?D zXE;M1& literal 0 HcmV?d00001 diff --git a/src/testdata/Example01_Table_UTF16R.txt b/src/testdata/Example01_Table_UTF16R.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8e723af657c5225ed2b3a23822d1ff294f963e7 GIT binary patch literal 1822 zcma)7!EO^V5Ph5w{Rp)36;&c_Xu^%n?iK=6P?jQbLTS4!D*{_WTEvCF;l>{z#BcOK z#ijTD0r1Ao#NJJWR#qI(8++dK*q;6SXMzYfaDZF5&9;LcZP&uFV*1~ntGK~qOfW}< 
z_l%O`9jz%_M7$tAXFsDA!(annIP%;BWVBri$BOB{aIf<}C+if)WZ1yhM&uZGmXQ-| z;T8EkqH!aefps2(myGImRH5WZ)PHYqj4*0(gh-QPoyXuA^_O_V9Btw$YMVos$&!&% zBBway4hvc_HI$3-KJl8XT;6M|bC1>xZ>`QBEuHC#PLH&8u6hh!kZ($MVTiEHeu5+N zS5zT6dcs~bJ|I5DYpdZWZGX}uu2Lp<9wDM0cOL6J24ixZk!5CY)G?z;77Ba}bc8wh z-QZ6f{M)<_cLSu!AI?=z*ISWInl ziRUw?9=S;sRY~q7Ulo6Ho3H5cCzJ*GP-XtilwXytdb%nbcTkpJ^U8XBx7%FfT0%?C z&Lip(WzD(j`JXa*fAWe>ZEoZy7dAKFB2}=(xG|EYg3MNv;i9H^T-e%ky@Jd#)SQzu z|5{Hzl*+Qh`ZfqEamKnV`%Kx70$d`t=X#a|5AldI4)|6ozQc;Uk(V;IHL@nJyxbGs ziOppcxe5LAn-{92N6sFbR_0yqUzH4bS7ofe6hoebV5wcDxWOLxD>vG@tI$S zL)Hti?(>&$R4i)6z4Pn8lwWe_>b={SN%lqQmJ%16U4-S(V7(>7;timZcQ#jA{EJQ}=N z(4}LMxHw-^>*&j=^K+a!rd>1Sx%VLJe#+QBIt_Q=B{-)><2hDiz!flOAJ)J#)RTY_ zxX#=QG>F2F7XFop!Yqc-FK)@`t@36Y9DH>WoyK>q!-q(Ikv&I@9 zg8I~~cJ<{ud9{p%L7~57Ni$jLmnyO-LWUG7jkT!Dl&N)B4brQ|ITn_FDncnfOL1NA!IfPm};V|&o$M_(U&CqJM|zh~7|r~2o?@57~@oFLwk`up~6^VPos zV@OTe*N6F*yT*6Z{q%7c0*q%r{3x+5KbDv~SLUO-)_#n7a_amXryqU$W?o*8+7D}W z?{}~bdaOSIo{_OJ*jM_M{bhU)(zBn81NuCh)I%fIcICKdOYPY-YmVBpdNyOv zsjcA2Z)hG8a($p(ZtY74=7j@3Uv$`r$eI-OYHr&OOcXeaJjx`K&hI8vQ-= zZ1+G9)q!Ewd`wf5{k^@%+Z6}9#e@$^Am Z^|)P^+SQ$T#;mRDIo9~UdZA0>@DE)=-QxfN literal 0 HcmV?d00001 diff --git a/src/testdata/Example01_Table_UTF8.txt b/src/testdata/Example01_Table_UTF8.txt new file mode 100644 index 0000000..c5f6470 --- /dev/null +++ b/src/testdata/Example01_Table_UTF8.txt @@ -0,0 +1,14 @@ +a U+0061 61 0061 "Latin Small Letter A" +~ U+007E 7E 007E Tilde +¥ U+00A5 C2_A5 00A5 "Yen Sign" +» U+00BB C2_BB 00BB "Right-Pointing Double Angle Quotation Mark" +½ U+00BD C2_BD 00BD "Vulgar Fraction One Half" +¿ U+00BF C2_BF 00BF "Inverted Question Mark" +ß U+00DF C3_9F 00DF "Latin Small Letter Sharp S" +ä U+00E4 C3_A4 00E4 "Latin Small Letter A with Diaeresis" +ï U+00EF C3_AF 00EF "Latin Small Letter I with Diaeresis" +œ U+0153 C5_93 0153 "Latin Small Ligature Oe" +€ U+20AC E2_82_AC 20AC "Euro Sign" +東 U+6771 E6_9D_B1 6771 "CJK Unified Ideograph-6771" +𝄞 U+1D11E F0_9D_84_9E D834_DD1E "Musical Symbol G Clef" +𠀇 U+20007 F0_A0_80_87 
D840_DC07 "CJK Unified Ideograph-20007" \ No newline at end of file diff --git a/src/wsv.zig b/src/wsv.zig new file mode 100644 index 0000000..9185eb8 --- /dev/null +++ b/src/wsv.zig @@ -0,0 +1,521 @@ +const Table = union(reliabletxt.Encoding) { + utf8: [][]?[]u8, + utf16: [][]?[]u16, + utf16_reverse: [][]?[]u16, + utf32: [][]?[]u32, + + pub fn free(this: @This(), gpa: std.mem.Allocator) void { + switch (this) { + .utf8 => |table| { + for (table) |row| { + for (row) |value_opt| { + if (value_opt) |value| { + gpa.free(value); + } + } + gpa.free(row); + } + gpa.free(table); + }, + else => std.debug.panic("unimplemented", .{}), + } + } +}; + +const ParseState = enum { default, string, string_double_quote, string_line_break_escape, comment }; + +pub fn parseAlloc(gpa: std.mem.Allocator, contents_any: []const u8) !Table { + switch (try reliabletxt.parse(contents_any)) { + .utf8 => |contents_utf8| { + var table = std.ArrayList([]?[]u8).init(gpa); + defer table.deinit(); + + const utf8_view = try std.unicode.Utf8View.init(contents_utf8); + var utf8_iter = utf8_view.iterator(); + + var line_buf = std.ArrayList(?[]u8).init(gpa); + defer line_buf.deinit(); + + var value_buf = std.ArrayList(u8).init(gpa); + defer value_buf.deinit(); + + var state = ParseState.default; + while (utf8_iter.nextCodepoint()) |codepoint| { + switch (state) { + .default => switch (codepoint) { + '\n' => { + try table.ensureUnusedCapacity(1); + if (value_buf.items.len > 0) { + try line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + } + const line = try line_buf.toOwnedSlice(); + table.appendAssumeCapacity(line); + }, + '"' => state = .string, + ' ', + '\t', + => { + if (value_buf.items.len > 0) { + try line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + } + }, + '#' => { + try table.ensureUnusedCapacity(1); + if (value_buf.items.len > 0) { + try 
line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + } + const line = try line_buf.toOwnedSlice(); + table.appendAssumeCapacity(line); + state = .comment; + }, + else => |character| { + const codepoint_len = try std.unicode.utf8CodepointSequenceLength(character); + + try value_buf.ensureUnusedCapacity(codepoint_len); + const buf = value_buf.unusedCapacitySlice()[0..codepoint_len]; + + _ = try std.unicode.utf8Encode(character, buf); + + value_buf.items.len += codepoint_len; + }, + }, + .string => switch (codepoint) { + '\n' => { + // TODO: diagnostic: string not closed + return error.StringNotClosed; + }, + '"' => state = .string_double_quote, + else => |character| { + const codepoint_len = try std.unicode.utf8CodepointSequenceLength(character); + + try value_buf.ensureUnusedCapacity(codepoint_len); + const buf = value_buf.unusedCapacitySlice()[0..codepoint_len]; + + _ = try std.unicode.utf8Encode(character, buf); + + value_buf.items.len += codepoint_len; + }, + }, + .string_double_quote => switch (codepoint) { + '"' => { + try value_buf.append('"'); + state = .string; + }, + '/' => state = .string_line_break_escape, + '\n' => { + // The previous quote closed the string, so it is a value even when empty + // (same rule as the ' ' / '\t' arm below). + try table.ensureUnusedCapacity(1); + try line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + const line = try line_buf.toOwnedSlice(); + table.appendAssumeCapacity(line); + // Leave string mode; otherwise a `"` opening the next row is read as an escaped quote. + state = .default; + }, + '#' => { + // Same as '\n': the closed string is a value even when empty; the rest of the line is a comment. + try table.ensureUnusedCapacity(1); + try line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + const line = try line_buf.toOwnedSlice(); + table.appendAssumeCapacity(line); + state = .comment; + }, + ' ', + '\t', + => { + try line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + state = .default; + }, + else => 
|character| { + const codepoint_len = try std.unicode.utf8CodepointSequenceLength(character); + + try value_buf.ensureUnusedCapacity(codepoint_len); + const buf = value_buf.unusedCapacitySlice()[0..codepoint_len]; + + _ = try std.unicode.utf8Encode(character, buf); + + value_buf.items.len += codepoint_len; + }, + }, + .string_line_break_escape => switch (codepoint) { + '"' => { + try value_buf.append('\n'); + state = .string; + }, + else => { + // TODO: diagnostic: invalid string line break + return error.InvalidStringLineBreak; + }, + }, + .comment => switch (codepoint) { + '\n' => state = .default, + else => {}, + }, + } + } + + { + try table.ensureUnusedCapacity(1); + if (value_buf.items.len > 0) { + try line_buf.ensureUnusedCapacity(1); + const value = try value_buf.toOwnedSlice(); + line_buf.appendAssumeCapacity(value); + } + const line = try line_buf.toOwnedSlice(); + table.appendAssumeCapacity(line); + } + + const utf8_table = try table.toOwnedSlice(); + return .{ .utf8 = utf8_table }; + }, + else => return error.Unimplemented, + } +} + +fn expectEqualUTF8Tables(expected_table: []const []const ?[]const u8, actual_table: []const []const ?[]const u8) !void { + var is_errors = false; + if (expected_table.len != actual_table.len) { + std.debug.print("Expected table to have {} rows, found {} rows\n", .{ expected_table.len, actual_table.len }); + return error.TestExpectedEqual; + } + for (expected_table, actual_table, 0..) |expected_row, actual_row, row| { + for (expected_row, actual_row, 0..) |expected_value, actual_value, col| { + if (expected_value == null and actual_value != null) { + std.debug.print( + \\at row {}, column {} + \\ expected null + \\ found "{}" + \\ + , .{ row, col, std.zig.fmtEscapes(actual_value.?) }); + is_errors = true; + } + if (expected_value != null and actual_value == null) { + std.debug.print( + \\at row {}, column {} + \\ expected "{}" + \\ found null + \\ + , .{ row, col, std.zig.fmtEscapes(expected_value.?) 
}); + is_errors = true; + } + // Only compare contents when both sides are present; unwrapping a null with `.?` would panic. + if (expected_value != null and actual_value != null and !std.mem.eql(u8, expected_value.?, actual_value.?)) { + std.debug.print( + \\at row {}, column {} + \\ expected "{}" + \\ found "{}" + \\ + , .{ row, col, std.zig.fmtEscapes(expected_value.?), std.zig.fmtEscapes(actual_value.?) }); + is_errors = true; + } + } + } + + if (is_errors) { + return error.TestExpectedEqual; + } +} + +test parseAlloc { + const table = try parseAlloc(testing.allocator, @embedFile("./testdata/Example01_Table_UTF8.txt")); + defer table.free(testing.allocator); + + try testing.expectEqual(reliabletxt.Encoding.utf8, @as(reliabletxt.Encoding, table)); + const utf8_table = table.utf8; + + try expectEqualUTF8Tables( + &.{ + &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" }, + &.{ "~", "U+007E", "7E", "007E", "Tilde" }, + &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" }, + &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" }, + &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" }, + &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" }, + &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" }, + &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" }, + &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" }, + &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" }, + &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" }, + &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" }, + &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" }, + &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" }, + }, + utf8_table, + ); +} + +/// Decodes a quoted string, given together with its surrounding double quotes, into `buffer`. +/// Inside the quotes `""` yields one `"` and `"/"` yields a line feed; a raw line feed is rejected. +/// Returns the written prefix of `buffer`, `error.InvalidFormat` for malformed input, or +/// `error.OutOfMemory` when `buffer` is too small for the decoded text. +pub fn decodeString(encoded_string: []const u8, buffer: []u8) ![]const u8 { + const State = enum { + default, + double_quote, + double_quote_slash, + }; + // Both the opening and the closing quote are required; a lone `"` would otherwise slice [1..0] below. + if (encoded_string.len < 2 or encoded_string[0] != '"' or encoded_string[encoded_string.len - 1] != '"') return error.InvalidFormat; + var state = State.default; + var 
write_pos: usize = 0; + for (encoded_string[1 .. encoded_string.len - 1]) |encoded_character| { + switch (state) { + .default => switch (encoded_character) { + '\n' => return error.InvalidFormat, + '"' => state = .double_quote, + else => { + if (write_pos >= buffer.len) return error.OutOfMemory; + buffer[write_pos] = encoded_character; + write_pos += 1; + }, + }, + .double_quote => switch (encoded_character) { + '"' => { + if (write_pos >= buffer.len) return error.OutOfMemory; + buffer[write_pos] = encoded_character; + write_pos += 1; + state = .default; + }, + '/' => state = .double_quote_slash, + else => return error.InvalidFormat, + }, + .double_quote_slash => switch (encoded_character) { + '"' => { + if (write_pos >= buffer.len) return error.OutOfMemory; + buffer[write_pos] = '\n'; + write_pos += 1; + state = .default; + }, + else => return error.InvalidFormat, + }, + } + } + // Ending in any other state means the input stopped in the middle of a `""` or `"/"` escape. + if (state != .default) return error.InvalidFormat; + return buffer[0..write_pos]; +} + +test decodeString { + var buffer: [128]u8 = undefined; + try testing.expectError(error.InvalidFormat, decodeString("\"abc\"\"", &buffer)); + try testing.expectError(error.InvalidFormat, decodeString("\"abc\"/\"", &buffer)); + try testing.expectEqualStrings("", try decodeString("\"\"", &buffer)); + try testing.expectEqualStrings("Latin Small Letter A", try decodeString("\"Latin Small Letter A\"", &buffer)); + try testing.expectEqualStrings("See these \"quotes\" I'm making with my claw hands? It means I don't belive you.", try decodeString("\"See these \"\"quotes\"\" I'm making with my claw hands? 
It means I don't belive you.\"", &buffer)); + try testing.expectEqualStrings("Line 1\nLine 2", try decodeString("\"Line 1\"/\"Line 2\"", &buffer)); +} + +pub fn parseIter(contents_any: []const u8) !Iterator { + switch (try reliabletxt.parse(contents_any)) { + .utf8 => |contents_utf8| { + const utf8_view = try std.unicode.Utf8View.init(contents_utf8); + return Iterator{ .utf8 = .{ + .utf8_iter = utf8_view.iterator(), + } }; + }, + else => return error.Unimplemented, + } +} + +pub const Iterator = union(enum) { + utf8: Utf8Iterator, + _, +}; + +pub const Utf8Iterator = struct { + utf8_iter: std.unicode.Utf8Iterator, + + pub const Item = union(enum) { + newline, + /// A value not surrounded by quotes. Can't include any whitespace. + value: []const u8, + /// A value surrounded by quotes. May include escaped double quotes or escaped newlines. + string: []const u8, + null, + }; + + const ParseState = enum { default, value, string, string_double_quote, string_line_break_escape, comment }; + + pub fn next(this: *@This()) !?Item { + var state = Utf8Iterator.ParseState.default; + var value_start: usize = this.utf8_iter.i; + while (this.utf8_iter.nextCodepoint()) |codepoint| { + switch (state) { + .default => switch (codepoint) { + '\n' => return Item.newline, + '"' => state = .string, + + ' ', + '\t', + => value_start = this.utf8_iter.i, + + '#' => state = .comment, + else => state = .value, + }, + .value => switch (codepoint) { + // TODO: Add other whitespace characters + '\n', + ' ', + '\t', + => { + this.utf8_iter.i -= std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable; + return Item{ .value = this.utf8_iter.bytes[value_start..this.utf8_iter.i] }; + }, + + '"' => return error.DoubleQuoteInValue, + + else => {}, + }, + .string => switch (codepoint) { + '\n' => { + // TODO: diagnostic: string not closed + return error.StringNotClosed; + }, + '"' => state = .string_double_quote, + else => {}, + }, + .string_double_quote => switch (codepoint) { + '"' => 
state = .string, + '/' => state = .string_line_break_escape, + + // TODO: Add other whitespace characters + '\n', + '#', + ' ', + '\t', + => { + // we roll back here so it can be handled in the next iteration of the loop + this.utf8_iter.i -= std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable; + return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] }; + }, + + else => {}, + }, + .string_line_break_escape => switch (codepoint) { + '"' => state = .string, + else => { + // TODO: diagnostic: invalid string line break + return error.InvalidStringLineBreak; + }, + }, + .comment => switch (codepoint) { + // The line break that ends a comment still ends the row, so report it instead of swallowing it. + '\n' => return Item.newline, + else => {}, + }, + } + } + + // Input ended in the middle of a token: flush what was scanned instead of silently dropping it. + switch (state) { + .value => return Item{ .value = this.utf8_iter.bytes[value_start..this.utf8_iter.i] }, + .string_double_quote => return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] }, + .string => return error.StringNotClosed, + .string_line_break_escape => return error.InvalidStringLineBreak, + .default, .comment => return null, + } + } +}; + +fn expectEqualUTF8TablesIter(expected_table: []const []const ?[]const u8, actual_table: Utf8Iterator) !void { + var actual_table_iter = actual_table; + + var is_errors = false; + var expected_row_index: usize = 0; + var expected_value_index: usize = 0; + while (try actual_table_iter.next()) |actual_parse_event| { + // The arms below index expected_table[expected_row_index], so the index must stay strictly below len. + if (expected_row_index >= expected_table.len) { + std.debug.print("Expected table to have at most {} rows, found more rows\n", .{expected_row_index}); + is_errors = true; + break; + } + + switch (actual_parse_event) { + .newline => { + expected_row_index += 1; + expected_value_index = 0; + }, + .value => |actual_value_str| { + const expected_value = expected_table[expected_row_index][expected_value_index]; + if (expected_value == null) { + std.debug.print( + \\at row {}, column {} + \\ expected null + \\ found "{}" + \\ + , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(actual_value_str) }); + is_errors = true; + } else if (!std.mem.eql(u8, expected_value.?, actual_value_str)) { + std.debug.print( + \\at row {}, column {} + \\ expected "{}" + \\ found "{}" + \\ + , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected_value.?), std.zig.fmtEscapes(actual_value_str) }); + is_errors = true; + } + 
expected_value_index += 1; + }, + .string => |actual_string_encoded| { + var decode_buf: [128]u8 = undefined; + const actual_value_str = try decodeString(actual_string_encoded, &decode_buf); + + const expected_value = expected_table[expected_row_index][expected_value_index]; + + if (expected_value == null) { + std.debug.print( + \\at row {}, column {} + \\ expected null + \\ found "{}" + \\ + , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(actual_value_str) }); + is_errors = true; + } else if (!std.mem.eql(u8, expected_value.?, actual_value_str)) { + std.debug.print( + \\at row {}, column {} + \\ expected "{}" + \\ found "{}" + \\ + , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected_value.?), std.zig.fmtEscapes(actual_value_str) }); + is_errors = true; + } + expected_value_index += 1; + }, + .null => { + const expected_value = expected_table[expected_row_index][expected_value_index]; + if (expected_value != null) { + std.debug.print( + \\at row {}, column {} + \\ expected "{}" + \\ found null + \\ + , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected_value.?) 
}); + is_errors = true; + } + }, + } + } + + if (is_errors) { + return error.TestExpectedEqual; + } +} + +test parseIter { + try expectEqualUTF8TablesIter( + &.{ + &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" }, + &.{ "~", "U+007E", "7E", "007E", "Tilde" }, + &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" }, + &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" }, + &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" }, + &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" }, + &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" }, + &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" }, + &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" }, + &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" }, + &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" }, + &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" }, + &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" }, + &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" }, + }, + (try parseIter(@embedFile("./testdata/Example01_Table_UTF8.txt"))).utf8, + ); +} + +const reliabletxt = @import("./reliabletxt.zig"); +const testing = std.testing; +const std = @import("std");