feat: reliabletxt and wsv
commit
151470e5d3
|
@ -0,0 +1,2 @@
|
|||
zig-out/
|
||||
zig-cache/
|
|
@ -0,0 +1,91 @@
|
|||
const std = @import("std");
|
||||
|
||||
// The body below reads like a script, but it only declares a build graph;
// an external runner is what actually executes the declared steps.
pub fn build(b: *std.Build) void {
    // No overrides: any target is accepted and the default is native.
    const target = b.standardTargetOptions(.{});

    // No preferred mode: the person running `zig build` picks between
    // Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall.
    const optimize = b.standardOptimizeOption(.{});

    // Static library rooted at src/root.zig. The root source is a plain
    // path here; a generated file could be used in its place.
    const static_lib = b.addStaticLibrary(.{
        .name = "stenway-formats",
        .root_source_file = .{ .path = "src/root.zig" },
        .target = target,
        .optimize = optimize,
    });

    // Installed by the "install" step, which is the default for `zig build`.
    b.installArtifact(static_lib);

    const app = b.addExecutable(.{
        .name = "stenway-formats",
        .root_source_file = .{ .path = "src/main.zig" },
        .target = target,
        .optimize = optimize,
    });

    // The executable is installed by the default step as well.
    b.installArtifact(app);

    // A Run step for the executable. It only runs when a step that depends
    // on it is evaluated.
    const run_app = b.addRunArtifact(app);

    // Run from the installation directory instead of the cache directory,
    // so any other installed files are present where the app expects them.
    run_app.step.dependOn(b.getInstallStep());

    // Forward `zig build run -- arg1 arg2 etc` to the application.
    if (b.args) |forwarded_args| {
        run_app.addArgs(forwarded_args);
    }

    // `zig build run`: listed in `zig build --help`, evaluated instead of
    // the default "install" step.
    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_app.step);

    // Test executables are only built by `addTest`; the matching Run steps
    // below are what execute them.
    const lib_tests = b.addTest(.{
        .root_source_file = .{ .path = "src/root.zig" },
        .target = target,
        .optimize = optimize,
    });
    const run_lib_tests = b.addRunArtifact(lib_tests);

    const app_tests = b.addTest(.{
        .root_source_file = .{ .path = "src/main.zig" },
        .target = target,
        .optimize = optimize,
    });
    const run_app_tests = b.addRunArtifact(app_tests);

    // `zig build test`: runs both unit test executables.
    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&run_lib_tests.step);
    test_step.dependOn(&run_app_tests.step);
}
|
|
@ -0,0 +1,62 @@
|
|||
.{
|
||||
.name = "stenway-formats",
|
||||
// This is a [Semantic Version](https://semver.org/).
|
||||
// In a future version of Zig it will be used for package deduplication.
|
||||
.version = "0.0.0",
|
||||
|
||||
// This field is optional.
|
||||
// This is currently advisory only; Zig does not yet do anything
|
||||
// with this value.
|
||||
//.minimum_zig_version = "0.11.0",
|
||||
|
||||
// This field is optional.
|
||||
// Each dependency must either provide a `url` and `hash`, or a `path`.
|
||||
// `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
|
||||
// Once all dependencies are fetched, `zig build` no longer requires
|
||||
// internet connectivity.
|
||||
.dependencies = .{
|
||||
// See `zig fetch --save <url>` for a command-line interface for adding dependencies.
|
||||
//.example = .{
|
||||
// // When updating this field to a new URL, be sure to delete the corresponding
|
||||
// // `hash`, otherwise you are communicating that you expect to find the old hash at
|
||||
// // the new URL.
|
||||
// .url = "https://example.com/foo.tar.gz",
|
||||
//
|
||||
// // This is computed from the file contents of the directory of files that is
|
||||
// // obtained after fetching `url` and applying the inclusion rules given by
|
||||
// // `paths`.
|
||||
// //
|
||||
// // This field is the source of truth; packages do not come from a `url`; they
|
||||
// // come from a `hash`. `url` is just one of many possible mirrors for how to
|
||||
// // obtain a package matching this `hash`.
|
||||
// //
|
||||
// // Uses the [multihash](https://multiformats.io/multihash/) format.
|
||||
// .hash = "...",
|
||||
//
|
||||
// // When this is provided, the package is found in a directory relative to the
|
||||
// // build root. In this case the package's hash is irrelevant and therefore not
|
||||
// // computed. This field and `url` are mutually exclusive.
|
||||
// .path = "foo",
|
||||
//},
|
||||
},
|
||||
|
||||
// Specifies the set of files and directories that are included in this package.
|
||||
// Only files and directories listed here are included in the `hash` that
|
||||
// is computed for this package.
|
||||
// Paths are relative to the build root. Use the empty string (`""`) to refer to
|
||||
// the build root itself.
|
||||
// A directory listed here means that all files within, recursively, are included.
|
||||
.paths = .{
|
||||
// This makes *all* files, recursively, included in this package. It is generally
|
||||
// better to explicitly list the files and directories instead, to ensure that
|
||||
// fetching from tarballs, file system paths, and version control all result
|
||||
// in the same contents hash.
|
||||
"",
|
||||
// For example...
|
||||
//"build.zig",
|
||||
//"build.zig.zon",
|
||||
//"src",
|
||||
//"LICENSE",
|
||||
//"README.md",
|
||||
},
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
const std = @import("std");
|
||||
|
||||
/// Entry point: prints a greeting to stderr and a usage hint to stdout.
pub fn main() !void {
    // `std.debug.print` writes to stderr, which keeps diagnostics out of
    // the program's real output.
    std.debug.print("All your {s} are belong to us.\n", .{"codebase"});

    // stdout carries the actual output of the application (for a gzip-like
    // tool that would be the compressed bytes only). It is buffered here,
    // so nothing is visible until the explicit flush below.
    var buffered_stdout = std.io.bufferedWriter(std.io.getStdOut().writer());
    const out = buffered_stdout.writer();

    try out.print("Run `zig build test` to run the tests.\n", .{});

    try buffered_stdout.flush();
}
|
||||
|
||||
test "simple test" {
    const answer: i32 = 42;

    var numbers = std.ArrayList(i32).init(std.testing.allocator);
    // Dropping this `deinit` makes the testing allocator report a leak.
    defer numbers.deinit();

    try numbers.append(answer);
    try std.testing.expectEqual(answer, numbers.pop());
}
|
|
@ -0,0 +1,70 @@
|
|||
//! https://dev.stenway.com/ReliableTXT/Specification.html
|
||||
|
||||
/// The text encodings this module tells apart by their byte order mark.
pub const Encoding = enum {
    /// UTF-8.
    utf8,
    /// UTF-16, big endian.
    utf16,
    /// UTF-16, little endian.
    utf16_reverse,
    /// UTF-32, big endian.
    utf32,
};
|
||||
|
||||
/// Identifies the encoding of `contents` from its leading byte order mark.
///
/// Returns `error.InvalidEncoding` when `contents` starts with none of the
/// known marks (this includes empty input).
pub fn detectEncoding(contents: []const u8) !Encoding {
    const Mark = struct { bytes: []const u8, encoding: Encoding };

    // Candidates are tried in this order; the first matching prefix wins.
    const marks = [_]Mark{
        .{ .bytes = "\xEF\xBB\xBF", .encoding = Encoding.utf8 },
        .{ .bytes = "\xFE\xFF", .encoding = Encoding.utf16 },
        .{ .bytes = "\xFF\xFE", .encoding = Encoding.utf16_reverse },
        .{ .bytes = "\x00\x00\xFE\xFF", .encoding = Encoding.utf32 },
    };

    for (marks) |mark| {
        if (std.mem.startsWith(u8, contents, mark.bytes)) return mark.encoding;
    }
    return error.InvalidEncoding;
}
|
||||
|
||||
test detectEncoding {
    const toLittle = std.mem.nativeToLittle;
    const toBig = std.mem.nativeToBig;

    // UTF-8: the mark followed by plain ASCII text.
    try testing.expectEqual(Encoding.utf8, detectEncoding("\xEF\xBB\xBFaaa!"));

    // UTF-16 little endian: every code unit is stored low byte first.
    const utf16_little = [_]u16{
        toLittle(u16, 0xFE_FF),
        toLittle(u16, 'a'),
        toLittle(u16, 'a'),
        toLittle(u16, 'a'),
        toLittle(u16, '!'),
    };
    try testing.expectEqual(Encoding.utf16_reverse, detectEncoding(std.mem.sliceAsBytes(&utf16_little)));

    // UTF-16 big endian.
    const utf16_big = [_]u16{
        toBig(u16, 0xFE_FF),
        toBig(u16, 'a'),
        toBig(u16, 'a'),
        toBig(u16, 'a'),
        toBig(u16, '!'),
    };
    try testing.expectEqual(Encoding.utf16, detectEncoding(std.mem.sliceAsBytes(&utf16_big)));

    // UTF-32 big endian.
    const utf32_big = [_]u32{
        toBig(u32, 0x00_00_FE_FF),
        toBig(u32, 'a'),
        toBig(u32, 'a'),
        toBig(u32, 'a'),
        toBig(u32, '!'),
    };
    try testing.expectEqual(Encoding.utf32, detectEncoding(std.mem.sliceAsBytes(&utf32_big)));
}
|
||||
|
||||
/// The content of a file viewed as code units of its encoding.
/// The active tag names the encoding; the payload is a slice of code units.
pub const File = union(Encoding) {
    /// UTF-8 code units (bytes).
    utf8: []const u8,
    /// 16-bit code units of big endian UTF-16 input.
    utf16: []const u16,
    /// 16-bit code units of little endian UTF-16 input.
    utf16_reverse: []const u16,
    /// 32-bit code units of big endian UTF-32 input.
    utf32: []const u32,
};
|
||||
|
||||
/// Reinterprets `bytes` as a slice of `T` code units without copying.
/// Returns `error.MisalignedInput` when `bytes` does not start at an address
/// aligned for `T`; trailing bytes that do not fill a whole `T` are ignored.
fn bytesAsCodeUnits(comptime T: type, bytes: []const u8) error{MisalignedInput}![]const T {
    // An unchecked `@alignCast` on a misaligned pointer panics in safe
    // builds and is undefined behavior in ReleaseFast, so verify first.
    if (!std.mem.isAligned(@intFromPtr(bytes.ptr), @alignOf(T))) return error.MisalignedInput;
    const units: [*]const T = @ptrCast(@alignCast(bytes.ptr));
    return units[0 .. bytes.len / @sizeOf(T)];
}

/// Detects the encoding of `contents` from its byte order mark and returns
/// the bytes after the mark as a slice of that encoding's code units.
///
/// The result borrows from `contents`; nothing is copied or byte-swapped.
///
/// Errors:
/// - `error.InvalidEncoding` when no known byte order mark is present.
/// - `error.MisalignedInput` when the UTF-16/UTF-32 payload does not start
///   at an address suitably aligned for its code unit type.
pub fn parse(contents: []const u8) !File {
    switch (try detectEncoding(contents)) {
        .utf8 => return File{ .utf8 = contents[3..] },
        .utf16 => return File{ .utf16 = try bytesAsCodeUnits(u16, contents[2..]) },
        .utf16_reverse => return File{ .utf16_reverse = try bytesAsCodeUnits(u16, contents[2..]) },
        .utf32 => return File{ .utf32 = try bytesAsCodeUnits(u32, contents[4..]) },
    }
}
|
||||
|
||||
const testing = std.testing;
|
||||
const std = @import("std");
|
|
@ -0,0 +1,7 @@
|
|||
pub const reliabletxt = @import("./reliabletxt.zig");
|
||||
pub const wsv = @import("./wsv.zig");
|
||||
|
||||
// Reference both modules from a test block so that the tests declared in
// them are included when this file is the test root.
test {
    _ = reliabletxt;
    _ = wsv;
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,14 @@
|
|||
a U+0061 61 0061 "Latin Small Letter A"
|
||||
~ U+007E 7E 007E Tilde
|
||||
¥ U+00A5 C2_A5 00A5 "Yen Sign"
|
||||
» U+00BB C2_BB 00BB "Right-Pointing Double Angle Quotation Mark"
|
||||
½ U+00BD C2_BD 00BD "Vulgar Fraction One Half"
|
||||
¿ U+00BF C2_BF 00BF "Inverted Question Mark"
|
||||
ß U+00DF C3_9F 00DF "Latin Small Letter Sharp S"
|
||||
ä U+00E4 C3_A4 00E4 "Latin Small Letter A with Diaeresis"
|
||||
ï U+00EF C3_AF 00EF "Latin Small Letter I with Diaeresis"
|
||||
œ U+0153 C5_93 0153 "Latin Small Ligature Oe"
|
||||
€ U+20AC E2_82_AC 20AC "Euro Sign"
|
||||
東 U+6771 E6_9D_B1 6771 "CJK Unified Ideograph-6771"
|
||||
𝄞 U+1D11E F0_9D_84_9E D834_DD1E "Musical Symbol G Clef"
|
||||
𠀇 U+20007 F0_A0_80_87 D840_DC07 "CJK Unified Ideograph-20007"
|
|
@ -0,0 +1,521 @@
|
|||
/// A fully parsed table: a slice of rows, each row a slice of optional
/// values, stored as code units of the encoding named by the active tag.
const Table = union(reliabletxt.Encoding) {
    utf8: [][]?[]u8,
    utf16: [][]?[]u16,
    utf16_reverse: [][]?[]u16,
    utf32: [][]?[]u32,

    /// Frees every non-null value, every row, and the table slice itself.
    /// `gpa` must be the allocator that all of them were allocated with.
    pub fn free(this: @This(), gpa: std.mem.Allocator) void {
        switch (this) {
            // All variants share the same nesting and differ only in the
            // code unit type, so one generic prong covers each of them.
            inline else => |table| {
                for (table) |row| {
                    for (row) |value_opt| {
                        if (value_opt) |value| gpa.free(value);
                    }
                    gpa.free(row);
                }
                gpa.free(table);
            },
        }
    }
};
|
||||
|
||||
/// States of the character-level state machine used by `parseAlloc`.
const ParseState = enum {
    /// Outside of any quoted string or comment.
    default,
    /// Inside a quoted string.
    string,
    /// Directly after a `"` that was read while inside a quoted string.
    string_double_quote,
    /// Directly after `"/` inside a quoted string; the next character must be `"`.
    string_line_break_escape,
    /// After `#`, skipping characters up to the next line feed.
    comment,
};
|
||||
|
||||
/// Encodes `codepoint` as UTF-8 and appends the bytes to `value_buf`.
fn appendCodepointUtf8(value_buf: *std.ArrayList(u8), codepoint: u21) !void {
    var encoded: [4]u8 = undefined;
    const len = try std.unicode.utf8Encode(codepoint, &encoded);
    try value_buf.appendSlice(encoded[0..len]);
}

/// Moves the bytes collected in `value_buf` into `line_buf` as one finished value.
fn finishValueUtf8(line_buf: *std.ArrayList(?[]u8), value_buf: *std.ArrayList(u8)) !void {
    // Reserve the slot first so the owned slice can never be orphaned.
    try line_buf.ensureUnusedCapacity(1);
    line_buf.appendAssumeCapacity(try value_buf.toOwnedSlice());
}

/// Ends the current row: stores a pending non-empty value, then moves the row into `table`.
fn finishLineUtf8(
    table: *std.ArrayList([]?[]u8),
    line_buf: *std.ArrayList(?[]u8),
    value_buf: *std.ArrayList(u8),
) !void {
    try table.ensureUnusedCapacity(1);
    if (value_buf.items.len > 0) try finishValueUtf8(line_buf, value_buf);
    table.appendAssumeCapacity(try line_buf.toOwnedSlice());
}

/// Parses `contents_any` into a table of rows and values allocated with `gpa`.
///
/// The caller owns the result and releases it with `Table.free(gpa)`.
/// Only UTF-8 input is handled; other detected encodings yield
/// `error.Unimplemented`. On any error, everything allocated so far is freed.
///
/// Errors (besides allocation failure and those of `reliabletxt.parse` and
/// `std.unicode.Utf8View.init`):
/// - `error.StringNotClosed` when a line feed occurs inside a quoted string.
/// - `error.InvalidStringLineBreak` when `"/` is not followed by `"`.
///
/// NOTE(review): an empty quoted string (`""`) directly before a line feed,
/// a `#`, or the end of input is not stored as a value, while one followed by
/// a space or tab is. Confirm whether that asymmetry is intended.
pub fn parseAlloc(gpa: std.mem.Allocator, contents_any: []const u8) !Table {
    switch (try reliabletxt.parse(contents_any)) {
        .utf8 => |contents_utf8| {
            var table = std.ArrayList([]?[]u8).init(gpa);
            defer table.deinit();
            // Rows already moved into `table` belong to us until the final
            // `toOwnedSlice` succeeds; release them if we bail out early.
            errdefer {
                for (table.items) |row| {
                    for (row) |value_opt| {
                        if (value_opt) |value| gpa.free(value);
                    }
                    gpa.free(row);
                }
            }

            const utf8_view = try std.unicode.Utf8View.init(contents_utf8);
            var utf8_iter = utf8_view.iterator();

            var line_buf = std.ArrayList(?[]u8).init(gpa);
            defer line_buf.deinit();
            // Likewise for the finished values of the row being built.
            errdefer {
                for (line_buf.items) |value_opt| {
                    if (value_opt) |value| gpa.free(value);
                }
            }

            var value_buf = std.ArrayList(u8).init(gpa);
            defer value_buf.deinit();

            var state = ParseState.default;
            while (utf8_iter.nextCodepoint()) |codepoint| {
                switch (state) {
                    .default => switch (codepoint) {
                        '\n' => try finishLineUtf8(&table, &line_buf, &value_buf),
                        '"' => state = .string,
                        ' ', '\t' => {
                            if (value_buf.items.len > 0) try finishValueUtf8(&line_buf, &value_buf);
                        },
                        '#' => {
                            // The row ends where the comment starts.
                            try finishLineUtf8(&table, &line_buf, &value_buf);
                            state = .comment;
                        },
                        else => try appendCodepointUtf8(&value_buf, codepoint),
                    },
                    .string => switch (codepoint) {
                        // TODO: diagnostic: string not closed
                        '\n' => return error.StringNotClosed,
                        '"' => state = .string_double_quote,
                        else => try appendCodepointUtf8(&value_buf, codepoint),
                    },
                    .string_double_quote => switch (codepoint) {
                        // `""` inside a string is an escaped double quote.
                        '"' => {
                            try value_buf.append('"');
                            state = .string;
                        },
                        '/' => state = .string_line_break_escape,
                        '\n' => {
                            try finishLineUtf8(&table, &line_buf, &value_buf);
                            // The string is closed; the next line must start
                            // in the default state, otherwise a leading `"`
                            // there would be read as an escaped quote.
                            state = .default;
                        },
                        '#' => {
                            try finishLineUtf8(&table, &line_buf, &value_buf);
                            state = .comment;
                        },
                        ' ', '\t' => {
                            // Stored unconditionally, so an empty string
                            // followed by whitespace is kept as a value.
                            try finishValueUtf8(&line_buf, &value_buf);
                            state = .default;
                        },
                        else => try appendCodepointUtf8(&value_buf, codepoint),
                    },
                    .string_line_break_escape => switch (codepoint) {
                        // `"/"` inside a string is an escaped line feed.
                        '"' => {
                            try value_buf.append('\n');
                            state = .string;
                        },
                        // TODO: diagnostic: invalid string line break
                        else => return error.InvalidStringLineBreak,
                    },
                    .comment => switch (codepoint) {
                        '\n' => state = .default,
                        else => {},
                    },
                }
            }

            // The input does not have to end with a line feed: whatever was
            // collected since the last one forms the final row.
            try finishLineUtf8(&table, &line_buf, &value_buf);

            const utf8_table = try table.toOwnedSlice();
            return .{ .utf8 = utf8_table };
        },
        else => return error.Unimplemented,
    }
}
|
||||
|
||||
/// Test helper: compares two tables of optional UTF-8 values cell by cell.
///
/// Every mismatch is printed with `std.debug.print`. Returns
/// `error.TestExpectedEqual` when the row counts differ, when any row has a
/// different number of values, or when any cell differs. Two null cells are
/// considered equal.
fn expectEqualUTF8Tables(expected_table: []const []const ?[]const u8, actual_table: []const []const ?[]const u8) !void {
    if (expected_table.len != actual_table.len) {
        std.debug.print("Expected table to have {} rows, found {} rows\n", .{ expected_table.len, actual_table.len });
        return error.TestExpectedEqual;
    }

    var is_errors = false;
    for (expected_table, actual_table, 0..) |expected_row, actual_row, row| {
        // A lockstep `for` over slices of different lengths is illegal, so
        // report the mismatch and move on to the next row.
        if (expected_row.len != actual_row.len) {
            std.debug.print("at row {}: expected {} values, found {} values\n", .{ row, expected_row.len, actual_row.len });
            is_errors = true;
            continue;
        }

        for (expected_row, actual_row, 0..) |expected_opt, actual_opt, col| {
            if (expected_opt) |expected_value| {
                if (actual_opt) |actual_value| {
                    if (!std.mem.eql(u8, expected_value, actual_value)) {
                        std.debug.print(
                            \\at row {}, column {}
                            \\ expected "{}"
                            \\ found "{}"
                            \\
                        , .{ row, col, std.zig.fmtEscapes(expected_value), std.zig.fmtEscapes(actual_value) });
                        is_errors = true;
                    }
                } else {
                    std.debug.print(
                        \\at row {}, column {}
                        \\ expected "{}"
                        \\ found null
                        \\
                    , .{ row, col, std.zig.fmtEscapes(expected_value) });
                    is_errors = true;
                }
            } else if (actual_opt) |actual_value| {
                std.debug.print(
                    \\at row {}, column {}
                    \\ expected null
                    \\ found "{}"
                    \\
                , .{ row, col, std.zig.fmtEscapes(actual_value) });
                is_errors = true;
            }
            // Both null: equal, nothing to report.
        }
    }

    if (is_errors) {
        return error.TestExpectedEqual;
    }
}
|
||||
|
||||
test parseAlloc {
    // Expected content of the example file, one inner slice per row.
    const expected_rows: []const []const ?[]const u8 = &.{
        &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" },
        &.{ "~", "U+007E", "7E", "007E", "Tilde" },
        &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" },
        &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" },
        &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" },
        &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" },
        &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" },
        &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" },
        &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" },
        &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" },
        &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" },
        &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" },
        &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" },
        &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" },
    };

    const parsed = try parseAlloc(testing.allocator, @embedFile("./testdata/Example01_Table_UTF8.txt"));
    defer parsed.free(testing.allocator);

    // The example file is UTF-8, so the UTF-8 variant must be active.
    try testing.expectEqual(reliabletxt.Encoding.utf8, @as(reliabletxt.Encoding, parsed));

    try expectEqualUTF8Tables(expected_rows, parsed.utf8);
}
|
||||
|
||||
/// Decodes a quoted string into `buffer` and returns the written prefix.
///
/// `encoded_string` must include its surrounding double quotes. Inside them,
/// `""` decodes to a single `"` and `"/"` decodes to a line feed.
///
/// Errors:
/// - `error.InvalidFormat` when the surrounding quotes are missing, when the
///   content holds a raw line feed, a lone `"`, or a malformed `"/"` escape,
///   or when the content ends in the middle of an escape.
/// - `error.OutOfMemory` when `buffer` is too small for the decoded text.
pub fn decodeString(encoded_string: []const u8, buffer: []u8) ![]const u8 {
    const State = enum {
        default,
        double_quote,
        double_quote_slash,
    };
    // Two bytes are the minimum: the opening and the closing quote must be
    // distinct, otherwise the content slice below would be `[1..0]`.
    if (encoded_string.len < 2 or encoded_string[0] != '"' or encoded_string[encoded_string.len - 1] != '"') return error.InvalidFormat;

    var state = State.default;
    var write_pos: usize = 0;
    for (encoded_string[1 .. encoded_string.len - 1]) |encoded_character| {
        // Each prong either yields the byte to emit, moves on to the next
        // input byte without emitting, or rejects the input.
        const decoded: u8 = switch (state) {
            .default => switch (encoded_character) {
                '\n' => return error.InvalidFormat,
                '"' => {
                    state = .double_quote;
                    continue;
                },
                else => encoded_character,
            },
            .double_quote => switch (encoded_character) {
                '"' => '"',
                '/' => {
                    state = .double_quote_slash;
                    continue;
                },
                else => return error.InvalidFormat,
            },
            .double_quote_slash => switch (encoded_character) {
                '"' => '\n',
                else => return error.InvalidFormat,
            },
        };

        if (write_pos >= buffer.len) return error.OutOfMemory;
        buffer[write_pos] = decoded;
        write_pos += 1;
        // Every emitting transition ends in the default state.
        state = .default;
    }

    // A dangling `"` or `"/` right before the closing quote is an
    // unfinished escape sequence.
    if (state != .default) return error.InvalidFormat;
    return buffer[0..write_pos];
}
|
||||
|
||||
test decodeString {
    const Case = struct { encoded: []const u8, decoded: []const u8 };

    // Empty string, plain text, escaped double quotes, escaped line feed.
    const cases = [_]Case{
        .{ .encoded = "\"\"", .decoded = "" },
        .{ .encoded = "\"Latin Small Letter A\"", .decoded = "Latin Small Letter A" },
        .{
            .encoded = "\"See these \"\"quotes\"\" I'm making with my claw hands? It means I don't belive you.\"",
            .decoded = "See these \"quotes\" I'm making with my claw hands? It means I don't belive you.",
        },
        .{ .encoded = "\"Line 1\"/\"Line 2\"", .decoded = "Line 1\nLine 2" },
    };

    var buffer: [128]u8 = undefined;
    for (cases) |case| {
        try testing.expectEqualStrings(case.decoded, try decodeString(case.encoded, &buffer));
    }
}
|
||||
|
||||
/// Creates an iterator over the items of `contents_any` without allocating.
///
/// The iterator borrows from `contents_any`. Only UTF-8 input is handled;
/// other detected encodings yield `error.Unimplemented`. Errors of
/// `reliabletxt.parse` and `std.unicode.Utf8View.init` are passed through.
pub fn parseIter(contents_any: []const u8) !Iterator {
    const file = try reliabletxt.parse(contents_any);

    // Guard: everything except UTF-8 is not supported yet.
    if (file != .utf8) return error.Unimplemented;

    const view = try std.unicode.Utf8View.init(file.utf8);
    const inner = Utf8Iterator{ .utf8_iter = view.iterator() };
    return Iterator{ .utf8 = inner };
}
|
||||
|
||||
/// The iterator returned by `parseIter`; the active field names the
/// encoding-specific iterator in use.
pub const Iterator = union(enum) {
    /// Iterator over UTF-8 input.
    utf8: Utf8Iterator,
    // NOTE(review): presumably a placeholder for encodings that are not
    // implemented yet; confirm the intent of this `_` member.
    _,
};
|
||||
|
||||
/// Iterator over the items (values, quoted strings, line ends) of UTF-8
/// text. It never allocates: returned slices point into the iterated bytes.
pub const Utf8Iterator = struct {
    /// Codepoint iterator over the text; its position is the iterator state.
    utf8_iter: std.unicode.Utf8Iterator,

    pub const Item = union(enum) {
        /// End of a row.
        newline,
        /// A value not surrounded by quotes. Can't include any whitespace.
        value: []const u8,
        /// A value surrounded by quotes. May include escaped double quotes or escaped newlines.
        /// The slice includes the surrounding quotes and is still encoded.
        string: []const u8,
        // NOTE(review): `next` never produces this item; confirm how null
        // values are meant to be reported.
        null,
    };

    const ParseState = enum { default, value, string, string_double_quote, string_line_break_escape, comment };

    /// Returns the next item, or null once the input is exhausted.
    ///
    /// A value or quoted string that runs up to the end of the input is
    /// returned like any other. Every line feed outside of a string yields
    /// `Item.newline`, including the one that ends a comment.
    ///
    /// Errors:
    /// - `error.DoubleQuoteInValue` when `"` occurs inside an unquoted value.
    /// - `error.StringNotClosed` when a quoted string contains a line feed
    ///   or is still open at the end of the input.
    /// - `error.InvalidStringLineBreak` when `"/` is not followed by `"`.
    pub fn next(this: *@This()) !?Item {
        var state = Utf8Iterator.ParseState.default;
        var value_start: usize = this.utf8_iter.i;
        while (this.utf8_iter.nextCodepoint()) |codepoint| {
            switch (state) {
                .default => switch (codepoint) {
                    '\n' => return Item.newline,
                    '"' => state = .string,

                    // Skip leading whitespace: the item starts after it.
                    ' ',
                    '\t',
                    => value_start = this.utf8_iter.i,

                    '#' => state = .comment,
                    else => state = .value,
                },
                .value => switch (codepoint) {
                    // TODO: Add other whitespace characters
                    '\n',
                    '#',
                    ' ',
                    '\t',
                    => {
                        // Roll back so the terminator is handled by the next call.
                        this.utf8_iter.i -= std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
                        return Item{ .value = this.utf8_iter.bytes[value_start..this.utf8_iter.i] };
                    },

                    '"' => return error.DoubleQuoteInValue,

                    else => {},
                },
                .string => switch (codepoint) {
                    '\n' => {
                        // TODO: diagnostic: string not closed
                        return error.StringNotClosed;
                    },
                    '"' => state = .string_double_quote,
                    else => {},
                },
                .string_double_quote => switch (codepoint) {
                    '"' => state = .string,
                    '/' => state = .string_line_break_escape,

                    // TODO: Add other whitespace characters
                    '\n',
                    '#',
                    ' ',
                    '\t',
                    => {
                        // we roll back here so it can be handled in the next iteration of the loop
                        this.utf8_iter.i -= std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
                        return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] };
                    },

                    else => {},
                },
                .string_line_break_escape => switch (codepoint) {
                    '"' => state = .string,
                    else => {
                        // TODO: diagnostic: invalid string line break
                        return error.InvalidStringLineBreak;
                    },
                },
                .comment => switch (codepoint) {
                    // The line feed that ends a comment also ends the row.
                    '\n' => return Item.newline,
                    else => {},
                },
            }
        }

        // End of input: emit whatever item was still in progress instead of
        // silently dropping it.
        switch (state) {
            .default, .comment => return null,
            .value => return Item{ .value = this.utf8_iter.bytes[value_start..this.utf8_iter.i] },
            .string_double_quote => return Item{ .string = this.utf8_iter.bytes[value_start..this.utf8_iter.i] },
            .string => return error.StringNotClosed,
            .string_line_break_escape => return error.InvalidStringLineBreak,
        }
    }
};
|
||||
|
||||
/// Test helper: walks `actual_table` and compares each produced item with
/// the cell of `expected_table` at the current row and column.
///
/// Quoted strings are decoded with `decodeString` before the comparison.
/// Every mismatch is printed with `std.debug.print`; at the end
/// `error.TestExpectedEqual` is returned if anything differed. Errors from
/// the iterator and from `decodeString` are passed through.
///
/// NOTE(review): only items that the iterator actually produces are checked;
/// expected cells the iterator never reaches are not reported as missing.
fn expectEqualUTF8TablesIter(expected_table: []const []const ?[]const u8, actual_table: Utf8Iterator) !void {
    var actual_table_iter = actual_table;

    var is_errors = false;
    var expected_row_index: usize = 0;
    var expected_value_index: usize = 0;
    var decode_buf: [128]u8 = undefined;
    while (try actual_table_iter.next()) |actual_parse_event| {
        // Normalize the event to an optional string; row ends only move
        // the cursor.
        const actual_opt: ?[]const u8 = switch (actual_parse_event) {
            .newline => {
                expected_row_index += 1;
                expected_value_index = 0;
                continue;
            },
            .value => |actual_value_str| actual_value_str,
            .string => |actual_string_encoded| try decodeString(actual_string_encoded, &decode_buf),
            .null => null,
        };

        // Bounds are checked before indexing so surplus rows or values are
        // reported instead of triggering an out-of-bounds access.
        if (expected_row_index >= expected_table.len) {
            std.debug.print("Expected table to have at most {} rows, found more rows\n", .{expected_table.len});
            is_errors = true;
            break;
        }
        const expected_row = expected_table[expected_row_index];
        if (expected_value_index >= expected_row.len) {
            std.debug.print("at row {}: expected at most {} values, found more values\n", .{ expected_row_index, expected_row.len });
            is_errors = true;
            expected_value_index += 1;
            continue;
        }

        const expected_opt = expected_row[expected_value_index];
        if (expected_opt) |expected_value| {
            if (actual_opt) |actual_value_str| {
                if (!std.mem.eql(u8, expected_value, actual_value_str)) {
                    std.debug.print(
                        \\at row {}, column {}
                        \\ expected "{}"
                        \\ found "{}"
                        \\
                    , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected_value), std.zig.fmtEscapes(actual_value_str) });
                    is_errors = true;
                }
            } else {
                std.debug.print(
                    \\at row {}, column {}
                    \\ expected "{}"
                    \\ found null
                    \\
                , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(expected_value) });
                is_errors = true;
            }
        } else if (actual_opt) |actual_value_str| {
            std.debug.print(
                \\at row {}, column {}
                \\ expected null
                \\ found "{}"
                \\
            , .{ expected_row_index, expected_value_index, std.zig.fmtEscapes(actual_value_str) });
            is_errors = true;
        }

        // Every item, null included, occupies one column.
        expected_value_index += 1;
    }

    if (is_errors) {
        return error.TestExpectedEqual;
    }
}
|
||||
|
||||
test parseIter {
    // Expected content of the example file, one inner slice per row.
    const expected_rows: []const []const ?[]const u8 = &.{
        &.{ "a", "U+0061", "61", "0061", "Latin Small Letter A" },
        &.{ "~", "U+007E", "7E", "007E", "Tilde" },
        &.{ "¥", "U+00A5", "C2_A5", "00A5", "Yen Sign" },
        &.{ "»", "U+00BB", "C2_BB", "00BB", "Right-Pointing Double Angle Quotation Mark" },
        &.{ "½", "U+00BD", "C2_BD", "00BD", "Vulgar Fraction One Half" },
        &.{ "¿", "U+00BF", "C2_BF", "00BF", "Inverted Question Mark" },
        &.{ "ß", "U+00DF", "C3_9F", "00DF", "Latin Small Letter Sharp S" },
        &.{ "ä", "U+00E4", "C3_A4", "00E4", "Latin Small Letter A with Diaeresis" },
        &.{ "ï", "U+00EF", "C3_AF", "00EF", "Latin Small Letter I with Diaeresis" },
        &.{ "œ", "U+0153", "C5_93", "0153", "Latin Small Ligature Oe" },
        &.{ "€", "U+20AC", "E2_82_AC", "20AC", "Euro Sign" },
        &.{ "東", "U+6771", "E6_9D_B1", "6771", "CJK Unified Ideograph-6771" },
        &.{ "𝄞", "U+1D11E", "F0_9D_84_9E", "D834_DD1E", "Musical Symbol G Clef" },
        &.{ "𠀇", "U+20007", "F0_A0_80_87", "D840_DC07", "CJK Unified Ideograph-20007" },
    };

    // The example file is UTF-8, so the UTF-8 iterator is the active one.
    const items = try parseIter(@embedFile("./testdata/Example01_Table_UTF8.txt"));

    try expectEqualUTF8TablesIter(expected_rows, items.utf8);
}
|
||||
|
||||
const reliabletxt = @import("./reliabletxt.zig");
|
||||
const testing = std.testing;
|
||||
const std = @import("std");
|
Loading…
Reference in New Issue