beacon-biosignals · jrevels · Aug 11, 2021 · Aug 10, 2021 · Aug 10, 2021 · Aug 10, 2021
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Onda"
 uuid = "e853f5be-6863-11e9-128d-476edb89bfb5"
 authors = ["Beacon Biosignals, Inc."]
-version = "0.14.0"
+version = "0.14.1"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

diff --git a/README.md b/README.md
@@ -84,11 +84,12 @@ The following sections provide [the version integer](https://beacon-biosignals.g
 - `span` (`Struct`): The signal's time span within the recording. This structure has two fields:
     - `start` (`Duration` w/ `NANOSECOND` unit): The start offset in nanoseconds from the beginning of the recording. The minimum possible value is `0`.
     - `stop` (`Duration` w/ `NANOSECOND` unit): The stop offset in nanoseconds (exclusive) from the beginning of the recording. This value must be greater than `start`.
-- `kind` (`Utf8`): A string identifying the kind of signal that the row represents. Valid `kind` values are alphanumeric, lowercase, `snake_case`, and contain no whitespace, punctuation, or leading/trailing underscores.
+- `kind` (`Utf8`): A string identifying the kind of signal that the row represents. Valid `kind` values are alphanumeric, nonempty, lowercase, `snake_case`, and contain no whitespace, punctuation, or leading/trailing underscores.
 - `channels` (`List` of `Utf8`): A list of strings where the `i`th element is the name of the signal's `i`th channel. A valid channel name...
-    - ...conforms to the same format as `kind` (alphanumeric, lowercase, `snake_case`, and contain no whitespace, punctuation, or leading/trailing underscores).
+    - ...conforms to the same format as `kind` (alphanumeric, nonempty, lowercase, `snake_case`, and contains no whitespace, punctuation, or leading/trailing underscores).
     - ...conforms to an `a-b` format where `a` and `b` are valid channel names. Furthermore, to allow arbitrary cross-signal referencing, `a` and/or `b` may be channel names from other signals contained in the recording. If this is the case, such a name must be qualified in the format `signal_name.channel_name`. For example, an `eog` signal might have a channel named `left-eeg.m1` (the left eye electrode referenced to the mastoid electrode from a 10-20 EEG signal).
-- `sample_unit` (`Utf8`): The name of the signal's canonical unit as a string. This string should conform to the same format as `kind` (alphanumeric, lowercase, `snake_case`, and contain no whitespace, punctuation, or leading/trailing underscores), should be singular and not contain abbreviations (e.g. `"uV"` is bad, `"microvolt"` is good; `"l/m"` is bad, `"liter_per_minute"` is good).
+    - ...is unique amongst the other channel names in the signal. In other words, duplicate channel names within the same signal are disallowed.
+- `sample_unit` (`Utf8`): The name of the signal's canonical unit as a string. This string should conform to the same format as `kind` (alphanumeric, nonempty, lowercase, `snake_case`, and contain no whitespace, punctuation, or leading/trailing underscores), should be singular and not contain abbreviations (e.g. `"uV"` is bad, `"microvolt"` is good; `"l/m"` is bad, `"liter_per_minute"` is good).
 - `sample_resolution_in_unit` (`Int` or `FloatingPoint`): The signal's resolution in its canonical unit. This value, along with the signal's `sample_type` and `sample_offset_in_unit` fields, determines the signal's LPCM quantization scheme.
 - `sample_offset_in_unit`  (`Int` or `FloatingPoint`): The signal's zero-offset in its canonical unit (thus allowing LPCM encodings that are centered around non-zero values).
 - `sample_type` (`Utf8`): The primitive scalar type used to encode each sample in the signal. Valid values are:

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -24,6 +24,7 @@ Onda.read_byte_range
 ```@docs
 Annotation
 write_annotations
+validate_annotations
 merge_overlapping_annotations
 ```
 
@@ -33,6 +34,7 @@ merge_overlapping_annotations
 Signal
 SamplesInfo
 write_signals
+validate_signals
 channel(x, name)
 channel(x, i::Integer)
 channel_count(x)

diff --git a/src/Onda.jl b/src/Onda.jl
@@ -10,10 +10,11 @@ using Legolas: @row
 include("utilities.jl")
 
 include("annotations.jl")
-export Annotation, write_annotations, merge_overlapping_annotations
+export Annotation, write_annotations, validate_annotations, merge_overlapping_annotations
 
 include("signals.jl")
-export Signal, SamplesInfo, write_signals, channel, channel_count, sample_count, sizeof_samples, sample_type
+export Signal, SamplesInfo, write_signals, validate_signals,
+       channel, channel_count, sample_count, sizeof_samples, sample_type
 
 include("serialization.jl")
 export AbstractLPCMFormat, AbstractLPCMStream, LPCMFormat, LPCMZstFormat,

diff --git a/src/annotations.jl b/src/annotations.jl
@@ -30,6 +30,20 @@ Invoke/return `Legolas.write(path_or_io, annotations, Schema("onda.annotation@1"
 """
 write_annotations(path_or_io, annotations; kwargs...) = Legolas.write(path_or_io, annotations, Legolas.Schema("onda.annotation@1"); kwargs...)
 
+"""
+    validate_annotations(annotations)
+
+Validate `annotations`, a presumed `onda.annotation` table, at both the table level and
+the row level, then return `annotations` itself. An error is thrown in any of these cases:
+
+- `Legolas.validate(annotations, Legolas.Schema("onda.annotation@1"))` throws an error
+- `Annotation(row)` errors for any `row` in `Tables.rows(annotations)`
+- `annotations` contains rows with duplicate `id`s
+"""
+function validate_annotations(annotations)
+    return _fully_validate_legolas_table(annotations, Legolas.Schema("onda.annotation@1"), :id)
+end
+
 #####
 ##### utilities
 #####

diff --git a/src/signals.jl b/src/signals.jl
@@ -100,7 +100,7 @@ const Signal = @row("onda.signal@1" > "onda.samples-info@1",
                     file_format::AbstractString = file_format isa AbstractLPCMFormat ? file_format_string(file_format) : file_format,
                     span::Union{NamedTupleTimeSpan,TimeSpan} = TimeSpan(span),
                     kind::AbstractString = _validate_signal_kind(kind),
-                    channels::AbstractVector{<:AbstractString} = (foreach(_validate_signal_channel, channels); channels),
+                    channels::AbstractVector{<:AbstractString} = _validate_signal_channels(channels),
                     sample_unit::AbstractString = _validate_signal_sample_unit(sample_unit))
 
 function _validate_signal_kind(x)
@@ -113,8 +113,14 @@ function _validate_signal_sample_unit(x)
     return x
 end
 
+function _validate_signal_channels(x)
+    # duplicates are rejected before any individual name's format is inspected
+    allunique(x) || throw(ArgumentError("invalid signal channels (duplicate channel names are disallowed): $x"))
+    return (foreach(_validate_signal_channel, x); x)
+end
+
 function _validate_signal_channel(x)
-    is_lower_snake_case_alphanumeric(x, ('-', '.')) || throw(ArgumentError("invalid channel name (must be lowercase/snakecase/alphanumeric): $c"))
+    is_lower_snake_case_alphanumeric(x, ('-', '.')) || throw(ArgumentError("invalid channel name (must be lowercase/snakecase/alphanumeric): $x"))
     return x
 end
 
@@ -129,6 +135,20 @@ Invoke/return `Legolas.write(path_or_io, signals, Schema("onda.signal@1"); kwarg
 """
 write_signals(path_or_io, signals; kwargs...) = Legolas.write(path_or_io, signals, Legolas.Schema("onda.signal@1"); kwargs...)
 
+"""
+    validate_signals(signals)
+
+Validate `signals`, a presumed `onda.signal` table, at both the table level and the
+row level, then return `signals` itself. An error is thrown in any of these cases:
+
+- `Legolas.validate(signals, Legolas.Schema("onda.signal@1"))` throws an error
+- `Signal(row)` errors for any `row` in `Tables.rows(signals)`
+- `signals` contains rows with duplicate `file_path`s
+"""
+function validate_signals(signals)
+    return _fully_validate_legolas_table(signals, Legolas.Schema("onda.signal@1"), :file_path)
+end
+
 #####
 ##### duck-typed utilities
 #####

diff --git a/src/utilities.jl b/src/utilities.jl
@@ -5,10 +5,32 @@ const ALPHANUMERIC_SNAKE_CASE_CHARACTERS = Char['_',
                                                 'a':'z'...]
 
 function is_lower_snake_case_alphanumeric(x::AbstractString, also_allow=())
-    return !startswith(x, '_') && !endswith(x, '_') &&
+    return !isempty(x) && !startswith(x, '_') && !endswith(x, '_') &&
            all(i -> i in ALPHANUMERIC_SNAKE_CASE_CHARACTERS || i in also_allow, x)
 end
 
+# Validate `table` against `schema` as a whole, then row by row, then require that
+# `primary_key` values are unique across rows. Returns `table`; throws on any failure.
+# TODO port a generic version of this + notion of primary key to Legolas.jl
+function _fully_validate_legolas_table(table, schema::Legolas.Schema, primary_key)
+    Legolas.validate(table, schema)
+    primary_counts = Dict{Any,Int}()
+    for (i, raw) in enumerate(Tables.rows(table))
+        checked = try
+            Legolas.Row(schema, raw)
+        catch err  # NOTE(review): `log` is presumably a package-local helper, not `Base.log` — confirm
+            log("Encountered invalid row $i when validating table's compliance with $schema:")
+            rethrow(err)
+        end
+        key = Tables.getcolumn(checked, primary_key)
+        primary_counts[key] = get(primary_counts, key, 0) + 1
+    end
+    # drop every key seen exactly once; whatever remains is a uniqueness violation
+    filter!(entry -> last(entry) > 1, primary_counts)
+    isempty(primary_counts) && return table
+    throw(ArgumentError("duplicate $primary_key values found in given $schema table: $primary_counts"))
+end
+
 #####
 ##### arrrrr i'm a pirate
 #####

diff --git a/test/annotations.jl b/test/annotations.jl
@@ -28,6 +28,16 @@ end
     test_annotation_row(uuid4(), uuid4(), TimeSpan(Nanosecond(1), Nanosecond(100)); custom...)
 end
 
+@testset "`onda.annotation` validation" begin  # valid, duplicate-`id`, and malformed inputs to `validate_annotations`
+    template = (recording=uuid4(), id=uuid4(), span=TimeSpan(0, 1), custom=1234)
+    @test Annotation(template) isa Annotation
+    good = [template, Tables.rowmerge(template; id=uuid4()), Tables.rowmerge(template; id=uuid4())]  # rows differ only in `id`
+    @test validate_annotations(good) === good  # the input object itself is returned
+    @test_throws ArgumentError validate_annotations(vcat(good, template))  # `template` now appears twice => repeated `id`
+    @test_throws ArgumentError validate_annotations([template, template, template])  # every row shares one `id`
+    @test_throws ArgumentError validate_annotations((x=[1, 2, 3], y=["lol", "bad", "table"]))  # columns unrelated to the annotation rows above
+end
+
 @testset "`merge_overlapping_annotations`" begin
     recs = (uuid4(), uuid4(), uuid4())
     sources = [#= 1 =#  Annotation(recording=recs[1], id=uuid4(), span=TimeSpan(0, 100)),

diff --git a/test/signals.jl b/test/signals.jl
@@ -83,3 +83,30 @@ end
     test_signal_row(uuid4(), "/file/path", LPCMZstFormat(LPCMFormat(3, UInt16)), TimeSpan(Nanosecond(1), Nanosecond(100)),
                     "kind", ["ab", "a", "c"], "microvolt", 1.5, 0.4, UInt16, 256.3; custom...)
 end
+
+@testset "`onda.signal` validation" begin  # row-level `Signal` checks, then table-level `validate_signals` checks
+    template = (recording=uuid4(), file_path="/file/path", file_format="lpcm", span=TimeSpan(0, 1),
+                kind="x", channels=["a", "b", "c"],
+                sample_unit="microvolt", sample_rate=256, sample_resolution_in_unit=0.4,
+                sample_offset_in_unit=0.4, sample_type="uint8")
+    @test Signal(template) isa Signal
+    bad_rows = [Tables.rowmerge(template; channels = ["a", "b", "c", "a"]),  # channel "a" listed twice
+                Tables.rowmerge(template; channels = ["a", "B", "c"]),  # uppercase channel name
+                Tables.rowmerge(template; channels = ["a", "   ", "c"]),  # whitespace-only channel name
+                Tables.rowmerge(template; sample_type = "not a valid sample type"),
+                Tables.rowmerge(template; sample_type = Tuple),  # a type, where the template uses a string
+                Tables.rowmerge(template; kind = "NO"),  # uppercase kind
+                Tables.rowmerge(template; kind = "   "),  # whitespace-only kind
+                Tables.rowmerge(template; kind = ""),  # empty kind
+                Tables.rowmerge(template; sample_unit = ""),  # empty unit
+                Tables.rowmerge(template; sample_unit = "  hA HA")]  # whitespace and uppercase in unit
+    for bad_row in bad_rows  # each row must be rejected on its own by the `Signal` constructor
+        @test_throws ArgumentError Signal(bad_row)
+    end
+    good = [template, Tables.rowmerge(template; file_path="/a/b"), Tables.rowmerge(template; file_path="/c/d")]  # rows differ only in `file_path`
+    @test validate_signals(good) === good  # the input object itself is returned
+    @test_throws ArgumentError validate_signals(bad_rows)  # table made entirely of invalid rows
+    @test_throws ArgumentError validate_signals(vcat(good, bad_rows[1]))  # valid rows followed by the duplicate-channel row
+    @test_throws ArgumentError validate_signals([template, template, template])  # every row shares one `file_path`
+    @test_throws ArgumentError validate_signals((x=[1, 2, 3], y=["lol", "bad", "table"]))  # columns unrelated to the signal rows above
+end