SortTags deduplicates, uses slices.SortStable

golang.org/x/exp/slices has comparable allocation-free sorting, which we can use while reducing our code. Neither Datadog nor Prometheus appear to retain or handle multiple tags with the same name on the same metric, so this package deduplicates at time of sort (latest tag wins). If any metric systems we use do tolerate and provide useful behavior when provided duplicate tag/label names, and the practice is justified, that would be a reason to revert or adjust this deduplication behavior.
segmentio · Jul 16, 2023 · d65d2b2 · d65d2b2
1 parent d11a18e
commit d65d2b2
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 33 deletions.
diff --git a/engine.go b/engine.go
@@ -68,7 +68,7 @@ func (eng *Engine) WithPrefix(prefix string, tags ...Tag) *Engine {
 	return &Engine{
 		Handler: eng.Handler,
 		Prefix:  eng.makeName(prefix),
-		Tags:    eng.makeTags(tags),
+		Tags:    mergeTags(eng.Tags, tags),
 	}
 }
 
@@ -170,10 +170,6 @@ func (eng *Engine) makeName(name string) string {
 	return concat(eng.Prefix, name)
 }
 
-func (eng *Engine) makeTags(tags []Tag) []Tag {
-	return SortTags(concatTags(eng.Tags, tags))
-}
-
 var measureArrayPool = sync.Pool{
 	New: func() interface{} { return new([1]Measure) },
 }

diff --git a/tag.go b/tag.go
@@ -47,21 +47,54 @@ func SortTags(tags []Tag) []Tag {
 	// For 20 or fewer tags, this is as fast as an unstable sort.
 	slices.SortStableFunc(tags, tagIsLess)
 
-	return tags
+	return deduplicateTags(tags)
 }
 
 func tagIsLess(a, b Tag) bool { return a.Name < b.Name }
 
+func deduplicateTags(tags []Tag) []Tag {
+	var prev string
+	out := tags[:0]
+
+	for _, tag := range tags {
+		switch {
+		case tag.Name == "":
+			// Ignore unnamed tags.
+			continue
+
+		case tag.Name != prev:
+			// Non-duplicate tag: keep.
+			prev = tag.Name
+			out = append(out, tag)
+
+		default:
+			// Duplicate tag: replace previous, same-named tag.
+			i := len(out) - 1
+			out[i] = tag
+		}
+	}
+
+	if len(out) == 0 {
+		// No input tags had non-empty names:
+		// return nil to be consistent for ease of testing.
+		return nil
+	}
 
-func concatTags(t1 []Tag, t2 []Tag) []Tag {
+	return out
+}
+
+// mergeTags returns the sorted, deduplicated-by-name union of t1 and t2.
+func mergeTags(t1, t2 []Tag) []Tag {
 	n := len(t1) + len(t2)
 	if n == 0 {
 		return nil
 	}
-	t3 := make([]Tag, 0, n)
-	t3 = append(t3, t1...)
-	t3 = append(t3, t2...)
-	return t3
+
+	out := make([]Tag, 0, n)
+	out = append(out, t1...)
+	out = append(out, t2...)
+
+	return SortTags(out)
 }
 
 func copyTags(tags []Tag) []Tag {

diff --git a/tag_test.go b/tag_test.go
@@ -32,7 +32,8 @@ func Test_copyTags(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run("", func(t *testing.T) {
-			if tags := copyTags(test.t1); !reflect.DeepEqual(tags, test.t2) {
+			tags := copyTags(test.t1)
+			if !reflect.DeepEqual(tags, test.t2) {
 				t.Errorf("copyTags => %#v != %#v", tags, test.t2)
 			}
 		})
@@ -43,41 +44,58 @@ func Test_mergeTags(t *testing.T) {
 	t.Parallel()
 
 	tests := []struct {
-		t1 []Tag
-		t2 []Tag
-		t3 []Tag
+		name       string
+		t1, t2, t3 []Tag
 	}{
 		{
-			t1: nil,
-			t2: nil,
-			t3: nil,
+			name: "nil_inputs",
+			t1:   nil,
+			t2:   nil,
+			t3:   nil,
 		},
 		{
-			t1: []Tag{},
-			t2: []Tag{},
-			t3: nil,
+			name: "empty_inputs",
+			t1:   []Tag{},
+			t2:   []Tag{},
+			t3:   nil,
 		},
 		{
-			t1: []Tag{{"A", "1"}},
-			t2: nil,
-			t3: []Tag{{"A", "1"}},
+			name: "second_empty_input",
+			t1:   []Tag{{"A", "1"}},
+			t2:   nil,
+			t3:   []Tag{{"A", "1"}},
 		},
 		{
-			t1: nil,
-			t2: []Tag{{"B", "2"}},
-			t3: []Tag{{"B", "2"}},
+			name: "first_empty_input",
+			t1:   nil,
+			t2:   []Tag{{"B", "2"}},
+			t3:   []Tag{{"B", "2"}},
+		},
+		{
+			name: "non_duplicated_inputs",
+			t1:   []Tag{{"A", "1"}},
+			t2:   []Tag{{"B", "2"}},
+			t3:   []Tag{{"A", "1"}, {"B", "2"}},
 		},
 		{
-			t1: []Tag{{"A", "1"}},
-			t2: []Tag{{"B", "2"}},
-			t3: []Tag{{"A", "1"}, {"B", "2"}},
+			name: "cross_duplicated_inputs",
+			t1:   []Tag{{"A", "1"}},
+			t2:   []Tag{{"A", "2"}},
+			t3:   []Tag{{"A", "2"}},
+		},
+		{
+			name: "self_duplicated_input",
+			t1:   []Tag{{"A", "2"}, {"A", "1"}},
+			t2:   nil,
+			t3:   []Tag{{"A", "1"}},
 		},
 	}
 
 	for _, test := range tests {
-		t.Run("", func(t *testing.T) {
-			if tags := concatTags(test.t1, test.t2); !reflect.DeepEqual(tags, test.t3) {
-				t.Errorf("concatTags => %#v != %#v", tags, test.t3)
+		t.Run(test.name, func(t *testing.T) {
+			tags := mergeTags(test.t1, test.t2)
+			if !reflect.DeepEqual(tags, test.t3) {
+				t.Errorf("mergeTags => %v != %v", tags, test.t3)
 			}
 		})
 	}
@@ -329,3 +347,44 @@ func Benchmark_tagsBuffer_sort_unsorted(b *testing.B) {
 		buf.sort()
 	}
 }
+
+func Benchmark_mergeTags(b *testing.B) {
+	b.ReportAllocs()
+
+	origT1 := []Tag{
+		{"hello", "world"},
+		{"answer", "42"},
+		{"some long tag name", "!"},
+		{"some longer tag name", "1234"},
+		{"A", ""},
+		{"B", ""},
+		{"C", ""},
+		{"hello", "world"},
+		{"answer", "42"},
+		{"some long tag name", "!"},
+	}
+
+	origT2 := []Tag{
+		{"some longer tag name", "1234"},
+		{"A", ""},
+		{"B", ""},
+		{"C", ""},
+		{"hello", "world"},
+		{"answer", "42"},
+		{"some long tag name", "!"},
+		{"some longer tag name", "1234"},
+		{"A", ""},
+		{"B", ""},
+		{"C", ""},
+	}
+
+	t1 := make([]Tag, len(origT1))
+	t2 := make([]Tag, len(origT2))
+
+	for i := 0; i < b.N; i++ {
+		copy(t1, origT1)
+		copy(t2, origT2)
+
+		_ = mergeTags(t1, t2)
+	}
+}