File: step_test.go

package info (click to toggle)
golang-github-rivo-uniseg 0.4.7-1
links: PTS, VCS
area: main
in suites: experimental, forky, sid, trixie, trixie-backports, trixie-proposed-updates
size: 4,516 kB
sloc: makefile: 2
file content (530 lines) | stat: -rw-r--r-- 14,233 bytes
package uniseg

import (
	"testing"
)

// Test official Grapheme Cluster Unicode test cases for grapheme clusters using
// the [Step] function.
func TestStepBytesGrapheme(t *testing.T) {
	for testNum, testCase := range graphemeBreakTestCases {
		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
		testNum,
		strings.TrimSpace(testCase.original),
		testCase.expected,
		decomposed(testCase.original),
		[]rune(testCase.original))*/
		b := []byte(testCase.original)
		state := -1
		var (
			index int
			c     []byte
		)
	GraphemeLoop:
		for len(b) > 0 {
			c, b, _, state = Step(b, state)

			if index >= len(testCase.expected) {
				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
					testNum,
					testCase.original,
					len(testCase.expected))
				break
			}

			cluster := []rune(string(c))
			if len(cluster) != len(testCase.expected[index]) {
				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
					testNum,
					testCase.original,
					index,
					len(cluster),
					cluster,
					len(testCase.expected[index]),
					testCase.expected[index])
				break
			}
			for i, r := range cluster {
				if r != testCase.expected[index][i] {
					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
						testNum,
						testCase.original,
						index,
						cluster,
						testCase.expected[index])
					break GraphemeLoop
				}
			}

			index++
		}
		if index < len(testCase.expected) {
			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
				testNum,
				testCase.original,
				index,
				len(testCase.expected))
		}
	}
	cluster, rest, boundaries, newState := Step([]byte{}, -1)
	if len(cluster) > 0 {
		t.Errorf(`Expected cluster to be empty byte slice, got %q`, cluster)
	}
	if len(rest) > 0 {
		t.Errorf(`Expected rest to be empty byte slice, got %q`, rest)
	}
	if boundaries != 0 {
		t.Errorf(`Expected width to be 0, got %d`, boundaries)
	}
	if newState != 0 {
		t.Errorf(`Expected newState to be 0, got %d`, newState)
	}
}

// Test official word boundaries Unicode test cases for grapheme clusters using
// the [Step] function.
func TestStepBytesWord(t *testing.T) {
	for testNum, testCase := range wordBreakTestCases {
		if testNum == 1700 {
			// This test case reveals an inconsistency in the Unicode rule set,
			// namely the handling of ZWJ within two RI graphemes. (Grapheme
			// rules will restart the RI count, word rules will ignore the ZWJ.)
			// An error has been reported.
			continue
		}
		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
		testNum,
		strings.TrimSpace(testCase.original),
		testCase.expected,
		decomposed(testCase.original),
		[]rune(testCase.original))*/
		b := []byte(testCase.original)
		state := -1
		var (
			index, boundaries int
			c                 []byte
			growingCluster    []rune
		)
	GraphemeLoop:
		for len(b) > 0 {
			c, b, boundaries, state = Step(b, state)

			if index >= len(testCase.expected) {
				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
					testNum,
					testCase.original,
					len(testCase.expected))
				break
			}

			growingCluster = append(growingCluster, []rune(string(c))...)
			if boundaries&MaskWord == 0 {
				continue
			}
			cluster := growingCluster
			growingCluster = nil
			if len(cluster) != len(testCase.expected[index]) {
				t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
					testNum,
					testCase.original,
					index,
					len(cluster),
					cluster,
					len(testCase.expected[index]),
					testCase.expected[index])
				break
			}
			for i, r := range cluster {
				if r != testCase.expected[index][i] {
					t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
						testNum,
						testCase.original,
						index,
						cluster,
						testCase.expected[index])
					break GraphemeLoop
				}
			}

			index++
		}
		if index < len(testCase.expected) {
			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
				testNum,
				testCase.original,
				index,
				len(testCase.expected))
		}
	}
}

// Test official sentence boundaries Unicode test cases for grapheme clusters
// using the [Step] function.
func TestStepBytesSentence(t *testing.T) {
	for testNum, testCase := range sentenceBreakTestCases {
		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
		testNum,
		strings.TrimSpace(testCase.original),
		testCase.expected,
		decomposed(testCase.original),
		[]rune(testCase.original))*/
		b := []byte(testCase.original)
		state := -1
		var (
			index, boundaries int
			c                 []byte
			growingCluster    []rune
		)
	GraphemeLoop:
		for len(b) > 0 {
			c, b, boundaries, state = Step(b, state)

			if index >= len(testCase.expected) {
				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
					testNum,
					testCase.original,
					len(testCase.expected))
				break
			}

			growingCluster = append(growingCluster, []rune(string(c))...)
			if boundaries&MaskSentence == 0 {
				continue
			}
			cluster := growingCluster
			growingCluster = nil
			if len(cluster) != len(testCase.expected[index]) {
				t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
					testNum,
					testCase.original,
					index,
					len(cluster),
					cluster,
					len(testCase.expected[index]),
					testCase.expected[index])
				break
			}
			for i, r := range cluster {
				if r != testCase.expected[index][i] {
					t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
						testNum,
						testCase.original,
						index,
						cluster,
						testCase.expected[index])
					break GraphemeLoop
				}
			}

			index++
		}
		if index < len(testCase.expected) {
			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
				testNum,
				testCase.original,
				index,
				len(testCase.expected))
		}
	}
}

// We don't test the [Step] function for UAX #14 line breaking because the rules
// aren't really compatible. Specifically emoji modifiers and zero-width joiners
// are kept together by the grapheme cluster rules while line breaking rules
// will allow them to be broken apart. The handling of this limitation is
// outlined in Section 8.2 Example 6 of UAX #14.

// Test official Grapheme Cluster Unicode test cases for grapheme clusters using
// the StepString() function.
func TestStepStringGrapheme(t *testing.T) {
	for testNum, testCase := range graphemeBreakTestCases {
		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
		testNum,
		strings.TrimSpace(testCase.original),
		testCase.expected,
		decomposed(testCase.original),
		[]rune(testCase.original))*/
		str := testCase.original
		state := -1
		var (
			index int
			c     string
		)
	GraphemeLoop:
		for len(str) > 0 {
			c, str, _, state = StepString(str, state)

			if index >= len(testCase.expected) {
				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
					testNum,
					testCase.original,
					len(testCase.expected))
				break
			}

			cluster := []rune(c)
			if len(cluster) != len(testCase.expected[index]) {
				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
					testNum,
					testCase.original,
					index,
					len(cluster),
					cluster,
					len(testCase.expected[index]),
					testCase.expected[index])
				break
			}
			for i, r := range cluster {
				if r != testCase.expected[index][i] {
					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
						testNum,
						testCase.original,
						index,
						cluster,
						testCase.expected[index])
					break GraphemeLoop
				}
			}

			index++
		}
		if index < len(testCase.expected) {
			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
				testNum,
				testCase.original,
				index,
				len(testCase.expected))
		}
	}
	cluster, rest, boundaries, newState := StepString("", -1)
	if len(cluster) > 0 {
		t.Errorf(`Expected cluster to be empty string, got %q`, cluster)
	}
	if len(rest) > 0 {
		t.Errorf(`Expected rest to be empty string, got %q`, rest)
	}
	if boundaries != 0 {
		t.Errorf(`Expected width to be 0, got %d`, boundaries)
	}
	if newState != 0 {
		t.Errorf(`Expected newState to be 0, got %d`, newState)
	}
}

// Test official word boundaries Unicode test cases for grapheme clusters using
// the StepString() function.
func TestStepStringWord(t *testing.T) {
	for testNum, testCase := range wordBreakTestCases {
		if testNum == 1700 {
			// This test case reveals an inconsistency in the Unicode rule set,
			// namely the handling of ZWJ within two RI graphemes. (Grapheme
			// rules will restart the RI count, word rules will ignore the ZWJ.)
			// An error has been reported.
			continue
		}
		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
		testNum,
		strings.TrimSpace(testCase.original),
		testCase.expected,
		decomposed(testCase.original),
		[]rune(testCase.original))*/
		str := testCase.original
		state := -1
		var (
			index, boundaries int
			c                 string
			growingCluster    []rune
		)
	GraphemeLoop:
		for len(str) > 0 {
			c, str, boundaries, state = StepString(str, state)

			if index >= len(testCase.expected) {
				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
					testNum,
					testCase.original,
					len(testCase.expected))
				break
			}

			growingCluster = append(growingCluster, []rune(c)...)
			if boundaries&MaskWord == 0 {
				continue
			}
			cluster := growingCluster
			growingCluster = nil
			if len(cluster) != len(testCase.expected[index]) {
				t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
					testNum,
					testCase.original,
					index,
					len(cluster),
					cluster,
					len(testCase.expected[index]),
					testCase.expected[index])
				break
			}
			for i, r := range cluster {
				if r != testCase.expected[index][i] {
					t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
						testNum,
						testCase.original,
						index,
						cluster,
						testCase.expected[index])
					break GraphemeLoop
				}
			}

			index++
		}
		if index < len(testCase.expected) {
			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
				testNum,
				testCase.original,
				index,
				len(testCase.expected))
		}
	}
}

// Test official sentence boundaries Unicode test cases for grapheme clusters
// using the StepString() function.
func TestStepStringSentence(t *testing.T) {
	for testNum, testCase := range sentenceBreakTestCases {
		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
		testNum,
		strings.TrimSpace(testCase.original),
		testCase.expected,
		decomposed(testCase.original),
		[]rune(testCase.original))*/
		str := testCase.original
		state := -1
		var (
			index, boundaries int
			c                 string
			growingCluster    []rune
		)
	GraphemeLoop:
		for len(str) > 0 {
			c, str, boundaries, state = StepString(str, state)

			if index >= len(testCase.expected) {
				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
					testNum,
					testCase.original,
					len(testCase.expected))
				break
			}

			growingCluster = append(growingCluster, []rune(c)...)
			if boundaries&MaskSentence == 0 {
				continue
			}
			cluster := growingCluster
			growingCluster = nil
			if len(cluster) != len(testCase.expected[index]) {
				t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
					testNum,
					testCase.original,
					index,
					len(cluster),
					cluster,
					len(testCase.expected[index]),
					testCase.expected[index])
				break
			}
			for i, r := range cluster {
				if r != testCase.expected[index][i] {
					t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
						testNum,
						testCase.original,
						index,
						cluster,
						testCase.expected[index])
					break GraphemeLoop
				}
			}

			index++
		}
		if index < len(testCase.expected) {
			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
				testNum,
				testCase.original,
				index,
				len(testCase.expected))
		}
	}
}

// Benchmark the use of the [Step] function.
func BenchmarkStepBytes(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var c []byte
		state := -1
		str := benchmarkBytes
		for len(str) > 0 {
			c, str, _, state = Step(str, state)
			resultRunes = []rune(string(c))
		}
	}
}

// Benchmark the use of the StepString() function.
func BenchmarkStepString(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var c string
		state := -1
		str := benchmarkStr
		for len(str) > 0 {
			c, str, _, state = StepString(str, state)
			resultRunes = []rune(c)
		}
	}
}

// Fuzz the StepString function.
func FuzzStepString(f *testing.F) {
	for _, tc := range graphemeBreakTestCases {
		f.Add(tc.original)
	}
	f.Fuzz(func(t *testing.T, orig string) {
		var (
			c          string
			b          []byte
			boundaries int
		)
		str := orig
		state := -1
		for len(str) > 0 {
			c, str, boundaries, state = StepString(str, state)
			b = append(b, []byte(c)...)
		}

		// Check if the constructed string is the same as the original.
		if string(b) != orig {
			t.Errorf("Fuzzing failed: %q != %q", string(b), orig)
		}

		// For all other checks, we need to have a non-empty string.
		if orig == "" {
			return
		}

		// Check end boundaries.
		if boundaries&MaskWord == 0 {
			t.Errorf("String %q does not end on a word boundary (final boundary = %x)", orig, state)
		}
		if boundaries&MaskSentence == 0 {
			t.Errorf("String %q does not end on a sentence boundary (final boundary = %x)", orig, state)
		}
		if boundaries&MaskLine != LineMustBreak {
			t.Errorf("String %q does not end with a mandatory line break (final boundary = %x)", orig, state)
		}

		// Note: If you have ideas for more useful checks we could add here,
		// please submit them here:
		// https://github.com/rivo/uniseg/issues
	})
}