Jelajahi Sumber

refactor: simplify stringx.Replacer, and avoid potential infinite loops (#2877)

* simplify replace

* backup

* refactor: simplify stringx.Replacer

* chore: add comments and const

* chore: add more tests

* chore: rename variable
Kevin Wan 2 tahun lalu
induk
melakukan
cddf3875cf
4 mengubah file dengan 54 tambahan dan 352 penghapusan
  1. 0 98
      core/stringx/node.go
  2. 0 234
      core/stringx/node_test.go
  3. 44 20
      core/stringx/replacer.go
  4. 10 0
      core/stringx/replacer_test.go

+ 0 - 98
core/stringx/node.go

@@ -96,101 +96,3 @@ func (n *node) find(chars []rune) []scope {
 
 	return scopes
 }
-
-func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) {
-	cur := n
-	var longestMatched *node
-
-	for i := len(paths); i < len(chars); i++ {
-		char := chars[i]
-		child, ok := cur.children[char]
-		if ok {
-			cur = child
-			if cur.end {
-				longestMatched = cur
-			}
-			paths = append(paths, cur)
-		} else {
-			if longestMatched != nil {
-				return 0, longestMatched.depth, nil
-			}
-
-			if n.end {
-				return 0, n.depth, nil
-			}
-
-			// new path match
-			var jump *node
-			// old path pre longest preMatch
-			preMatch, preStart := findMatch(paths)
-			icur := cur
-			for icur.fail != nil {
-				jump, ok = icur.fail.children[char]
-				if ok {
-					break
-				}
-
-				icur = icur.fail
-			}
-
-			switch {
-			case preMatch != nil && jump != nil:
-				if jumpStart := i - jump.depth + 1; preStart < jumpStart {
-					return preStart, preMatch.depth, nil
-				} else {
-					return jumpStart, 0, append(paths[jumpStart:], jump)
-				}
-			case preMatch != nil && jump == nil:
-				return preStart, preMatch.depth, nil
-			case preMatch == nil && jump != nil:
-				return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump)
-			case preMatch == nil && jump == nil:
-				return i + 1, 0, nil
-			}
-		}
-	}
-
-	// this longest matched node
-	if longestMatched != nil {
-		return 0, longestMatched.depth, nil
-	}
-
-	if n.end {
-		return 0, n.depth, nil
-	}
-
-	match, start := findMatch(paths)
-	if match != nil {
-		return start, match.depth, nil
-	}
-
-	return len(chars), 0, nil
-}
-
-func findMatch(path []*node) (*node, int) {
-	var result *node
-	var start int
-
-	for i := len(path) - 1; i >= 0; i-- {
-		icur := path[i]
-		var cur *node
-		for icur.fail != nil {
-			if icur.fail.end {
-				cur = icur.fail
-				break
-			}
-			icur = icur.fail
-		}
-		if cur != nil {
-			if result == nil {
-				result = cur
-				start = i - result.depth + 1
-			} else if curStart := i - cur.depth + 1; curStart < start {
-				result = cur
-				start = curStart
-			}
-		}
-	}
-
-	return result, start
-}

+ 0 - 234
core/stringx/node_test.go

@@ -6,15 +6,6 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 
-func TestLongestMatchGuardedCondition(t *testing.T) {
-	n := new(node)
-	n.end = true
-	uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil)
-	assert.Equal(t, 0, uselessLen)
-	assert.Nil(t, jump)
-	assert.Equal(t, 0, matchLen)
-}
-
 func TestFuzzNodeCase1(t *testing.T) {
 	keywords := []string{
 		"cs8Zh",
@@ -202,228 +193,3 @@ func BenchmarkNodeFind(b *testing.B) {
 		trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
 	}
 }
-
-func TestNode_longestMatchCase0(t *testing.T) {
-	// match the longest word
-	keywords := []string{
-		"a",
-		"ab",
-		"abc",
-		"abcd",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil)
-	assert.Equal(t, 0, uselessLen)
-	assert.Equal(t, 3, matchLen)
-	assert.Nil(t, jump)
-}
-
-func TestNode_longestMatchCase1(t *testing.T) {
-	keywords := []string{
-		"abcde",
-		"bcde",
-		"cde",
-		"de",
-
-		"b",
-		"bc",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 2, matchLen)
-	assert.Nil(t, jump)
-}
-
-func TestNode_longestMatchCase2(t *testing.T) {
-	keywords := []string{
-		"abcde",
-		"bcde",
-		"cde",
-		"de",
-
-		"c",
-		"cd",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
-	assert.Equal(t, 2, uselessLen)
-	assert.Equal(t, 2, matchLen)
-	assert.Nil(t, jump)
-}
-
-func TestNode_longestMatchCase3(t *testing.T) {
-	keywords := []string{
-		"abcde",
-		"bcde",
-		"cde",
-		"de",
-
-		"b",
-		"bc",
-		"c",
-		"cd",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 2, matchLen)
-	assert.Nil(t, jump)
-}
-
-func TestNode_longestMatchCase4(t *testing.T) {
-	keywords := []string{
-		"abcde",
-		"bcdf",
-		"bcd",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 0, matchLen)
-	assert.Equal(t, 4, len(paths))
-}
-
-func TestNode_longestMatchCase5(t *testing.T) {
-	keywords := []string{
-		"abcdef",
-		"bcde",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 4, matchLen)
-	assert.Nil(t, paths)
-}
-
-func TestNode_longestMatchCase6(t *testing.T) {
-	keywords := []string{
-		"abcde",
-		"bc",
-		"d",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 2, matchLen)
-	assert.Nil(t, jump)
-}
-
-func TestNode_longestMatchCase7(t *testing.T) {
-	keywords := []string{
-		"abcdeg",
-		"cdef",
-		"bcde",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	word := []rune("abcdef")
-	uselessLen, matchLen, paths := trie.longestMatch(word, nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 4, matchLen)
-	assert.Nil(t, paths)
-	uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 0, matchLen)
-	assert.Nil(t, paths)
-}
-
-func TestNode_longestMatchCase8(t *testing.T) {
-	keywords := []string{
-		"abcdeg",
-		"cdef",
-		"cde",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	word := []rune("abcdef")
-	uselessLen, matchLen, paths := trie.longestMatch(word, nil)
-	assert.Equal(t, 2, uselessLen)
-	assert.Equal(t, 0, matchLen)
-	assert.NotNil(t, paths)
-}
-
-func TestNode_longestMatchCase9(t *testing.T) {
-	keywords := []string{
-		"abcdeg",
-		"cdef",
-		"cde",
-		"cd",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-
-	word := []rune("abcde")
-	uselessLen, matchLen, paths := trie.longestMatch(word, nil)
-	assert.Equal(t, 2, uselessLen)
-	assert.Equal(t, 3, matchLen)
-	assert.Nil(t, paths)
-}
-
-func TestNode_jump(t *testing.T) {
-	keywords := []string{
-		"de",
-		"fe",
-	}
-	trie := new(node)
-	for _, keyword := range keywords {
-		trie.add(keyword)
-	}
-	trie.build()
-	target := []rune("dfe")
-
-	uselessLen, matchLen, paths := trie.longestMatch(target, nil)
-	assert.Equal(t, 1, uselessLen)
-	assert.Equal(t, 0, matchLen)
-	assert.NotNil(t, paths)
-	uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths)
-	assert.Equal(t, 0, uselessLen)
-	assert.Equal(t, 2, matchLen)
-	assert.Nil(t, paths)
-}

+ 44 - 20
core/stringx/replacer.go

@@ -1,9 +1,14 @@
 package stringx
 
 import (
+	"sort"
 	"strings"
 )
 
+// replace more than once to avoid overlapped keywords after replace.
+// only try 2 times to avoid too many or infinite loops.
+const replaceTimes = 2
+
 type (
 	// Replacer interface wraps the Replace method.
 	Replacer interface {
@@ -32,29 +37,48 @@ func NewReplacer(mapping map[string]string) Replacer {
 
 // Replace replaces text with given substitutes.
 func (r *replacer) Replace(text string) string {
-	var buf strings.Builder
-	var paths []*node
-	target := []rune(text)
-	cur := r.node
-
-	for len(target) != 0 {
-		uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths)
-		if uselessLen > 0 {
-			buf.WriteString(string(target[:uselessLen]))
-			target = target[uselessLen:]
+	for i := 0; i < replaceTimes; i++ {
+		var replaced bool
+		if text, replaced = r.doReplace(text); !replaced {
+			return text
 		}
-		if matchLen > 0 {
-			replaced := r.mapping[string(target[:matchLen])]
-			target = append([]rune(replaced), target[matchLen:]...)
+	}
+
+	return text
+}
+
+func (r *replacer) doReplace(text string) (string, bool) {
+	chars := []rune(text)
+	scopes := r.find(chars)
+	if len(scopes) == 0 {
+		return text, false
+	}
+
+	sort.Slice(scopes, func(i, j int) bool {
+		if scopes[i].start < scopes[j].start {
+			return true
+		}
+		if scopes[i].start == scopes[j].start {
+			return scopes[i].stop > scopes[j].stop
 		}
-		if len(nextPaths) != 0 {
-			cur = nextPaths[len(nextPaths)-1]
-			paths = nextPaths
-		} else {
-			cur = r.node
-			paths = nil
+		return false
+	})
+
+	var buf strings.Builder
+	var index int
+	for i := 0; i < len(scopes); i++ {
+		scp := &scopes[i]
+		if scp.start < index {
+			continue
 		}
+
+		buf.WriteString(string(chars[index:scp.start]))
+		buf.WriteString(r.mapping[string(chars[scp.start:scp.stop])])
+		index = scp.stop
+	}
+	if index < len(chars) {
+		buf.WriteString(string(chars[index:]))
 	}
 
-	return buf.String()
+	return buf.String(), true
 }

+ 10 - 0
core/stringx/replacer_test.go

@@ -219,3 +219,13 @@ func TestReplacer_ReplaceLongestMatch(t *testing.T) {
 	})
 	assert.Equal(t, "东京是东京", replacer.Replace("日本的首都是东京"))
 }
+
+func TestReplacer_ReplaceIndefinitely(t *testing.T) {
+	mapping := map[string]string{
+		"日本的首都": "东京",
+		"东京":    "日本的首都",
+	}
+	assert.NotPanics(t, func() {
+		NewReplacer(mapping).Replace("日本的首都是东京")
+	})
+}