瀏覽代碼

fix Replacer suffix match, and add test case (#2867)

* fix: replace should replace the longest match

* feat: revert bytes.Buffer to strings.Builder

* fix: loop reset nextStart

* feat: add node longest match test

* feat: add replacer suffix match test case

* feat: multiple match

* fix: partial match ends

* fix: replace look back upon error

* feat: rm unnecessary branch

---------

Co-authored-by: hudahai <hscxrzs@gmail.com>
Co-authored-by: hushichang <hushichang@sensetime.com>
dahaihu 2 年之前
父節點
當前提交
cacd5dc91a
共有 4 個文件被更改,包括 349 次插入和 49 次删除
  1. 66 28
      core/stringx/node.go
  2. 228 3
      core/stringx/node_test.go
  3. 15 18
      core/stringx/replacer.go
  4. 40 0
      core/stringx/replacer_test.go

+ 66 - 28
core/stringx/node.go

@@ -14,7 +14,6 @@ func (n *node) add(word string) {
 	}
 
 	nd := n
-	var depth int
 	for i, char := range chars {
 		if nd.children == nil {
 			child := new(node)
@@ -23,7 +22,6 @@ func (n *node) add(word string) {
 			nd = child
 		} else if child, ok := nd.children[char]; ok {
 			nd = child
-			depth++
 		} else {
 			child := new(node)
 			child.depth = i + 1
@@ -99,51 +97,91 @@ func (n *node) find(chars []rune) []scope {
 	return scopes
 }
 
-func (n *node) longestMatch(chars []rune, start int) (used int, jump *node, matched bool) {
+func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) {
 	cur := n
-	var matchedNode *node
+	var longestMatched *node
+	findMatch := func(path []*node) (*node, int) {
+		var (
+			result *node
+			start  int
+		)
+		for i := len(path) - 1; i >= 0; i-- {
+			icur := path[i]
+			var cur *node
+			for icur.fail != nil {
+				if icur.fail.end {
+					cur = icur.fail
+					break
+				}
+				icur = icur.fail
+			}
+			if cur != nil {
+				if result == nil {
+					result = cur
+					start = i - result.depth + 1
+				} else if curStart := i - cur.depth + 1; curStart < start {
+					result = cur
+					start = curStart
+				}
+			}
+		}
+		return result, start
+	}
 
-	for i := start; i < len(chars); i++ {
-		child, ok := cur.children[chars[i]]
+	for i := len(paths); i < len(chars); i++ {
+		char := chars[i]
+		child, ok := cur.children[char]
 		if ok {
 			cur = child
 			if cur.end {
-				matchedNode = cur
+				longestMatched = cur
 			}
+			paths = append(paths, cur)
 		} else {
-			if matchedNode != nil {
-				return matchedNode.depth, nil, true
+			if longestMatched != nil {
+				return 0, longestMatched.depth, nil
 			}
-
 			if n.end {
-				return start, nil, true
+				return 0, n.depth, nil
 			}
-
+			// old path pre longest preMatch
+			preMatch, preStart := findMatch(paths)
+			// new path match
 			var jump *node
-			for cur.fail != nil {
-				jump, ok = cur.fail.children[chars[i]]
+			icur := cur
+			for icur.fail != nil {
+				jump, ok = icur.fail.children[char]
 				if ok {
 					break
 				}
-				cur = cur.fail
+				icur = icur.fail
 			}
-			if jump != nil {
-				return i + 1 - jump.depth, jump, false
+			switch {
+			case preMatch != nil && jump != nil:
+				if jumpStart := i - jump.depth + 1; preStart < jumpStart {
+					return preStart, preMatch.depth, nil
+				} else {
+					return jumpStart, 0, append(paths[jumpStart:], jump)
+				}
+			case preMatch != nil && jump == nil:
+				return preStart, preMatch.depth, nil
+			case preMatch == nil && jump != nil:
+				return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump)
+			case preMatch == nil && jump == nil:
+				return i + 1, 0, nil
 			}
-
-			return i + 1, nil, false
 		}
 	}
-
-	// longest matched node
-	if matchedNode != nil {
-		return matchedNode.depth, nil, true
+	// this longest matched node
+	if longestMatched != nil {
+		return 0, longestMatched.depth, nil
 	}
-
-	// last matched node
 	if n.end {
-		return start, nil, true
+		return 0, n.depth, nil
 	}
-
-	return len(chars), nil, false
+	match, start := findMatch(paths)
+	if match != nil {
+		return start, match.depth, nil
+	}
+	return len(chars), 0, nil
 }

+ 228 - 3
core/stringx/node_test.go

@@ -9,10 +9,10 @@ import (
 func TestLongestMatchGuardedCondition(t *testing.T) {
 	n := new(node)
 	n.end = true
-	used, jump, matched := n.longestMatch([]rune(""), 0)
-	assert.Equal(t, 0, used)
+	uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil)
+	assert.Equal(t, 0, uselessLen)
 	assert.Nil(t, jump)
-	assert.True(t, matched)
+	assert.Equal(t, 0, matchLen)
 }
 
 func TestFuzzNodeCase1(t *testing.T) {
@@ -202,3 +202,228 @@ func BenchmarkNodeFind(b *testing.B) {
 		trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
 	}
 }
+
+func TestNode_longestMatchCase0(t *testing.T) {
+	// match the longest word
+	keywords := []string{
+		"a",
+		"ab",
+		"abc",
+		"abcd",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil)
+	assert.Equal(t, 0, uselessLen)
+	assert.Equal(t, 3, matchLen)
+	assert.Nil(t, jump)
+}
+
+func TestNode_longestMatchCase1(t *testing.T) {
+	keywords := []string{
+		"abcde",
+		"bcde",
+		"cde",
+		"de",
+
+		"b",
+		"bc",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 2, matchLen)
+	assert.Nil(t, jump)
+}
+
+func TestNode_longestMatchCase2(t *testing.T) {
+	keywords := []string{
+		"abcde",
+		"bcde",
+		"cde",
+		"de",
+
+		"c",
+		"cd",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
+	assert.Equal(t, 2, uselessLen)
+	assert.Equal(t, 2, matchLen)
+	assert.Nil(t, jump)
+}
+
+func TestNode_longestMatchCase3(t *testing.T) {
+	keywords := []string{
+		"abcde",
+		"bcde",
+		"cde",
+		"de",
+
+		"b",
+		"bc",
+		"c",
+		"cd",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 2, matchLen)
+	assert.Nil(t, jump)
+}
+
+func TestNode_longestMatchCase4(t *testing.T) {
+	keywords := []string{
+		"abcde",
+		"bcdf",
+		"bcd",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 0, matchLen)
+	assert.Equal(t, 4, len(paths))
+}
+
+func TestNode_longestMatchCase5(t *testing.T) {
+	keywords := []string{
+		"abcdef",
+		"bcde",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 4, matchLen)
+	assert.Nil(t, paths)
+}
+
+func TestNode_longestMatchCase6(t *testing.T) {
+	keywords := []string{
+		"abcde",
+		"bc",
+		"d",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 2, matchLen)
+	assert.Nil(t, jump)
+}
+
+func TestNode_longestMatchCase7(t *testing.T) {
+	keywords := []string{
+		"abcdeg",
+		"cdef",
+		"bcde",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	word := []rune("abcdef")
+	uselessLen, matchLen, paths := trie.longestMatch(word, nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 4, matchLen)
+	assert.Nil(t, paths)
+	uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 0, matchLen)
+	assert.Nil(t, paths)
+}
+
+func TestNode_longestMatchCase8(t *testing.T) {
+	keywords := []string{
+		"abcdeg",
+		"cdef",
+		"cde",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	word := []rune("abcdef")
+	uselessLen, matchLen, paths := trie.longestMatch(word, nil)
+	assert.Equal(t, 2, uselessLen)
+	assert.Equal(t, 0, matchLen)
+	assert.NotNil(t, paths)
+}
+
+func TestNode_longestMatchCase9(t *testing.T) {
+	keywords := []string{
+		"abcdeg",
+		"cdef",
+		"cde",
+		"cd",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+
+	word := []rune("abcde")
+	uselessLen, matchLen, paths := trie.longestMatch(word, nil)
+	assert.Equal(t, 2, uselessLen)
+	assert.Equal(t, 3, matchLen)
+	assert.Nil(t, paths)
+}
+
+func TestNode_jump(t *testing.T) {
+	keywords := []string{
+		"de",
+		"fe",
+	}
+	trie := new(node)
+	for _, keyword := range keywords {
+		trie.add(keyword)
+	}
+	trie.build()
+	target := []rune("dfe")
+
+	uselessLen, matchLen, paths := trie.longestMatch(target, nil)
+	assert.Equal(t, 1, uselessLen)
+	assert.Equal(t, 0, matchLen)
+	assert.NotNil(t, paths)
+	uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths)
+	assert.Equal(t, 0, uselessLen)
+	assert.Equal(t, 2, matchLen)
+	assert.Nil(t, paths)
+}

+ 15 - 18
core/stringx/replacer.go

@@ -33,29 +33,26 @@ func NewReplacer(mapping map[string]string) Replacer {
 // Replace replaces text with given substitutes.
 func (r *replacer) Replace(text string) string {
 	var buf strings.Builder
-	var nextStart int
 	target := []rune(text)
 	cur := r.node
-
+	var paths []*node
 	for len(target) != 0 {
-		used, jump, matched := cur.longestMatch(target, nextStart)
-		if matched {
-			replaced := r.mapping[string(target[:used])]
-			target = append([]rune(replaced), target[used:]...)
-			cur = r.node
-			nextStart = 0
+		uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths)
+		if uselessLen > 0 {
+			buf.WriteString(string(target[:uselessLen]))
+			target = target[uselessLen:]
+		}
+		if matchLen > 0 {
+			replaced := r.mapping[string(target[:matchLen])]
+			target = append([]rune(replaced), target[matchLen:]...)
+		}
+		if len(nextPaths) != 0 {
+			cur = nextPaths[len(nextPaths)-1]
+			paths = nextPaths
 		} else {
-			buf.WriteString(string(target[:used]))
-			target = target[used:]
-			if jump != nil {
-				cur = jump
-				nextStart = jump.depth
-			} else {
-				cur = r.node
-				nextStart = 0
-			}
+			cur = r.node
+			paths = nil
 		}
 	}
-
 	return buf.String()
 }

+ 40 - 0
core/stringx/replacer_test.go

@@ -15,6 +15,15 @@ func TestReplacer_Replace(t *testing.T) {
 	assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五"))
 }
 
+func TestReplacer_ReplaceJumpMatch(t *testing.T) {
+	mapping := map[string]string{
+		"abcdeg": "ABCDEG",
+		"cdef":   "CDEF",
+		"cde":    "CDE",
+	}
+	assert.Equal(t, "abCDEF", NewReplacer(mapping).Replace("abcdef"))
+}
+
 func TestReplacer_ReplaceOverlap(t *testing.T) {
 	mapping := map[string]string{
 		"3d": "34",
@@ -44,6 +53,14 @@ func TestReplacer_ReplacePartialMatch(t *testing.T) {
 	assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五"))
 }
 
+func TestReplacer_ReplacePartialMatchEnds(t *testing.T) {
+	mapping := map[string]string{
+		"二三四七": "2347",
+		"三四":   "34",
+	}
+	assert.Equal(t, "零一二34", NewReplacer(mapping).Replace("零一二三四"))
+}
+
 func TestReplacer_ReplaceMultiMatches(t *testing.T) {
 	mapping := map[string]string{
 		"二三": "23",
@@ -60,6 +77,29 @@ func TestReplacer_ReplaceLongestMatching(t *testing.T) {
 	assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本"))
 }
 
+func TestReplacer_ReplaceSuffixMatch(t *testing.T) {
+	// case1
+	{
+		keywords := map[string]string{
+			"abcde": "ABCDE",
+			"bcde":  "BCDE",
+			"bcd":   "BCD",
+		}
+		assert.Equal(t, "aBCDf", NewReplacer(keywords).Replace("abcdf"))
+	}
+	// case2
+	{
+		keywords := map[string]string{
+			"abcde": "ABCDE",
+			"bcde":  "BCDE",
+			"cde":   "CDE",
+			"c":     "C",
+			"cd":    "CD",
+		}
+		assert.Equal(t, "abCDf", NewReplacer(keywords).Replace("abcdf"))
+	}
+}
+
 func TestReplacer_ReplaceLongestOverlap(t *testing.T) {
 	keywords := map[string]string{
 		"456":  "def",