Final touches for combining character support

2025-06-18 14:55:38 -04:00 · 2020-05-20 17:00:56 -04:00 · 2020-05-20 17:00:56 -04:00 · ff0683d6d0
commit ff0683d6d0
parent 79c0ea17ad
5 changed files with 128 additions and 18 deletions
--- a/internal/util/lua.go
+++ b/internal/util/lua.go
@@ -1,15 +1,11 @@
 package util

-import (
-	"unicode/utf8"
-)
-
 // LuaRuneAt is a helper function for lua plugins to return the rune
 // at an index within a string
 func LuaRuneAt(str string, runeidx int) string {
 	i := 0
 	for len(str) > 0 {
-		r, size := utf8.DecodeRuneInString(str)
+		r, _, size := DecodeCharacterInString(str)

 		str = str[size:]

@@ -26,7 +22,7 @@ func LuaRuneAt(str string, runeidx int) string {
 func LuaGetLeadingWhitespace(s string) string {
 	ws := []byte{}
 	for len(s) > 0 {
-		r, size := utf8.DecodeRuneInString(s)
+		r, _, size := DecodeCharacterInString(s)
 		if r == ' ' || r == '\t' {
 			ws = append(ws, byte(r))
 		} else {
@@ -40,6 +36,6 @@ func LuaGetLeadingWhitespace(s string) string {

 // LuaIsWordChar returns true if the first rune in a string is a word character
 func LuaIsWordChar(s string) bool {
-	r, _ := utf8.DecodeRuneInString(s)
+	r, _, _ := DecodeCharacterInString(s)
 	return IsWordChar(r)
 }
--- a/internal/util/unicode.go
+++ b/internal/util/unicode.go
@@ -5,6 +5,17 @@ import (
 	"unicode/utf8"
 )

+// Unicode is annoying. A "code point" (rune in Go-speak) may need up to
+// 4 bytes to represent it. In general, a code point will represent a
+// complete character, but this is not always the case. A character with
+// accents may be made up of multiple code points (the code point for the
+// original character, and additional code points for each accent/marking).
+// The functions below are meant to help deal with these additional "combining"
+// code points. In underlying operations (search, replace, etc...), micro will
+// treat a character with combining code points as just the original code point.
+// For rendering, micro will display the combining characters. It's not perfect
+// but it's pretty good.
+
 // combining character range table
 var combining = &unicode.RangeTable{
 	R16: []unicode.Range16{
@@ -35,6 +46,25 @@ func DecodeCharacter(b []byte) (rune, []rune, int) {
 	return r, combc, size
 }

+// DecodeCharacterInString returns the next character from a string
+// A character is a rune along with any accompanying combining runes
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+	r, size := utf8.DecodeRuneInString(str)
+	str = str[size:]
+	c, s := utf8.DecodeRuneInString(str)
+
+	var combc []rune
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s
+
+		str = str[s:]
+		c, s = utf8.DecodeRuneInString(str)
+	}
+
+	return r, combc, size
+}
+
 // CharacterCount returns the number of characters in a byte array
 // Similar to utf8.RuneCount but for unicode characters
 func CharacterCount(b []byte) int {
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -13,7 +13,6 @@ import (
 	"strings"
 	"time"
 	"unicode"
-	"unicode/utf8"

 	"github.com/blang/semver"
 	runewidth "github.com/mattn/go-runewidth"
@@ -82,7 +81,7 @@ func SliceEndStr(str string, index int) string {
 			return str[totalSize:]
 		}

-		_, size := utf8.DecodeRuneInString(str[totalSize:])
+		_, _, size := DecodeCharacterInString(str[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -119,7 +118,7 @@ func SliceStartStr(str string, index int) string {
 			return str[:totalSize]
 		}

-		_, size := utf8.DecodeRuneInString(str[totalSize:])
+		_, _, size := DecodeCharacterInString(str[totalSize:])
 		totalSize += size
 		i++
 	}
--- a/pkg/highlight/highlighter.go
+++ b/pkg/highlight/highlighter.go
@@ -3,7 +3,6 @@ package highlight
 import (
 	"regexp"
 	"strings"
-	"unicode/utf8"
 )

 func sliceStart(slc []byte, index int) []byte {
@@ -15,7 +14,7 @@ func sliceStart(slc []byte, index int) []byte {
 			return slc[totalSize:]
 		}

-		_, size := utf8.DecodeRune(slc[totalSize:])
+		_, _, size := DecodeCharacter(slc[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -32,7 +31,7 @@ func sliceEnd(slc []byte, index int) []byte {
 			return slc[:totalSize]
 		}

-		_, size := utf8.DecodeRune(slc[totalSize:])
+		_, _, size := DecodeCharacter(slc[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -47,9 +46,9 @@ func runePos(p int, str []byte) int {
 		return 0
 	}
 	if p >= len(str) {
-		return utf8.RuneCount(str)
+		return CharacterCount(str)
 	}
-	return utf8.RuneCount(str[:p])
+	return CharacterCount(str[:p])
 }

 func combineLineMatch(src, dst LineMatch) LineMatch {
@@ -112,7 +111,7 @@ func findIndex(regex *regexp.Regexp, skip *regexp.Regexp, str []byte, canMatchSt
 	var strbytes []byte
 	if skip != nil {
 		strbytes = skip.ReplaceAllFunc(str, func(match []byte) []byte {
-			res := make([]byte, utf8.RuneCount(match))
+			res := make([]byte, CharacterCount(match))
 			return res
 		})
 	} else {
@@ -148,7 +147,7 @@ func findAllIndex(regex *regexp.Regexp, str []byte, canMatchStart, canMatchEnd b
 }

 func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, curRegion *region, statesOnly bool) LineMatch {
-	lineLen := utf8.RuneCount(line)
+	lineLen := CharacterCount(line)
 	if start == 0 {
 		if !statesOnly {
 			if _, ok := highlights[0]; !ok {
@@ -236,7 +235,7 @@ func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchE
 }

 func (h *Highlighter) highlightEmptyRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, statesOnly bool) LineMatch {
-	lineLen := utf8.RuneCount(line)
+	lineLen := CharacterCount(line)
 	if lineLen == 0 {
 		if canMatchEnd {
 			h.lastRegion = nil
--- a/pkg/highlight/unicode.go
+++ b/pkg/highlight/unicode.go
@@ -0,0 +1,86 @@
+package highlight
+
+import (
+	"unicode"
+	"unicode/utf8"
+)
+
+// combining character range table
+var combining = &unicode.RangeTable{
+	R16: []unicode.Range16{
+		{0x0300, 0x036f, 1}, // combining diacritical marks
+		{0x1ab0, 0x1aff, 1}, // combining diacritical marks extended
+		{0x1dc0, 0x1dff, 1}, // combining diacritical marks supplement
+		{0x20d0, 0x20ff, 1}, // combining diacritical marks for symbols
+		{0xfe20, 0xfe2f, 1}, // combining half marks
+	},
+}
+
+// DecodeCharacter returns the next character from an array of bytes
+// A character is a rune along with any accompanying combining runes
+func DecodeCharacter(b []byte) (rune, []rune, int) {
+	r, size := utf8.DecodeRune(b)
+	b = b[size:]
+	c, s := utf8.DecodeRune(b)
+
+	var combc []rune
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s
+
+		b = b[s:]
+		c, s = utf8.DecodeRune(b)
+	}
+
+	return r, combc, size
+}
+
+// DecodeCharacterInString returns the next character from a string
+// A character is a rune along with any accompanying combining runes
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+	r, size := utf8.DecodeRuneInString(str)
+	str = str[size:]
+	c, s := utf8.DecodeRuneInString(str)
+
+	var combc []rune
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s
+
+		str = str[s:]
+		c, s = utf8.DecodeRuneInString(str)
+	}
+
+	return r, combc, size
+}
+
+// CharacterCount returns the number of characters in a byte array
+// Similar to utf8.RuneCount but for unicode characters
+func CharacterCount(b []byte) int {
+	s := 0
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeRune(b)
+		if !unicode.In(r, combining) {
+			s++
+		}
+
+		b = b[size:]
+	}
+
+	return s
+}
+
+// CharacterCountInString returns the number of characters in a string.
+// Similar to utf8.RuneCountInString but for unicode characters.
+func CharacterCountInString(str string) int {
+	s := 0
+
+	for _, r := range str {
+		if !unicode.In(r, combining) {
+			s++
+		}
+	}
+
+	return s
+}