feat(ansi): define Method and make wcwidth aware variants (#341)

* feat(ansi): define Method and make wcwidth aware variants This commit introduces a new type, Method, that represents how the renderer should calculate the display width of cells. It adds helper methods to the Method type that allow users to use either WcWidth, which uses go-runewidth, or GraphemeWidth, which uses uniseg, to calculate the width of strings and text. Functions like `ansi.StringWidth`, `ansi.Truncate`, `ansi.TruncateLeft`, and `ansi.Cut` now have variants that are aware of wide characters and runes. These new functions are suffixed with `Wc` (for example `ansi.TruncateWc`) and use go-runewidth to calculate the width of strings. Related: #217 * feat(ansi): add WcStringWidth function test * refactor: rename wc functions to be more consistent
charmbracelet · Jan 24, 2025 · bffb63e · bffb63e
1 parent e24d5c3
commit bffb63e
Show file tree

Hide file tree

Showing 6 changed files with 406 additions and 24 deletions.
diff --git a/ansi/method.go b/ansi/method.go
@@ -0,0 +1,172 @@
+package ansi
+
+// Method is a type that represents how the renderer should calculate the
+// display width of cells.
+type Method uint8
+
+// Display width modes.
+const (
+	// WcWidth computes widths with go-runewidth.
+	WcWidth Method = iota
+	// GraphemeWidth computes widths with uniseg grapheme clustering.
+	GraphemeWidth
+)
+
+// StringWidth returns the width of a string in cells, measured according to
+// m. This is the number of cells that the string will occupy when printed in
+// a terminal. ANSI escape codes are ignored and wide characters (such as East
+// Asian characters and emojis) are accounted for.
+func (m Method) StringWidth(s string) int {
+	return stringWidth(m, s)
+}
+
+// Truncate truncates a string to a given length, adding a tail to the end if
+// the string is longer than the given length. This function is aware of ANSI
+// escape codes and will not break them, and accounts for wide-characters (such
+// as East-Asian characters and emojis), whose width is computed according to m.
+func (m Method) Truncate(s string, length int, tail string) string {
+	return truncate(m, s, length, tail)
+}
+
+// TruncateLeft truncates a string from the left side by removing length
+// characters, adding a prefix to the beginning if the string is longer than
+// length. This function is aware of ANSI escape codes and will not break them,
+// and accounts for wide-characters according to m. A length <= 0 returns s.
+func (m Method) TruncateLeft(s string, length int, prefix string) string {
+	return truncateLeft(m, s, length, prefix)
+}
+
+// Cut cuts the string, without adding any prefix or tail strings. This
+// function is aware of ANSI escape codes and will not break them, and accounts
+// for wide-characters (such as East-Asian characters and emojis) according to
+// m. Note that the [left] parameter is inclusive, while [right] isn't.
+func (m Method) Cut(s string, left, right int) string {
+	return cut(m, s, left, right)
+}
+
+// Hardwrap wraps a string or a block of text to a given line length, breaking
+// word boundaries. This will preserve ANSI escape codes and will account for
+// wide-characters in the string.
+// When preserveSpace is true, spaces at the beginning of a line will be
+// preserved.
+// The width of the text is measured according to m.
+func (m Method) Hardwrap(s string, length int, preserveSpace bool) string {
+	return hardwrap(m, s, length, preserveSpace)
+}
+
+// Wordwrap wraps a string or a block of text to a given line length, not
+// breaking word boundaries. This will preserve ANSI escape codes and will
+// account for wide-characters in the string, measuring them according to m.
+// The breakpoints string is a list of characters that are considered
+// breakpoints for word wrapping. A hyphen (-) is always considered a
+// breakpoint.
+//
+// Note: breakpoints must be a string of 1-cell wide rune characters.
+func (m Method) Wordwrap(s string, length int, breakpoints string) string {
+	return wordwrap(m, s, length, breakpoints)
+}
+
+// Wrap wraps a string or a block of text to a given line length, breaking word
+// boundaries if necessary. This will preserve ANSI escape codes and will
+// account for wide-characters in the string, measuring them according to m.
+// The breakpoints string is a list of characters that are considered
+// breakpoints for word wrapping. A hyphen (-) is always considered a breakpoint.
+//
+// Note: breakpoints must be a string of 1-cell wide rune characters.
+func (m Method) Wrap(s string, length int, breakpoints string) string {
+	return wrap(m, s, length, breakpoints)
+}
+
+// DecodeSequence decodes the first ANSI escape sequence or a printable
+// grapheme from the given data. It returns the sequence slice, the cell width
+// of the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Unicode text is decoded one grapheme cluster at a time; m only
+// selects how the width of a cluster is computed. With [GraphemeWidth] it is
+// the width reported by grapheme clustering (uniseg, mode 2027); with
+// [WcWidth] it is computed by go-runewidth over the same cluster.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Command] and [Parameter] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [Command] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// The command, marker, and intermediate bytes are packed into a single
+// [Command] value. This is done to avoid using a struct to store the command
+// and its intermediates and markers. The command byte is always the least
+// significant byte i.e. [Cmd & 0xff]. Use the [Command] type to unpack the
+// command, intermediate, and marker bytes. Note that we only collect the last
+// marker character and intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Parameter] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := []byte("\x1b[31mHello, World!\x1b[0m")
+//	for len(input) > 0 {
+//		seq, width, n, newState := WcWidth.DecodeSequence(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+func (m Method) DecodeSequence(data []byte, state byte, p *Parser) (seq []byte, width, n int, newState byte) {
+	return decodeSequence(m, data, state, p)
+}
+
+// DecodeSequenceInString decodes the first ANSI escape sequence or a printable
+// grapheme from the given string. It returns the sequence, the cell width of
+// the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Unicode text is decoded one grapheme cluster at a time; m only
+// selects how the width of a cluster is computed. With [GraphemeWidth] it is
+// the width reported by grapheme clustering (uniseg, mode 2027); with
+// [WcWidth] it is computed by go-runewidth over the same cluster.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Command] and [Parameter] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [Command] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// The command, marker, and intermediate bytes are packed into a single
+// [Command] value. This is done to avoid using a struct to store the command
+// and its intermediates and markers. The command byte is always the least
+// significant byte i.e. [Cmd & 0xff]. Use the [Command] type to unpack the
+// command, intermediate, and marker bytes. Note that we only collect the last
+// marker character and intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Parameter] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := "\x1b[31mHello, World!\x1b[0m"
+//	for len(input) > 0 {
+//		seq, width, n, newState := WcWidth.DecodeSequenceInString(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+func (m Method) DecodeSequenceInString(data string, state byte, p *Parser) (seq string, width, n int, newState byte) {
+	return decodeSequence(m, data, state, p)
+}
diff --git a/ansi/parser_decode.go b/ansi/parser_decode.go
@@ -4,6 +4,7 @@ import (
 	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/mattn/go-runewidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -65,7 +66,63 @@ const (
 //		state = newState
 //		input = input[n:]
 //	}
+//
+// This function treats the text as a sequence of grapheme clusters.
 func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
+	return decodeSequence(GraphemeWidth, b, state, p)
+}
+
+// DecodeSequenceWc decodes the first ANSI escape sequence or a printable
+// grapheme from the given data. It returns the sequence slice, the cell width
+// of the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Unicode text is still decoded one grapheme cluster at a time, but
+// the width of each cluster is computed with the go-runewidth package
+// ([WcWidth]) instead of uniseg.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Command] and [Parameter] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [Command] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// The command, marker, and intermediate bytes are packed into a single
+// [Command] value. This is done to avoid using a struct to store the command
+// and its intermediates and markers. The command byte is always the least
+// significant byte i.e. [Cmd & 0xff]. Use the [Command] type to unpack the
+// command, intermediate, and marker bytes. Note that we only collect the last
+// marker character and intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Parameter] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := []byte("\x1b[31mHello, World!\x1b[0m")
+//	for len(input) > 0 {
+//		seq, width, n, newState := DecodeSequenceWc(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+//
+// This function treats the text as a sequence of wide characters and runes.
+func DecodeSequenceWc[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
+	return decodeSequence(WcWidth, b, state, p)
+}
+
+func decodeSequence[T string | []byte](m Method, b T, state State, p *Parser) (seq T, width int, n int, newState byte) {
 	for i := 0; i < len(b); i++ {
 		c := b[i]
 
@@ -120,6 +177,9 @@ func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width
 
 			if utf8.RuneStart(c) {
 				seq, _, width, _ = FirstGraphemeCluster(b, -1)
+				if m == WcWidth {
+					width = runewidth.StringWidth(string(seq))
+				}
 				i += len(seq)
 				return b[:i], width, i, NormalState
 			}

diff --git a/ansi/truncate.go b/ansi/truncate.go
@@ -4,29 +4,65 @@ import (
 	"bytes"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/mattn/go-runewidth"
 	"github.com/rivo/uniseg"
 )
 
 // Cut the string, without adding any prefix or tail strings. This function is
 // aware of ANSI escape codes and will not break them, and accounts for
 // wide-characters (such as East-Asian characters and emojis). Note that the
 // [left] parameter is inclusive, while [right] isn't.
+// This treats the text as a sequence of grapheme clusters ([GraphemeWidth]).
 func Cut(s string, left, right int) string {
+	return cut(GraphemeWidth, s, left, right)
+}
+
+// CutWc cuts the string, without adding any prefix or tail strings. This
+// function is aware of ANSI escape codes and will not break them, and accounts
+// for wide-characters (such as East-Asian characters and emojis). Note that
+// the [left] parameter is inclusive, while [right] isn't.
+// This treats the text as a sequence of wide characters and runes ([WcWidth]).
+func CutWc(s string, left, right int) string {
+	return cut(WcWidth, s, left, right)
+}
+
+// cut returns the part of s between the cell columns left (inclusive) and
+// right (exclusive), measuring widths according to m. It returns an empty
+// string when right <= left.
+func cut(m Method, s string, left, right int) string {
 	if right <= left {
 		return ""
 	}
 
+	// Dispatch on m directly so that both the right-hand truncation and the
+	// left-hand truncation use the same width method.
 	if left == 0 {
-		return Truncate(s, right, "")
+		return truncate(m, s, right, "")
 	}
-	return TruncateLeft(Truncate(s, right, ""), left, "")
+	return truncateLeft(m, truncate(m, s, right, ""), left, "")
 }
 
 // Truncate truncates a string to a given length, adding a tail to the end if
 // the string is longer than the given length. This function is aware of ANSI
 // escape codes and will not break them, and accounts for wide-characters (such
 // as East-Asian characters and emojis).
+// This treats the text as a sequence of grapheme clusters ([GraphemeWidth]).
 func Truncate(s string, length int, tail string) string {
+	return truncate(GraphemeWidth, s, length, tail)
+}
+
+// TruncateWc truncates a string to a given length, adding a tail to the end if
+// the string is longer than the given length. This function is aware of ANSI
+// escape codes and will not break them, and accounts for wide-characters (such
+// as East-Asian characters and emojis).
+// This treats the text as a sequence of wide characters and runes ([WcWidth]).
+func TruncateWc(s string, length int, tail string) string {
+	return truncate(WcWidth, s, length, tail)
+}
+
+func truncate(m Method, s string, length int, tail string) string {
 	if sw := StringWidth(s); sw <= length {
 		return s
 	}
@@ -57,6 +93,9 @@ func Truncate(s string, length int, tail string) string {
 			// This action happens when we transition to the Utf8State.
 			var width int
 			cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1)
+			if m == WcWidth {
+				width = runewidth.StringWidth(string(cluster))
+			}
 
 			// increment the index by the length of the cluster
 			i += len(cluster)
@@ -126,7 +165,21 @@ func Truncate(s string, length int, tail string) string {
 // adding a prefix to the beginning if the string is longer than n.
 // This function is aware of ANSI escape codes and will not break them, and
 // accounts for wide-characters (such as East-Asian characters and emojis).
+// This treats the text as a sequence of graphemes.
 func TruncateLeft(s string, n int, prefix string) string {
+	return truncateLeft(GraphemeWidth, s, n, prefix)
+}
+
+// TruncateLeftWc truncates a string from the left side by removing n characters,
+// adding a prefix to the beginning if the string is longer than n.
+// This function is aware of ANSI escape codes and will not break them, and
+// accounts for wide-characters (such as East-Asian characters and emojis).
+// This treats the text as a sequence of wide characters and runes ([WcWidth]).
+func TruncateLeftWc(s string, n int, prefix string) string {
+	return truncateLeft(WcWidth, s, n, prefix)
+}
+
+func truncateLeft(m Method, s string, n int, prefix string) string {
 	if n <= 0 {
 		return s
 	}
@@ -149,6 +202,9 @@ func TruncateLeft(s string, n int, prefix string) string {
 		if state == parser.Utf8State {
 			var width int
 			cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1)
+			if m == WcWidth {
+				width = runewidth.StringWidth(string(cluster))
+			}
 
 			i += len(cluster)
 			curWidth += width