429 lines
14 KiB
Rust
429 lines
14 KiB
Rust
|
//! Functionality for finding words.
|
|||
|
//!
|
|||
|
//! In order to wrap text, we need to know where the legal break
|
|||
|
//! points are, i.e., where the words of the text are. This means that
|
|||
|
//! we need to define what a "word" is.
|
|||
|
//!
|
|||
|
//! A simple approach is to simply split the text on whitespace, but
|
|||
|
//! this does not work for East-Asian languages such as Chinese or
|
|||
|
//! Japanese where there are no spaces between words. Breaking a long
|
|||
|
//! sequence of emojis is another example where line breaks might be
|
|||
|
//! wanted even if there is no whitespace to be found.
|
|||
|
//!
|
|||
|
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and
//! its variants for more information.
|
|||
|
|
|||
|
#[cfg(feature = "unicode-linebreak")]
|
|||
|
use crate::core::skip_ansi_escape_sequence;
|
|||
|
use crate::core::Word;
|
|||
|
|
|||
|
/// Describes where words occur in a line of text.
|
|||
|
///
|
|||
|
/// The simplest approach is say that words are separated by one or
|
|||
|
/// more ASCII spaces (`' '`). This works for Western languages
|
|||
|
/// without emojis. A more complex approach is to use the Unicode line
|
|||
|
/// breaking algorithm, which finds break points in non-ASCII text.
|
|||
|
///
|
|||
|
/// The line breaks occur between words, please see
|
|||
|
/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
|
|||
|
/// hyphenation of individual words.
|
|||
|
///
|
|||
|
/// # Examples
|
|||
|
///
|
|||
|
/// ```
|
|||
|
/// use textwrap::core::Word;
|
|||
|
/// use textwrap::WordSeparator::AsciiSpace;
|
|||
|
///
|
|||
|
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
|
|||
|
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
|
|||
|
/// ```
|
|||
|
#[derive(Clone, Copy)]
|
|||
|
pub enum WordSeparator {
|
|||
|
/// Find words by splitting on runs of `' '` characters.
|
|||
|
///
|
|||
|
/// # Examples
|
|||
|
///
|
|||
|
/// ```
|
|||
|
/// use textwrap::core::Word;
|
|||
|
/// use textwrap::WordSeparator::AsciiSpace;
|
|||
|
///
|
|||
|
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
|
|||
|
/// assert_eq!(words, vec![Word::from("Hello "),
|
|||
|
/// Word::from("World!")]);
|
|||
|
/// ```
|
|||
|
AsciiSpace,
|
|||
|
|
|||
|
/// Split `line` into words using Unicode break properties.
|
|||
|
///
|
|||
|
/// This word separator uses the Unicode line breaking algorithm
|
|||
|
/// described in [Unicode Standard Annex
|
|||
|
/// #14](https://www.unicode.org/reports/tr14/) to find legal places
|
|||
|
/// to break lines. There is a small difference in that the U+002D
|
|||
|
/// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
|
|||
|
/// to allow a line break at a hyphen, use
|
|||
|
/// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
|
|||
|
/// Soft hyphens are not currently supported.
|
|||
|
///
|
|||
|
/// # Examples
|
|||
|
///
|
|||
|
/// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
|
|||
|
/// breaking algorithm will find line break opportunities between
|
|||
|
/// some characters with no intervening whitespace:
|
|||
|
///
|
|||
|
/// ```
|
|||
|
/// #[cfg(feature = "unicode-linebreak")] {
|
|||
|
/// use textwrap::core::Word;
|
|||
|
/// use textwrap::WordSeparator::UnicodeBreakProperties;
|
|||
|
///
|
|||
|
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
|
|||
|
/// vec![Word::from("Emojis: "),
|
|||
|
/// Word::from("😂"),
|
|||
|
/// Word::from("😍")]);
|
|||
|
///
|
|||
|
/// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
|
|||
|
/// vec![Word::from("CJK: "),
|
|||
|
/// Word::from("你"),
|
|||
|
/// Word::from("好")]);
|
|||
|
/// }
|
|||
|
/// ```
|
|||
|
///
|
|||
|
/// A U+2060 (Word Joiner) character can be inserted if you want to
|
|||
|
/// manually override the defaults and keep the characters together:
|
|||
|
///
|
|||
|
/// ```
|
|||
|
/// #[cfg(feature = "unicode-linebreak")] {
|
|||
|
/// use textwrap::core::Word;
|
|||
|
/// use textwrap::WordSeparator::UnicodeBreakProperties;
|
|||
|
///
|
|||
|
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
|
|||
|
/// vec![Word::from("Emojis: "),
|
|||
|
/// Word::from("😂\u{2060}😍")]);
|
|||
|
/// }
|
|||
|
/// ```
|
|||
|
///
|
|||
|
/// The Unicode line breaking algorithm will also automatically
|
|||
|
/// suppress break breaks around certain punctuation characters::
|
|||
|
///
|
|||
|
/// ```
|
|||
|
/// #[cfg(feature = "unicode-linebreak")] {
|
|||
|
/// use textwrap::core::Word;
|
|||
|
/// use textwrap::WordSeparator::UnicodeBreakProperties;
|
|||
|
///
|
|||
|
/// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
|
|||
|
/// vec![Word::from("[ foo ] "),
|
|||
|
/// Word::from("bar !")]);
|
|||
|
/// }
|
|||
|
/// ```
|
|||
|
#[cfg(feature = "unicode-linebreak")]
|
|||
|
UnicodeBreakProperties,
|
|||
|
|
|||
|
/// Find words using a custom word separator
|
|||
|
Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
|
|||
|
}
|
|||
|
|
|||
|
impl std::fmt::Debug for WordSeparator {
|
|||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|||
|
match self {
|
|||
|
WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
|
|||
|
#[cfg(feature = "unicode-linebreak")]
|
|||
|
WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
|
|||
|
WordSeparator::Custom(_) => f.write_str("Custom(...)"),
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
impl WordSeparator {
|
|||
|
// This function should really return impl Iterator<Item = Word>, but
|
|||
|
// this isn't possible until Rust supports higher-kinded types:
|
|||
|
// https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
|
|||
|
/// Find all words in `line`.
|
|||
|
pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
|
|||
|
match self {
|
|||
|
WordSeparator::AsciiSpace => find_words_ascii_space(line),
|
|||
|
#[cfg(feature = "unicode-linebreak")]
|
|||
|
WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
|
|||
|
WordSeparator::Custom(func) => func(line),
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
|
|||
|
let mut start = 0;
|
|||
|
let mut in_whitespace = false;
|
|||
|
let mut char_indices = line.char_indices();
|
|||
|
|
|||
|
Box::new(std::iter::from_fn(move || {
|
|||
|
// for (idx, ch) in char_indices does not work, gives this
|
|||
|
// error:
|
|||
|
//
|
|||
|
// > cannot move out of `char_indices`, a captured variable in
|
|||
|
// > an `FnMut` closure
|
|||
|
#[allow(clippy::while_let_on_iterator)]
|
|||
|
while let Some((idx, ch)) = char_indices.next() {
|
|||
|
if in_whitespace && ch != ' ' {
|
|||
|
let word = Word::from(&line[start..idx]);
|
|||
|
start = idx;
|
|||
|
in_whitespace = ch == ' ';
|
|||
|
return Some(word);
|
|||
|
}
|
|||
|
|
|||
|
in_whitespace = ch == ' ';
|
|||
|
}
|
|||
|
|
|||
|
if start < line.len() {
|
|||
|
let word = Word::from(&line[start..]);
|
|||
|
start = line.len();
|
|||
|
return Some(word);
|
|||
|
}
|
|||
|
|
|||
|
None
|
|||
|
}))
|
|||
|
}
|
|||
|
|
|||
|
// Return a copy of `text` with all ANSI escape sequences removed.
//
// Each character is offered to `skip_ansi_escape_sequence` together
// with the iterator over the remaining characters. The character is
// copied to the output only when that helper returns `false`.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut remaining = text.chars();
    // The stripped text can never be longer than the input.
    let mut visible = String::with_capacity(text.len());

    while let Some(ch) = remaining.next() {
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            visible.push(ch);
        }
    }

    visible
}
|
|||
|
|
|||
|
/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
|
|||
|
/// if a line is broken at this point, and otherwise be invisible.
|
|||
|
/// Textwrap does not currently support breaking words at soft
|
|||
|
/// hyphens.
|
|||
|
#[cfg(feature = "unicode-linebreak")]
|
|||
|
const SHY: char = '\u{00ad}';
|
|||
|
|
|||
|
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences in `line` are ignored when looking for break
/// opportunities, but they are kept in the output: every returned
/// `Word` is a slice of the original `line`.
///
/// Break opportunities directly after `'-'` or a soft hyphen are
/// suppressed, and the opportunity at the very end of the text is
/// never used to split: the remainder of the line, including any
/// trailing escape sequence, always forms the final word.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = last_stripped_idx;
        // Only characters that are not part of an escape sequence
        // advance the position in the stripped string.
        if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
            last_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .map(|(idx, _)| idx)
        .filter(|&idx| {
            // Skip the break opportunity at the end of the text: the
            // final word is produced below using &line[start..],
            // which ensures that we correctly include a trailing
            // ANSI escape sequence.
            //
            // This must be decided by index. Popping the last entry
            // *after* filtering would discard a legitimate break
            // whenever the text ends in a hyphen, since the
            // end-of-text opportunity has then already been removed
            // by the hyphen rules below.
            if idx >= stripped.len() {
                return false;
            }

            // Kept as a `match` rather than `matches!`, as the file
            // already opted out of that lint.
            #[allow(clippy::match_like_matches_macro)]
            match stripped[..idx].chars().next_back() {
                // We suppress breaks at ‘-’ since we want to control
                // this via the WordSplitter.
                Some('-') => false,
                // Soft hyphens are currently not supported since we
                // require all `Word` fragments to be continuous in
                // the input string.
                Some(SHY) => false,
                // Other breaks should be fine!
                _ => true,
            }
        })
        .collect::<Vec<_>>()
        .into_iter();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for idx in opportunities.by_ref() {
            // Translate the break position in the stripped string
            // back to the matching position in the original string.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // No break opportunities left: emit the rest of the line
        // once.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}
|
|||
|
|
|||
|
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Turn the expected string slices into the `Word`s they denote.
    fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate a pair of tests, one per word separator. Each case is
    // written as `[line, expected ASCII words, expected Unicode words]`.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // The whitespace tests below deliberately use runs of several
    // spaces: a run must stay attached to the preceding word, or form
    // a single word of its own when nothing precedes it.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo   ", ["foo   "], ["foo   "]]
    );

    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }
}
|