Files
fparkan/vendor/textwrap/src/word_separators.rs
2024-01-08 01:21:28 +04:00

429 lines
14 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Functionality for finding words.
//!
//! In order to wrap text, we need to know where the legal break
//! points are, i.e., where the words of the text are. This means that
//! we need to define what a "word" is.
//!
//! A simple approach is to simply split the text on whitespace, but
//! this does not work for East-Asian languages such as Chinese or
//! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
//!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and
//! the structs which implement it for more information.
#[cfg(feature = "unicode-linebreak")]
use crate::core::skip_ansi_escape_sequence;
use crate::core::Word;
/// Describes where words occur in a line of text.
///
/// The simplest approach is to say that words are separated by one or
/// more ASCII spaces (`' '`). This works for Western languages
/// without emojis. A more complex approach is to use the Unicode line
/// breaking algorithm, which finds break points in non-ASCII text.
///
/// The line breaks occur between words, please see
/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
/// hyphenation of individual words.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::AsciiSpace;
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// ```
#[derive(Clone, Copy)]
pub enum WordSeparator {
/// Find words by splitting on runs of `' '` characters.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::AsciiSpace;
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "),
/// Word::from("World!")]);
/// ```
AsciiSpace,
/// Split `line` into words using Unicode break properties.
///
/// This word separator uses the Unicode line breaking algorithm
/// described in [Unicode Standard Annex
/// #14](https://www.unicode.org/reports/tr14/) to find legal places
/// to break lines. There is a small difference in that the U+002D
/// (Hyphen-Minus) and U+00AD (Soft Hyphen) don't create a line break:
/// to allow a line break at a hyphen, use
/// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
/// Soft hyphens are not currently supported.
///
/// # Examples
///
/// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
/// breaking algorithm will find line break opportunities between
/// some characters with no intervening whitespace:
///
/// ```
/// #[cfg(feature = "unicode-linebreak")] {
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::UnicodeBreakProperties;
///
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
/// vec![Word::from("Emojis: "),
/// Word::from("😂"),
/// Word::from("😍")]);
///
/// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
/// vec![Word::from("CJK: "),
/// Word::from("你"),
/// Word::from("好")]);
/// }
/// ```
///
/// A U+2060 (Word Joiner) character can be inserted if you want to
/// manually override the defaults and keep the characters together:
///
/// ```
/// #[cfg(feature = "unicode-linebreak")] {
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::UnicodeBreakProperties;
///
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
/// vec![Word::from("Emojis: "),
/// Word::from("😂\u{2060}😍")]);
/// }
/// ```
///
/// The Unicode line breaking algorithm will also automatically
/// suppress line breaks around certain punctuation characters:
///
/// ```
/// #[cfg(feature = "unicode-linebreak")] {
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::UnicodeBreakProperties;
///
/// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
/// vec![Word::from("[ foo ] "),
/// Word::from("bar !")]);
/// }
/// ```
#[cfg(feature = "unicode-linebreak")]
UnicodeBreakProperties,
/// Find words using a custom word separator.
///
/// The wrapped function receives the line of text and returns a
/// boxed iterator over the words it found in that line.
Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
}
/// Debug output is the name of the variant.
///
/// The `Custom` variant is rendered as the fixed text `Custom(...)`;
/// the wrapped function pointer itself is not printed.
impl std::fmt::Debug for WordSeparator {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the label first so there is a single write at the end.
        let label = match self {
            WordSeparator::AsciiSpace => "AsciiSpace",
            #[cfg(feature = "unicode-linebreak")]
            WordSeparator::UnicodeBreakProperties => "UnicodeBreakProperties",
            WordSeparator::Custom(_) => "Custom(...)",
        };
        f.write_str(label)
    }
}
impl WordSeparator {
    // This function should really return impl Iterator<Item = Word>, but
    // this isn't possible until Rust supports higher-kinded types:
    // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
    /// Find all words in `line`.
    ///
    /// The work is delegated to the implementation that belongs to
    /// the selected variant; for [`WordSeparator::Custom`] that is
    /// the function stored in the variant. The returned iterator
    /// yields [`Word`]s that borrow from `line`.
    pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
        // `WordSeparator` is `Copy`, so the function pointer can be
        // bound by value straight out of `*self`.
        match *self {
            WordSeparator::Custom(separator) => separator(line),
            WordSeparator::AsciiSpace => find_words_ascii_space(line),
            #[cfg(feature = "unicode-linebreak")]
            WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
        }
    }
}
/// Split `line` into words separated by runs of ASCII spaces.
///
/// Every word keeps the spaces that follow it; a new word begins at
/// the first non-space character after a space. Only `' '` is
/// treated as a separator. The words are produced lazily and are
/// slices of `line`.
fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    let mut word_start = 0;
    let mut prev_was_space = false;
    let mut chars = line.char_indices();

    Box::new(std::iter::from_fn(move || {
        // Advance to the next character that begins a new word, i.e.
        // a non-space character that directly follows a space. The
        // "previous character" state is updated for every character
        // inspected, including the one that ends the search.
        let boundary = chars.by_ref().find(|&(_, ch)| {
            let starts_word = prev_was_space && ch != ' ';
            prev_was_space = ch == ' ';
            starts_word
        });

        let word_end = match boundary {
            Some((idx, _)) => idx,
            // Input exhausted: whatever is left forms the final word.
            None if word_start < line.len() => line.len(),
            None => return None,
        };

        let word = Word::from(&line[word_start..word_end]);
        word_start = word_end;
        Some(word)
    }))
}
/// Return a copy of `text` with all ANSI escape sequences removed.
///
/// Detection is delegated to `skip_ansi_escape_sequence`: a character
/// for which it returns `true` is left out of the result.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    // The stripped text can never be longer than the input.
    let mut stripped = String::with_capacity(text.len());
    let mut remaining = text.chars();
    // A `for` loop cannot be used here since the helper needs mutable
    // access to the iterator (presumably to consume the rest of an
    // escape sequence -- confirm against `crate::core`).
    while let Some(ch) = remaining.next() {
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            stripped.push(ch);
        }
    }
    stripped
}
/// Soft hyphen (U+00AD), also known as a “shy hyphen”. Should show up
/// as `-` if a line is broken at this point, and otherwise be
/// invisible. Textwrap does not currently support breaking words at
/// soft hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences in `line` are ignored when looking for
/// break opportunities, but they stay part of the returned words
/// since every word is a slice of the original `line`.
///
/// Break opportunities directly after a `-` or a soft hyphen are
/// suppressed. The break at the very end of the text is never used as
/// a split point; the final word is always the remainder of `line`,
/// which keeps a trailing ANSI escape sequence attached to it.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = last_stripped_idx;
        // Only characters that survive stripping advance the
        // stripped index.
        if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
            last_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    let text_end = stripped.len();

    // The opportunities are collected eagerly: the break iterator
    // borrows `stripped`, which is local to this function and so
    // cannot be captured by the iterator we return.
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|&(idx, _)| {
            // Leave out the break at the end of the text, the final
            // word is added below using &line[start..]. This ensures
            // that we correctly include a trailing ANSI escape
            // sequence.
            //
            // This has to be decided by position and not by dropping
            // whatever entry happens to be last after filtering: when
            // the text ends in a hyphen, the end-of-text break is
            // already suppressed by the hyphen rule below and the
            // last remaining entry would be a genuine break.
            if idx >= text_end {
                return false;
            }

            let prev = stripped[..idx].chars().next_back();
            // We suppress breaks at ‘-’ since we want to control this
            // via the WordSplitter. Soft hyphens are currently not
            // supported since we require all `Word` fragments to be
            // continuous in the input string. Other breaks should be
            // fine!
            prev != Some('-') && prev != Some(SHY)
        })
        .collect::<Vec<_>>()
        .into_iter();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for (idx, _) in opportunities.by_ref() {
            // `idx_map` is only ever advanced, which works because
            // the break indices are visited in increasing order. The
            // first original index mapping to `idx` is used, so an
            // escape sequence located at a break starts the next
            // word.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // No break left: emit the remainder of the line exactly once.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}
// Unit tests for the word separators. Most cases are generated in
// pairs by `test_find_words!` so that `AsciiSpace` and
// `UnicodeBreakProperties` are checked against the same input lines.
#[cfg(test)]
mod tests {
use super::WordSeparator::*;
use super::*;
// Like assert_eq!, but the left expression is an iterator.
macro_rules! assert_iter_eq {
($left:expr, $right:expr) => {
assert_eq!($left.collect::<Vec<_>>(), $right);
};
}
// Turn a list of string slices into the `Word`s the assertions
// compare against.
fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> {
words.into_iter().map(|w: &str| Word::from(&w)).collect()
}
// Generate two tests from a list of `[line, ascii_words,
// unicode_words]` cases:
//
// * `$ascii_name` runs `WordSeparator::AsciiSpace` on every `line`
//   and expects `ascii_words`.
// * `$unicode_name` runs `WordSeparator::UnicodeBreakProperties` on
//   every `line` and expects `unicode_words`. It is only compiled
//   when the `unicode-linebreak` feature is enabled.
//
// The input line is included in the failure message.
macro_rules! test_find_words {
($ascii_name:ident,
$unicode_name:ident,
$([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
#[test]
fn $ascii_name() {
$(
let expected_words = to_words($ascii_words.to_vec());
let actual_words = WordSeparator::AsciiSpace
.find_words($line)
.collect::<Vec<_>>();
assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
)+
}
#[test]
#[cfg(feature = "unicode-linebreak")]
fn $unicode_name() {
$(
let expected_words = to_words($unicode_words.to_vec());
let actual_words = WordSeparator::UnicodeBreakProperties
.find_words($line)
.collect::<Vec<_>>();
assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
)+
}
};
}
// An empty line contains no words at all.
test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);
test_find_words!(
ascii_single_word,
unicode_single_word,
["foo", ["foo"], ["foo"]]
);
// The separating space belongs to the word in front of it.
test_find_words!(
ascii_two_words,
unicode_two_words,
["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
);
test_find_words!(
ascii_multiple_words,
unicode_multiple_words,
["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
);
// A line made up of whitespace only is returned as a single word.
test_find_words!(
ascii_only_whitespace,
unicode_only_whitespace,
[" ", [" "], [" "]],
[" ", [" "], [" "]]
);
test_find_words!(
ascii_inter_word_whitespace,
unicode_inter_word_whitespace,
["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
);
test_find_words!(
ascii_trailing_whitespace,
unicode_trailing_whitespace,
["foo ", ["foo "], ["foo "]]
);
// Leading whitespace forms a word of its own.
test_find_words!(
ascii_leading_whitespace,
unicode_leading_whitespace,
[" foo", [" ", "foo"], [" ", "foo"]]
);
test_find_words!(
ascii_multi_column_char,
unicode_multi_column_char,
["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
);
// Neither separator breaks directly after a hyphen; breaks only
// happen after the spaces around it.
test_find_words!(
ascii_hyphens,
unicode_hyphens,
["foo-bar", ["foo-bar"], ["foo-bar"]],
["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
);
// Newlines and tabs are not separators for `AsciiSpace`, while
// `UnicodeBreakProperties` does break after them.
test_find_words!(
ascii_newline,
unicode_newline,
["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
);
test_find_words!(
ascii_tab,
unicode_tab,
["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
);
// A non-breaking space (U+00A0) keeps the words together for both
// separators.
test_find_words!(
ascii_non_breaking_space,
unicode_non_breaking_space,
["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
);
// Color escape codes must stay attached to the word they surround.
// Only built on Unix since the escape codes come from `termion`.
#[test]
#[cfg(unix)]
fn find_words_colored_text() {
use termion::color::{Blue, Fg, Green, Reset};
let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
assert_iter_eq!(
AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
vec![Word::from(&green_hello), Word::from(&blue_world)]
);
#[cfg(feature = "unicode-linebreak")]
assert_iter_eq!(
UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
vec![Word::from(&green_hello), Word::from(&blue_world)]
);
}
// Escape sequences in the middle of a word must not introduce a
// break: the whole text is expected back as a single word.
#[test]
fn find_words_color_inside_word() {
let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
assert_iter_eq!(AsciiSpace.find_words(&text), vec![Word::from(text)]);
#[cfg(feature = "unicode-linebreak")]
assert_iter_eq!(
UnicodeBreakProperties.find_words(&text),
vec![Word::from(text)]
);
}
}