Initial vendor packages

Signed-off-by: Valentin Popov <valentin@popov.link>
This commit is contained in:
2024-01-08 01:21:28 +04:00
parent 5ecd8cf2cb
commit 1b6a04ca55
7309 changed files with 2160054 additions and 0 deletions

View File

@ -0,0 +1 @@
{"files":{"AUTHORS.md":"f2cf336738ad935a482a799be004083ddd07c904513caf80f9e48011888fe1b6","Cargo.toml":"6fd14a963bfb44b78883bb57c082300138668bc163c949286453836116c7018d","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"da23be69ad3ccf7a5823d62152efcd2b15de9482a5014fcb1b02844662d86abd","README.md":"c33df8cbe2645cd55d3644d4af453c0bd3cf3ffaa4a0c15ac6f3162fade966d6","RELEASES.md":"0a10f449adcf53ab00a43bb0242e38b508a51b529b3afc02eb645dbb0216b3bc","benches/multiiterators.rs":"69c878d010856a24247085356ed3045c6ceb1ac88cd75ea7a00b11206090debe","src/decoding_iterators.rs":"72c9fe0d10240e021dfc46546e814f04ab5c7e142a04305fad8c8da48575fe92","src/errors.rs":"0355e926edd1c8e81b537aca1a80fc324912a8c21e84db278c232860e3476822","src/lib.rs":"972010cd7f1b24dd048d066f1a3ff57fc16d4486a7e3583f7ae995dbd1ada5c8","src/traits.rs":"7ec1b649f23410e55bbfe6df13713040bcd292ee90a81d31437291ad100ea99f","src/utf16_char.rs":"c014de07ebc08592b3527e62d66699c8d637c8c9491835341ccdc71e28f346a1","src/utf16_iterators.rs":"9344132fb95077f05b6da8d9da77eb38ddfc8134e543313d74f286e7e545e875","src/utf8_char.rs":"349a3ebafa8ae2c88efa334958ea4d1863ca559d1b17aeb2c53113cd9afc7d15","src/utf8_iterators.rs":"e3d3bbb23253a582c48985b4eb26a4febfb40fc79a12e5fc1ca28c4cdef9fe81","tests/errs.rs":"7244966b93fc98c19a9ca163870863c1fba6107d526f8c2baa065696eb4cf9b4","tests/exhaustive.rs":"25c71761e57ac45125c2d527ddc5fc5e7b89b9b1055df3db50bce81af9844250","tests/iterators.rs":"1bda1ea031950134eef6b21e8473f4c7a2d338822cd40bb47267db4fe608d586","tests/oks.rs":"9c3e571488bc66696f7cd518f89e25bbb0ad382cbf2834f7461043237c42f6d9"},"package":"a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"}

4
vendor/encode_unicode/AUTHORS.md vendored Normal file
View File

@ -0,0 +1,4 @@
# The encode_unicode Developers
* Torbjørn Birch Moltu
* Aljoscha Meyer

39
vendor/encode_unicode/Cargo.toml vendored Normal file
View File

@ -0,0 +1,39 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
name = "encode_unicode"
version = "0.3.6"
authors = ["Torbjørn Birch Moltu <t.b.moltu@lyse.net>"]
description = "UTF-8 and UTF-16 character types, iterators and related methods for char, u8 and u16.\n"
documentation = "https://docs.rs/encode_unicode/"
readme = "README.md"
keywords = ["unicode", "UTF-8", "UTF-16"]
categories = ["encoding", "no-std"]
license = "MIT/Apache-2.0"
repository = "https://github.com/tormol/encode_unicode"
[package.metadata.docs.rs]
features = ["ascii/std"]
[dependencies.ascii]
version = ">=0.8, <2"
optional = true
default-features = false
[dependencies.clippy]
version = "0.*"
optional = true
[features]
default = ["std"]
std = []
[target."cfg(unix)".dev-dependencies.lazy_static]
version = "1.0.*"

202
vendor/encode_unicode/LICENSE-APACHE vendored Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

17
vendor/encode_unicode/LICENSE-MIT vendored Normal file
View File

@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE

59
vendor/encode_unicode/README.md vendored Normal file
View File

@ -0,0 +1,59 @@
# encode_unicode
UTF-8 and UTF-16 character types, iterators and related methods for `char`, `u8` and `u16`.
[![crates.io page](https://img.shields.io/crates/v/encode_unicode.svg)](https://crates.io/crates/encode_unicode/)
## Features
* **[`Utf8Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf8Char.html)**:
A `char` stored as UTF-8. Can be borrowed as a `str` or `u8` slice.
* **[`Utf16Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf16Char.html)**:
A `char` stored as UTF-16. Can be borrowed as an `u16` slice.
* [Conversion methods on `char`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.CharExt.html):
* to and from UTF-8 as `[u8; 4]` or slice.
* to and from UTF-16 as `(u16, Option<u16>)` or slice.
* [Iterator adapters](https://docs.rs/encode_unicode/latest/encode_unicode/trait.IterExt.html)
for converting between `u8`s and `Utf8Char`s or `u16`s and `Utf16Char`s.
* Optimized [slice-based decoding iterators](https://docs.rs/encode_unicode/latest/encode_unicode/trait.SliceExt.html).
* [Precise errors when decoding a char from UTF-8, UTF-16 or `u32` fails](http://docs.rs/encode_unicode/latest/encode_unicode/error/index.html).
* Utility methods on [`u8`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U8UtfExt.html)
and [`u16`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U16UtfExt.html).
The minimum supported version of Rust is 1.15,
older versions might work now but can break with a minor update.
## Optional features
* `#![no_std]`-mode: There are a few differences:
* `Error` doesn't exist, but `description()` is made available as an inherent impl.
* `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing.
* There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` don't implement `Read`.
This feature is enabled by setting `default-features=false` in `Cargo.toml`:
`encode_unicode = {version="0.3.4", default-features=false}`.
* Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate:
Convert `Utf8Char` and `Utf16Char` to and from [ascii::`AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html).
## License
Licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.
### Contribution
Unless you explicitly state otherwise, any contribution intentionally
submitted for inclusion in the work by you, as defined in the Apache-2.0
license, shall be dual licensed as above, without any additional terms or
conditions.
## History
The original purpose of this crate was to provide standins for the then
unstable `encode_utf8()` and `encode_utf16()`.
The standins were removed in 0.3 when Rust 1.15 stabilized the `encode_`
methods, but the other stuff I added, such as iterators like
those `encode_utf{8,16}()` returned for a while, might still be of use.

74
vendor/encode_unicode/RELEASES.md vendored Normal file
View File

@ -0,0 +1,74 @@
Version 0.3.6 (2019-08-23)
==========================
* Fix pointless undefined behavior in `Utf16Char.to_ascii_char()` (which is part of ascii feature)
* Widen ascii version requirement to include 1.*
* Add `[u16; 2]` UTF-16 array alternatives to `(u16, Option<u16>)` UTF-16 tuple methods
* Add `Utf16Char.is_bmp()`
Version 0.3.5 (2018-10-23)
==========================
* Fix docs.rs build failure
Version 0.3.4 (2018-10-23)
==========================
* Fix UB in UTF-8 validation which led to invalid codepoints being accepted in release mode
* Add fallible decoding iterator adapters `Utf8CharMerger` and `Utf16CharMerger`
and slice-based iterators `Utf8CharDecoder` and `Utf16CharDecoder`
* Widen ascii version requirement from 0.8.* to 0.8.0 - 0.10.*
* Implement creating / extending `String`s from `Utf16Char`-producing iterators
Version 0.3.3 (2018-10-16)
==========================
* Fix UTF-8 overlong check. (`from_array()` and `from_slice()` accepted two-byte encodings of ASCII characters >= '@', which includes all letters)
* Implement `FromStr` for `Utf16Char`
* Add `from_str_start()` to `Utf8Char` and `Utf16Char`
* Add `Utf{8,16}Char{s,Indices}`: `str`-based iterators for `Utf8Char` and `Utf16Char` equivalent to `char`'s `Chars` and `CharIndices`.
* Add `StrExt` with functions to create the above iterators.
* Implement `FromIterator` and `Extend` for `Vec<{u8,u16}>` with reference-producing `Utf{8,16}Char` iterators too.
* Add `Utf8CharSplitter` and `Utf16CharSplitter`: `Utf{8,16}Char`-to-`u{8,16}` iterator adapters.
* Add `IterExt`, `iter_bytes()` and `iter_units()` to create the above splitting iterators.
* Add `Utf8Char::from_ascii()`, `Utf16Char::from_bmp()` with `_unchecked` versions of both.
* Add cross-type `PartialEq` and `PartialOrd` implementations.
* Change the `description()` for a few error types.
Version 0.3.2 (2018-08-08)
==========================
* Hide `AsciiExt` deprecation warning and add replacement methods.
* Correct documentation for `U8UtfExt::extra_utf8_bytes()`.
* Fix misspellings in some error descriptions.
* Avoid potentially bad transmutes.
Version 0.3.1 (2017-06-16)
==========================
* Implement `Display` for `Utf8Char` and `Utf16Char`.
Version 0.3.0 (2017-03-29)
==========================
* Replace the "no_std" feature with opt-out "std".
* Upgrade ascii to v0.8.
* Make tests compile on stable.
* Remove `CharExt::write_utf{8,16}()` because `encode_utf{8,16}()` has been stabilized.
* Return a proper error from `U16UtfExt::utf16_needs_extra_unit()` instead of `None`.
* Rename `U16UtfExt::utf_is_leading_surrogate()` to `is_utf16_leading_surrogate()`.
* Rename `Utf16Char::from_slice()` to `from_slice_start()` and `CharExt::from_utf{8,16}_slice()`
to `from_utf{8,16}_slice_start()` to be consistent with `Utf8Char`.
* Fix a bug where `CharExt::from_slice()` would accept some trailing surrogates
as standalone codepoints.
Version 0.2.0 (2016-07-24)
==========================
* Change `CharExt::write_utf{8,16}()` to panic instead of returning `None`
if the slice is too short.
* Fix bug where `CharExt::write_utf8()` and `Utf8Char::to_slice()` could change bytes it shouldn't.
* Rename lots of errors with search and replace:
* CodePoint -> Codepoint
* Several -> Multiple
* Update the ascii feature to use [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) v0.7.
* Support `#[no_std]`; see 70e090ee for differences.
* Ungate impls of `AsciiExt`. (doesn't require ascii or nightly)
* Make the tests compile (and pass) again.
(They still require nightly).
Version 0.1.* (2016-04-07)
==========================
First release.

View File

@ -0,0 +1,93 @@
// uses /usr/share/dict/ for text to convert to Vec<Utf*Char> and iterate over
#![cfg(all(unix, feature="std"))]
#![feature(test)]
extern crate test;
use test::{Bencher, black_box};
#[macro_use] extern crate lazy_static;
extern crate encode_unicode;
use encode_unicode::{CharExt, Utf8Char, Utf16Char, iter_bytes, iter_units};
// Benchmark input: the system word list, embedded into the binary at compile
// time. Building these benchmarks therefore requires this file to exist.
static ENGLISH: &str = include_str!("/usr/share/dict/american-english");
// TODO find a big chinese file; `aptitude search '?provides(wordlist)'` didn't have one
// Lazily built fixtures shared by all benchmarks: every `char` of `ENGLISH`
// converted once to its `Utf8Char` / `Utf16Char` representation.
lazy_static!{
static ref UTF8CHARS: Vec<Utf8Char> = ENGLISH.chars().map(|c| c.to_utf8() ).collect();
static ref UTF16CHARS: Vec<Utf16Char> = ENGLISH.chars().map(|c| c.to_utf16() ).collect();
}
// Splits every `Utf16Char` of `UTF16CHARS` into `u16` units using the crate's
// `iter_units()` adapter and asserts that no produced unit is zero.
// NOTE(review): "all_single" presumably means every char fits in one UTF-16
// unit for this word list -- confirm.
#[bench]
fn utf16_split_all_single_mulititerator(b: &mut Bencher) {
b.iter(|| {
iter_units(black_box(&*UTF16CHARS)).for_each(|u| assert!(u != 0) );
});
}
// Same work as the `iter_units()` benchmark, but expressed with the standard
// `flat_map()` adapter over copied `Utf16Char`s, for comparison.
#[bench]
fn utf16_split_all_single_flatmap(b: &mut Bencher) {
b.iter(|| {
black_box(&*UTF16CHARS).iter().flat_map(|&u16c| u16c ).for_each(|u| assert!(u != 0) );
});
}
// Same work again, expressed as `cloned().flatten()` over `UTF16CHARS`,
// asserting that no produced unit is zero.
#[bench]
fn utf16_split_all_single_cloned_flatten(b: &mut Bencher) {
b.iter(|| {
black_box(&*UTF16CHARS).iter().cloned().flatten().for_each(|u| assert!(u != 0) );
});
}
// Splits every `Utf8Char` of `UTF8CHARS` into bytes using the crate's
// `iter_bytes()` adapter and asserts that no produced byte is zero.
#[bench]
fn utf8_split_mostly_ascii_multiiterator(b: &mut Bencher) {
b.iter(|| {
iter_bytes(black_box(&*UTF8CHARS)).for_each(|b| assert!(b != 0) );
});
}
// Byte-splitting of `UTF8CHARS` expressed with the standard `flat_map()`
// adapter over copied `Utf8Char`s, for comparison with `iter_bytes()`.
#[bench]
fn utf8_split_mostly_ascii_flatmap(b: &mut Bencher) {
b.iter(|| {
black_box(&*UTF8CHARS).iter().flat_map(|&u8c| u8c ).for_each(|b| assert!(b != 0) );
});
}
// Byte-splitting of `UTF8CHARS` expressed as `cloned().flatten()`,
// asserting that no produced byte is zero.
#[bench]
fn utf8_split_mostly_ascii_cloned_flatten(b: &mut Bencher) {
b.iter(|| {
black_box(&*UTF8CHARS).iter().cloned().flatten().for_each(|b| assert!(b != 0) );
});
}
// Collects the bytes produced by `iter_bytes()` over `UTF8CHARS` into a
// `Vec<u8>` and checks that it has as many bytes as the source text.
#[bench]
fn utf8_extend_mostly_ascii_multiiterator(b: &mut Bencher) {
b.iter(|| {
let vec: Vec<u8> = iter_bytes(black_box(&*UTF8CHARS)).collect();
assert_eq!(black_box(vec).len(), ENGLISH.len());
});
}
// Collects an iterator of `&Utf8Char` directly into a `Vec<u8>` (no explicit
// byte-splitting adapter) and checks the length against the source text.
#[bench]
fn utf8_extend_mostly_ascii_custom(b: &mut Bencher) {
b.iter(|| {
let vec: Vec<u8> = black_box(&*UTF8CHARS).iter().collect();
assert_eq!(black_box(vec).len(), ENGLISH.len());
});
}
// Collects cloned `Utf8Char`s directly into a `String` and checks that its
// byte length equals that of the source text.
#[bench]
fn utf8_extend_mostly_ascii_custom_str(b: &mut Bencher) {
b.iter(|| {
let vec: String = black_box(&*UTF8CHARS).iter().cloned().collect();
assert_eq!(black_box(vec).len(), ENGLISH.len());
});
}
// Collects the units produced by `iter_units()` over `UTF16CHARS` into a
// `Vec<u16>`. The assertion requires fewer units than `ENGLISH` has bytes.
#[bench]
fn utf16_extend_all_single_multiiterator(b: &mut Bencher) {
b.iter(|| {
let vec: Vec<u16> = iter_units(black_box(&*UTF16CHARS)).collect();
assert!(black_box(vec).len() < ENGLISH.len());
});
}
// Collects an iterator of `&Utf16Char` directly into a `Vec<u16>` (no explicit
// unit-splitting adapter). The assertion requires fewer units than `ENGLISH`
// has bytes.
#[bench]
fn utf16_extend_all_single_custom(b: &mut Bencher) {
b.iter(|| {
let vec: Vec<u16> = black_box(&*UTF16CHARS).iter().collect();
assert!(black_box(vec).len() < ENGLISH.len());
});
}

View File

@ -0,0 +1,494 @@
/* Copyright 2018 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail.
//!
//! To be predictable, all errors consume one element each.
//!
//! The iterator adaptors produce neither offset nor element length to work
//! well with other adaptors,
//! while the slice iterators yield both to make more advanced use cases easy.
use errors::{InvalidUtf8Slice, InvalidUtf16FirstUnit, Utf16PairError};
use errors::InvalidUtf8Slice::*;
use errors::InvalidUtf8::*;
use errors::InvalidUtf8FirstByte::*;
use errors::InvalidUtf16Slice::*;
use errors::InvalidCodepoint::*;
use errors::Utf16PairError::*;
use utf8_char::Utf8Char;
use utf16_char::Utf16Char;
use traits::U16UtfExt;
extern crate core;
use self::core::borrow::Borrow;
use self::core::fmt::{self, Debug};
use self::core::iter::Chain;
use self::core::option;
/// Decodes UTF-8 characters from a byte iterator into `Utf8Char`s.
///
/// Each call to `next()` yields either a decoded `Utf8Char` or an
/// `InvalidUtf8Slice` error. Bytes that were read while detecting an error
/// are buffered and examined again by the following calls.
///
/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharMerger<B:Borrow<u8>, I:Iterator<Item=B>> {
/// the wrapped byte source
iter: I,
/// number of bytes currently held in `after_err_stack`: bytes that were read
/// from `iter` before an error was detected and still have to be re-examined
after_err_leftover: u8,
/// stack because it simplifies popping.
/// (bytes are pushed in reverse, so the next byte to examine is on top)
after_err_stack: [u8; 3],
}
/// Wraps anything that can be turned into a byte iterator.
///
/// The merger starts out with an empty after-error buffer.
impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
From<T> for Utf8CharMerger<B, I> {
    fn from(t: T) -> Self {
        let source = t.into_iter();
        Utf8CharMerger {
            after_err_stack: [0; 3],
            after_err_leftover: 0,
            iter: source,
        }
    }
}
impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> {
/// Extract the inner iterator.
///
/// If the last item produced by `.next()` was an `Err`,
/// up to three following bytes might be missing.
/// The exact number of missing bytes for each error type should not be relied on.
///
/// # Examples
///
/// Three bytes swallowed:
/// ```
/// # use encode_unicode::IterExt;
/// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
/// assert!(merger.next().unwrap().is_err());
/// let mut inner: std::slice::Iter<u8> = merger.into_inner();
/// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
/// ```
///
/// All bytes present:
/// ```
/// # use encode_unicode::IterExt;
/// let mut merger = b"\xb0FS".iter().to_utf8chars();
/// assert!(merger.next().unwrap().is_err());
/// assert_eq!(merger.into_inner().next(), Some(&b'F'));
/// ```
///
/// Two bytes missing:
/// ```
/// # use encode_unicode::IterExt;
/// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
/// assert!(merger.next().unwrap().is_err());
/// assert_eq!(merger.into_inner().next(), Some(&b'F'));
/// ```
pub fn into_inner(self) -> I {
self.iter
}
fn save(&mut self, bytes: &[u8;4], len: usize) {
// forget bytes[0] and push the others onto self.after_err_stack (in reverse).
for &after_err in bytes[1..len].iter().rev() {
self.after_err_stack[self.after_err_leftover as usize] = after_err;
self.after_err_leftover += 1;
}
}
/// Reads len-1 bytes into bytes[1..]
fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),InvalidUtf8Slice> {
// This is the only function that pushes onto after_err_stack,
// and it checks that all bytes are continuation bytes before fetching the next one.
// Therefore only the last byte retrieved can be a non-continuation byte.
// That last byte is also the last to be retrieved from after_err.
//
// Before this function is called, there has been retrieved at least one byte.
// If that byte was a continuation byte, next() produces an error
// and won't call this function.
// Therefore, we know that after_err is empty at this point.
// This means that we can use self.iter directly, and knows where to start pushing
debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
for i in 1..len {
if let Some(extra) = self.iter.next() {
let extra = *extra.borrow();
bytes[i] = extra;
if extra & 0b1100_0000 != 0b1000_0000 {
// not a continuation byte
self.save(bytes, i+1);
return Err(InvalidUtf8Slice::Utf8(NotAContinuationByte(i)))
}
} else {
self.save(bytes, i);
return Err(TooShort(len));
}
}
Ok(())
}
}
impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> {
    type Item = Result<Utf8Char,InvalidUtf8Slice>;

    /// Decodes the next character, or reports why the next byte cannot start one.
    ///
    /// Returns `None` once both the after-error buffer and the inner iterator
    /// are exhausted. When an error is returned, only the first byte of the
    /// attempted sequence is consumed; the bytes read after it are buffered
    /// and examined again by later calls.
    fn next(&mut self) -> Option<Self::Item> {
        // Bytes buffered by an earlier error take priority over the inner iterator.
        let first: u8 = if self.after_err_leftover != 0 {
            self.after_err_leftover -= 1;
            self.after_err_stack[self.after_err_leftover as usize]
        } else {
            match self.iter.next() {
                Some(item) => *item.borrow(),
                None => return None,
            }
        };
        let mut bytes = [first, 0, 0, 0];
        // Classify by first byte (arms listed in ascending byte order).
        let validated = match first {
            // single byte (ASCII)
            0b0000_0000...0b0111_1111 => Ok(()),
            // continuation byte where a first byte was expected
            0b1000_0000...0b1011_1111 => Err(Utf8(FirstByte(ContinuationByte))),
            // two bytes, but always overlong
            0b1100_0000...0b1100_0001 => Err(Utf8(OverLong)),
            // two bytes, never overlong: only the continuation byte needs checking
            0b1100_0010...0b1101_1111 => self.extra(&mut bytes, 2),
            // three bytes
            0b1110_0000...0b1110_1111 => match self.extra(&mut bytes, 3) {
                Err(e) => Err(e),
                Ok(()) if bytes[0] == 0b1110_0000 && bytes[1] <= 0b10_011111 => {
                    self.save(&bytes, 3);
                    Err(Utf8(OverLong))
                },
                Ok(()) if bytes[0] == 0b1110_1101 && bytes[1] & 0b11_100000 == 0b10_100000 => {
                    self.save(&bytes, 3);
                    Err(Codepoint(Utf16Reserved))
                },
                Ok(()) => Ok(()),
            },
            // four bytes
            0b1111_0000...0b1111_0100 => match self.extra(&mut bytes, 4) {
                Err(e) => Err(e),
                Ok(()) if bytes[0] == 0b11110_000 && bytes[1] <= 0b10_001111 => {
                    self.save(&bytes, 4);
                    Err(InvalidUtf8Slice::Utf8(OverLong))
                },
                Ok(()) if bytes[0] == 0b11110_100 && bytes[1] > 0b10_001111 => {
                    self.save(&bytes, 4);
                    Err(InvalidUtf8Slice::Codepoint(TooHigh))
                },
                Ok(()) => Ok(()),
            },
            // four bytes, but the codepoint is always too high
            0b1111_0101...0b1111_0111 => Err(Codepoint(TooHigh)),
            // would start a sequence of more than four bytes
            0b1111_1000...0b1111_1111 => Err(Utf8(FirstByte(TooLongSeqence))),
            _ => unreachable!("all possible byte values should be covered")
        };
        match validated {
            // SAFETY: the first byte and every continuation byte of the sequence
            // were validated above, overlong / reserved / too-high encodings
            // were rejected, and the unused trailing bytes are still zero.
            Ok(()) => Some(Ok(unsafe { Utf8Char::from_array_unchecked(bytes) })),
            Err(e) => Some(Err(e)),
        }
    }

    /// Lower bound: a quarter of the inner lower bound (rounded down).
    /// Upper bound: the inner upper bound plus the number of buffered bytes,
    /// or `None` if that sum overflows or the inner iterator has no upper bound.
    fn size_hint(&self) -> (usize,Option<usize>) {
        let (inner_lower, inner_upper) = self.iter.size_hint();
        let buffered = self.after_err_leftover as usize;
        // An exact answer is impossible, so keep it simple: no rounding up and
        // no accounting for buffered bytes in the lower bound.
        let lower = inner_lower / 4;
        // checked_add() covers the corner case of an upper bound close to
        // usize::MAX; wrapping would not break any contract (the trait is not
        // unsafe), but None is the more honest answer.
        let upper = match inner_upper {
            Some(n) => n.checked_add(buffered),
            None => None,
        };
        (lower, upper)
    }
}
/// Shows the buffered after-error bytes in the order they will be examined,
/// followed by the inner iterator.
impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let buffered = self.after_err_leftover as usize;
        // The stack is popped from the top, so reverse it to get reading order.
        let mut in_order = [0u8; 3];
        let newest_first = self.after_err_stack[..buffered].iter().rev();
        for (dst, &src) in in_order.iter_mut().zip(newest_first) {
            *dst = src;
        }
        let mut out = fmtr.debug_struct("Utf8CharMerger");
        out.field("buffered", &&in_order[..buffered]);
        out.field("inner", &self.iter);
        out.finish()
    }
}
/// An [`Utf8CharMerger`](struct.Utf8CharMerger.html) that also produces
/// offsets and lengths, but can only iterate over slices.
///
/// Items are `(offset, result, length)` tuples; an error always has length 1.
///
/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharDecoder<'a> {
// The source bytes; iterating from the back shortens this slice at its end.
slice: &'a[u8],
// Offset into `slice` of the next byte to decode from the front.
index: usize,
}
/// Creates a decoder positioned at the start of the byte slice.
impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> {
    fn from(bytes: &'a[u8]) -> Self {
        Utf8CharDecoder {
            index: 0,
            slice: bytes,
        }
    }
}
impl<'a> Utf8CharDecoder<'a> {
    /// Returns the part of the source slice that has not been decoded yet.
    ///
    /// # Examples
    ///
    /// In contrast to `Utf8CharMerger::into_inner()`, the bytes that directly
    /// follow an error are never lost:
    /// ```
    /// # use encode_unicode::SliceExt;
    /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices();
    /// assert!(iter.next().unwrap().1.is_err());
    /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS");
    /// ```
    pub fn as_slice(&self) -> &'a[u8] {
        let (_consumed, remaining) = self.slice.split_at(self.index);
        remaining
    }
}
impl<'a> Iterator for Utf8CharDecoder<'a> {
    /// `(offset of the first byte, decoded character or error, bytes consumed)`
    type Item = (usize, Result<Utf8Char,InvalidUtf8Slice>, usize);
    /// Decodes the character at the current offset.
    ///
    /// An error consumes exactly one byte, so every invalid byte gets
    /// reported on its own.
    fn next(&mut self) -> Option<Self::Item> {
        let offset = self.index;
        let (decoded, consumed) = match Utf8Char::from_slice_start(&self.slice[offset..]) {
            Ok((u8c, len)) => (Ok(u8c), len),
            // this is the error for an empty slice: nothing is left to decode
            Err(TooShort(1)) => return None,
            Err(error) => (Err(error), 1),
        };
        self.index += consumed;
        Some((offset, decoded, consumed))
    }
    #[inline]
    fn size_hint(&self) -> (usize,Option<usize>) {
        let remaining = self.slice.len() - self.index;
        // Cannot be exact, so keep it simple and don't round up: the slice
        // is unlikely to consist only of 4-byte codepoints, so a buffer
        // sized from the lower bound will have to grow anyway.
        (remaining/4, Some(remaining))
    }
}
impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> {
    /// Decodes the last character of the remaining bytes.
    ///
    /// Returns `(offset, result, length)` like `next()`. An error always
    /// covers exactly the last remaining byte, and its offset is the
    /// position of that byte in the original slice.
    ///
    /// To get an error for every byte in both directions, overlong and
    /// invalid-codepoint sequences are not reported as such from this end:
    /// their trailing bytes are reported as unexpected continuation bytes.
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.index >= self.slice.len() {
            return None;
        }
        // Offset of the last undecoded byte. It is computed before the slice
        // is shortened so that error offsets are correct and a one-byte
        // slice cannot underflow.
        let last = self.slice.len() - 1;
        // Count trailing continuation bytes, looking only at bytes that
        // next() has not consumed yet.
        let extras = self.slice[self.index..].iter()
            .rev()
            .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
            .count();
        // If every remaining byte is a continuation byte there is no lead
        // byte to start decoding from, so skip straight to the error below.
        if extras <= last - self.index {
            let starts = last - extras;
            match Utf8Char::from_slice_start(&self.slice[starts..]) {
                Ok((u8c,len)) if len == 1+extras => {
                    self.slice = &self.slice[..starts];
                    return Some((starts, Ok(u8c), len));
                },
                // The last byte is not a continuation byte, so the error
                // is about that byte itself.
                Err(e) if extras == 0 => {
                    self.slice = &self.slice[..last];
                    return Some((last, Err(e), 1));
                },
                // Either the sequence is invalid, or it is valid but does
                // not extend all the way to the last byte.
                _ => {},
            }
        }
        self.slice = &self.slice[..last];
        Some((last, Err(Utf8(FirstByte(ContinuationByte))), 1))
    }
}
impl<'a> Debug for Utf8CharDecoder<'a> {
    /// Prints the current offset followed by the bytes left to decode.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let offset = self.index;
        let remaining = self.as_slice();
        write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", offset, remaining)
    }
}
/// Decodes UTF-16 characters from a `u16` iterator into `Utf16Char`s.
///
/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharMerger<B:Borrow<u16>, I:Iterator<Item=B>> {
// The source of units.
iter: I,
/// Used when a trailing surrogate was expected, the u16 can be any value.
///
/// Holds the unit that followed an unmatched leading surrogate, so that
/// the next call to `next()` starts with it instead of dropping it.
prev: Option<B>,
}
impl<B:Borrow<u16>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
From<T> for Utf16CharMerger<B,I> {
    /// Wraps the iterator produced by `t`, with no unit buffered.
    fn from(t: T) -> Self {
        let units = t.into_iter();
        Utf16CharMerger { prev: None, iter: units }
    }
}
impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> {
/// Extract the inner iterator.
///
/// If the last item produced was an `Err`, the first unit might be missing.
///
/// # Examples
///
/// Unit right after an error missing
/// ```
/// # use encode_unicode::IterExt;
/// # use encode_unicode::error::Utf16PairError;
/// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars();
/// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
/// let mut inner: std::slice::Iter<u16> = merger.into_inner();
/// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger
/// ```
///
/// Error that doesn't swallow any units
/// ```
/// # use encode_unicode::IterExt;
/// # use encode_unicode::error::Utf16PairError;
/// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars();
/// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate)));
/// let mut inner: std::slice::Iter<u16> = merger.into_inner();
/// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed
/// ```
pub fn into_inner(self) -> I {
self.iter
}
/// Returns an iterator over the remaining units.
/// Unlike `into_inner()` this will never drop any units.
///
/// The exact type of the returned iterator should not be depended on.
///
/// # Examples
///
/// ```
/// # use encode_unicode::IterExt;
/// # use encode_unicode::error::Utf16PairError;
/// let slice = [0xd901, 'F' as u16, 'S' as u16];
/// let mut merger = slice.iter().to_utf16chars();
/// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
/// let mut remaining = merger.into_remaining_units();
/// assert_eq!(remaining.next(), Some('F' as u16).as_ref());
/// ```
pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> {
self.prev.into_iter().chain(self.iter)
}
}
impl<B:Borrow<u16>, I:Iterator<Item=B>> Iterator for Utf16CharMerger<B,I> {
    type Item = Result<Utf16Char,Utf16PairError>;
    /// Decodes the next character from one or two units.
    ///
    /// When a leading surrogate is followed by something that is not a
    /// trailing surrogate, that second unit is buffered and becomes the
    /// first unit of the following call.
    fn next(&mut self) -> Option<Self::Item> {
        // A unit buffered by a previous error goes first.
        let first = match self.prev.take() {
            Some(buffered) => buffered,
            None => match self.iter.next() {
                Some(unit) => unit,
                None => return None,
            },
        };
        let lead: u16 = *first.borrow();
        let result = match lead.utf16_needs_extra_unit() {
            Err(InvalidUtf16FirstUnit) => Err(Utf16PairError::UnexpectedTrailingSurrogate),
            // SAFETY: `lead` was just reported to need no second unit.
            Ok(false) => Ok(unsafe { Utf16Char::from_array_unchecked([lead, 0]) }),
            Ok(true) => match self.iter.next() {
                None => Err(Utf16PairError::Incomplete),
                Some(second) => {
                    let trail: u16 = *second.borrow();
                    match trail.utf16_needs_extra_unit() {
                        // Being invalid as a first unit is what identifies
                        // a trailing surrogate.
                        // SAFETY: `lead` needs a second unit and `trail`
                        // is a trailing surrogate.
                        Err(InvalidUtf16FirstUnit) => Ok(unsafe {
                            Utf16Char::from_tuple_unchecked((lead, Some(trail)))
                        }),
                        Ok(_) => {
                            self.prev = Some(second);
                            Err(Utf16PairError::UnmatchedLeadingSurrogate)
                        },
                    }
                },
            },
        };
        Some(result)
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        let (inner_min, inner_max) = self.iter.size_hint();
        // A buffered unit can produce one more item than the inner iterator.
        let upper = match inner_max {
            Some(max) if self.prev.is_some() => max.checked_add(1),
            other => other,
        };
        // Cannot be exact, so keep it simple: no rounding up, and the
        // buffered unit is not counted towards the lower bound.
        (inner_min / 2, upper)
    }
}
impl<B:Borrow<u16>, I:Iterator<Item=B>+Debug> Debug for Utf16CharMerger<B,I> {
fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
fmtr.debug_struct("Utf16CharMerger")
.field("buffered", &self.prev.as_ref().map(|b| *b.borrow() ))
.field("inner", &self.iter)
.finish()
}
}
/// An [`Utf16CharMerger`](struct.Utf16CharMerger.html) that also produces
/// offsets and lengths, but can only iterate over slices.
///
/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharDecoder<'a> {
// The units being decoded.
slice: &'a[u16],
// Offset into `slice` of the next unit to decode.
index: usize,
}
impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> {
    /// Creates a decoder positioned at the start of `s`.
    fn from(s: &'a[u16]) -> Self {
        let start = 0;
        Utf16CharDecoder{ index: start, slice: s }
    }
}
impl<'a> Utf16CharDecoder<'a> {
    /// Extract the remainder of the source slice.
    ///
    /// The returned slice borrows from the original slice and not from the
    /// decoder, so it can outlive the decoder.
    ///
    /// # Examples
    ///
    /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed:
    /// ```
    /// # use encode_unicode::SliceExt;
    /// # use encode_unicode::error::Utf16PairError;
    /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices();
    /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1)));
    /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]);
    /// ```
    pub fn as_slice(&self) -> &'a[u16] {
        &self.slice[self.index..]
    }
}
impl<'a> Iterator for Utf16CharDecoder<'a> {
type Item = (usize,Result<Utf16Char,Utf16PairError>,usize);
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let start = self.index;
match Utf16Char::from_slice_start(self.as_slice()) {
Ok((u16c,len)) => {
self.index += len;
Some((start, Ok(u16c), len))
},
Err(EmptySlice) => None,
Err(FirstLowSurrogate) => {
self.index += 1;
Some((start, Err(UnexpectedTrailingSurrogate), 1))
},
Err(SecondNotLowSurrogate) => {
self.index += 1;
Some((start, Err(UnmatchedLeadingSurrogate), 1))
},
Err(MissingSecond) => {
self.index = self.slice.len();
Some((start, Err(Incomplete), 1))
}
}
}
#[inline]
fn size_hint(&self) -> (usize,Option<usize>) {
let units = self.slice.len() - self.index;
// Cannot be exact, so KISS and don't bother rounding up.
// The slice is unlikely be full of surrogate pairs, so buffers
// allocated with the lower bound will have to be grown anyway.
(units/2, Some(units))
}
}
impl<'a> Debug for Utf16CharDecoder<'a> {
    /// Prints the current offset followed by the units left to decode.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let offset = self.index;
        let remaining = self.as_slice();
        write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", offset, remaining)
    }
}

289
vendor/encode_unicode/src/errors.rs vendored Normal file
View File

@ -0,0 +1,289 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Boilerplatey error types
extern crate core;
use self::core::fmt::{self,Display,Formatter};
#[cfg(feature="std")]
use std::error::Error;
// Gives the error type `$err` a `description()` method and a `Display` impl.
//
// `$desc` is an expression that is called as `($desc)(self)` and must return
// the `&'static str` message. `Display` prints exactly that message.
//
// With the "std" feature `description()` is provided by implementing
// `std::error::Error`; without it, it is an inherent method instead.
macro_rules! description {($err:ty, $desc:expr) => {
#[cfg(not(feature="std"))]
impl $err {
#[allow(missing_docs)]
pub fn description(&self) -> &'static str {
($desc)(self)
}
}
#[cfg(feature="std")]
impl Error for $err {
fn description(&self) -> &'static str {
($desc)(self)
}
}
impl Display for $err {
fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
write!(fmtr, "{}", self.description())
}
}
}}
// Declares a unit struct error type `$err` whose message is always `$desc`.
//
// The invocation must start with exactly two attributes (normally two lines
// of doc comment), which are put on the struct.
macro_rules! single_cause {(#[$doc1:meta] #[$doc2:meta] $err:ident => $desc:expr) => {
// Rust 1.15 doesn't understand $(#[$doc:meta])* $:ident
#[$doc1]
#[$doc2]
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub struct $err;
description!{$err, |_| $desc }
}}
// NOTE: `single_cause!` matches exactly two attributes, so each of these
// types must keep exactly two lines of doc comment.
single_cause!{
/// Cannot tell whether an `u16` needs an extra unit,
/// because it's a trailing surrogate itself.
InvalidUtf16FirstUnit => "is a trailing surrogate"
}
single_cause!{
/// Cannot create an `Utf8Char` or `Utf16Char` from the first codepoint of a str,
/// because there are none.
EmptyStrError => "is empty"
}
single_cause!{
/// Cannot create an `Utf8Char` from a standalone `u8`
/// that is not an ASCII character.
NonAsciiError => "is not an ASCII character"
}
single_cause!{
/// Cannot create an `Utf16Char` from a standalone `u16` that is not a
/// codepoint in the basic multilingual plane, but part of a surrogate pair.
NonBMPError => "is not a codepoint in the basic multilingual plane"
}
// Declares a field-less error enum `$err` with one fixed message per variant.
//
// The invocation starts with exactly one attribute for the type (normally a
// single line of doc comment). Each variant is written as
// `::Variant => "message",` and may be preceded by any number of attributes.
// The trailing comma after the last variant is required.
macro_rules! simple {(#[$tydoc:meta] $err:ident {
$($(#[$vardoc:meta])* ::$variant:ident => $string:expr),+,
} ) => {
#[$tydoc]
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum $err {
$($(#[$vardoc])* $variant),*
}
description!{$err, |e: &$err| match *e {$($err::$variant=>$string),*} }
}}
simple!{/// Reasons why an `u32` is not a valid UTF codepoint.
InvalidCodepoint {
/// It's reserved for UTF-16 surrogate pairs.
::Utf16Reserved => "is reserved for UTF-16 surrogate pairs",
/// It's higher than the highest codepoint (which is 0x10ffff).
::TooHigh => "is higher than the highest codepoint",
}}
use self::InvalidCodepoint::*;
impl InvalidCodepoint {
    /// Get the range of values for which this error would be given.
    ///
    /// The range is returned as `(first, last)`, and both ends are inclusive.
    pub fn error_range(self) -> (u32,u32) {match self {
        Utf16Reserved => (0xd8_00, 0xdf_ff),
        // 0x10ffff is the highest valid codepoint, so the first value
        // that is too high is the one right after it.
        TooHigh => (0x00_11_00_00, 0xff_ff_ff_ff),
    }}
}
simple!{/// Reasons why a `[u16; 2]` doesn't form a valid UTF-16 codepoint.
InvalidUtf16Array {
/// The first unit is a trailing/low surrogate, which is never valid.
::FirstIsTrailingSurrogate => "the first unit is a trailing surrogate, which is never valid",
/// The second unit is needed, but is not a trailing surrogate.
::SecondIsNotTrailingSurrogate => "the second unit is needed but is not a trailing surrogate",
}}
simple!{/// Reasons why one or two `u16`s are not valid UTF-16, in descending order of precedence.
InvalidUtf16Tuple {
/// The first unit is a trailing/low surrogate, which is never valid.
///
/// Note that the value of a low surrogate is actually higher than a high surrogate.
::FirstIsTrailingSurrogate => "the first unit is a trailing / low surrogate, which is never valid",
/// You provided a second unit, but the first one stands on its own.
::SuperfluousSecond => "the second unit is superfluous",
/// The first and only unit requires a second unit.
::MissingSecond => "the first unit requires a second unit",
/// The first unit requires a second unit, but it's not a trailing/low surrogate.
///
/// Note that the value of a low surrogate is actually higher than a high surrogate.
::InvalidSecond => "the required second unit is not a trailing / low surrogate",
}}
simple!{/// Reasons why a slice of `u16`s doesn't start with valid UTF-16.
InvalidUtf16Slice {
/// The slice is empty.
::EmptySlice => "the slice is empty",
/// The first unit is a trailing/low surrogate.
::FirstLowSurrogate => "the first unit is a trailing surrogate",
/// The first and only unit requires a second unit.
::MissingSecond => "the first and only unit requires a second one",
/// The first unit requires a second one, but it's not a trailing/low surrogate.
::SecondNotLowSurrogate => "the required second unit is not a trailing surrogate",
}}
simple!{/// Types of invalid sequences encountered by `Utf16CharMerger` and `Utf16CharDecoder`.
Utf16PairError {
/// A trailing surrogate was not preceded by a leading surrogate.
::UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate",
/// A leading surrogate was followed by a unit that was not a trailing surrogate.
::UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate",
/// A trailing surrogate was expected when the end was reached.
::Incomplete => "a trailing surrogate was expected when the end was reached",
}}
simple!{/// Reasons why `Utf8Char::from_str()` or `Utf16Char::from_str()` failed.
FromStrError {
/// `Utf8Char` or `Utf16Char` cannot store more than a single codepoint.
::MultipleCodepoints => "has more than one codepoint",
/// `Utf8Char` or `Utf16Char` cannot be empty.
::Empty => "is empty",
}}
simple!{/// Reasons why a byte is not the start of a UTF-8 codepoint.
InvalidUtf8FirstByte {
/// Sequences cannot be longer than 4 bytes. Is given for values >= 248.
::TooLongSeqence => "is greater than 247 (UTF-8 sequences cannot be longer than four bytes)",
/// This byte belongs to a previous sequence. Is given for values from 128 to 191 (inclusive).
::ContinuationByte => "is a continuation of a previous sequence",
}}
use self::InvalidUtf8FirstByte::*;
// Implements `From`, `description()`, `cause()` and `Display` for an error
// enum `$err` that wraps other error types.
//
// The invocation consists of, in order:
// * `{ SubError => constructor, }`: a `From<SubError>` impl is generated
//   for each line, calling `constructor(error)`.
// * `{ pattern => "message", }`: the match arms of `description()`.
// * `=> $use_cause =>`: whether `Display` should print the cause after the
//   description, separated by ": ".
// * `{ pattern => result, }`: the match arms of `Error::cause()`.
// * optional attributes (doc comments) that are put on `Error::cause()`.
macro_rules! complex {
($err:ty
{$($sub:ty => $to:expr,)*}
{$($desc:pat => $string:expr),+,}
=> $use_cause:expr =>
{$($cause:pat => $result:expr),+,} $(#[$causedoc:meta])*
) => {
$(impl From<$sub> for $err {
fn from(error: $sub) -> $err {
$to(error)
}
})*
// Without std there is no Error trait, so description() becomes an
// inherent method and cause() a stub that never returns a cause.
#[cfg(not(feature="std"))]
impl $err {
#[allow(missing_docs)]
pub fn description(&self) -> &'static str {
match *self{ $($desc => $string,)* }
}
/// A hack to avoid two Display impls
fn cause(&self) -> Option<&Display> {None}
}
#[cfg(feature="std")]
impl Error for $err {
fn description(&self) -> &'static str {
match *self{ $($desc => $string,)* }
}
$(#[$causedoc])*
fn cause(&self) -> Option<&Error> {
match *self{ $($cause => $result,)* }
}
}
impl Display for $err {
fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
// The cause is only printed when there is one and the invocation asked for it.
match (self.cause(), $use_cause) {
(Some(d),true) => write!(fmtr, "{}: {}", self.description(), d),
_ => write!(fmtr, "{}", self.description()),
}
}
}
}}
/// Reasons why a byte sequence is not valid UTF-8, excluding invalid codepoint.
/// In descending order of precedence.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum InvalidUtf8 {
/// Something is wrong with the first byte.
FirstByte(InvalidUtf8FirstByte),
/// The byte at index 1 to 3 should be a continuation byte,
/// but doesn't fit the pattern 0b10xx_xxxx.
NotAContinuationByte(usize),
/// There are too many leading zeros: it could be a byte shorter.
///
/// [Decoding this could allow someone to input otherwise prohibited
/// characters and sequences, such as "../"](https://tools.ietf.org/html/rfc3629#section-10).
OverLong,
}
use self::InvalidUtf8::*;
// `From`, `description()`, `cause()` and `Display` for `InvalidUtf8`.
// `Display` prints only the description (`false`), since the descriptions
// for `FirstByte` already say what is wrong with the byte.
complex!{InvalidUtf8 {
        InvalidUtf8FirstByte => FirstByte,
    } {
        // `TooLongSeqence` is only produced for first bytes of 0b1111_1000
        // and up, which matches the message of `InvalidUtf8FirstByte` itself.
        FirstByte(TooLongSeqence) => "the first byte is greater than 247 (UTF-8 sequences cannot be longer than four bytes)",
        FirstByte(ContinuationByte) => "the first byte is a continuation of a previous sequence",
        OverLong => "the sequence contains too many zeros and could be shorter",
        NotAContinuationByte(_) => "the sequence is too short",
    } => false => {
        FirstByte(ref cause) => Some(cause),
        _ => None,
    }/// Returns `Some` if the error is a `InvalidUtf8FirstByte`.
}
/// Reasons why a byte array is not valid UTF-8, in descending order of precedence.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum InvalidUtf8Array {
/// Not a valid UTF-8 sequence.
Utf8(InvalidUtf8),
/// Not a valid unicode codepoint.
Codepoint(InvalidCodepoint),
}
// `From`, `description()`, `cause()` and `Display` for `InvalidUtf8Array`.
// `Display` prints the description followed by the cause (`true`).
complex!{InvalidUtf8Array {
InvalidUtf8 => InvalidUtf8Array::Utf8,
InvalidCodepoint => InvalidUtf8Array::Codepoint,
} {
InvalidUtf8Array::Utf8(_) => "the sequence is invalid UTF-8",
InvalidUtf8Array::Codepoint(_) => "the encoded codepoint is invalid",
} => true => {
InvalidUtf8Array::Utf8(ref u) => Some(u),
InvalidUtf8Array::Codepoint(ref c) => Some(c),
}/// Always returns `Some`.
}
/// Reasons why a byte slice is not valid UTF-8, in descending order of precedence.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum InvalidUtf8Slice {
/// Not a valid UTF-8 sequence.
Utf8(InvalidUtf8),
/// The encoded codepoint is invalid.
Codepoint(InvalidCodepoint),
/// The slice is too short; n bytes were required.
TooShort(usize),
}
// `From`, `description()`, `cause()` and `Display` for `InvalidUtf8Slice`.
// `Display` prints the description followed by the cause if there is one
// (`true`); `TooShort` has no cause.
complex!{InvalidUtf8Slice {
InvalidUtf8 => InvalidUtf8Slice::Utf8,
InvalidCodepoint => InvalidUtf8Slice::Codepoint,
} {
InvalidUtf8Slice::Utf8(_) => "the sequence is invalid UTF-8",
InvalidUtf8Slice::Codepoint(_) => "the encoded codepoint is invalid",
// a requirement of one byte can only be unmet by an empty slice
InvalidUtf8Slice::TooShort(1) => "the slice is empty",
InvalidUtf8Slice::TooShort(_) => "the slice is shorter than the sequence",
} => true => {
InvalidUtf8Slice::Utf8(ref u) => Some(u),
InvalidUtf8Slice::Codepoint(ref c) => Some(c),
InvalidUtf8Slice::TooShort(_) => None,
}
}

78
vendor/encode_unicode/src/lib.rs vendored Normal file
View File

@ -0,0 +1,78 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
/*!
Miscellaneous UTF-8 and UTF-16 types and methods.
# Optional features:
* `#![no_std]`-mode: There are a few differences:
* `Error` doesn't exist, but `description()` is made available as an inherent impl.
* `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing.
* There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` don't implement `Read`.
This feature is enabled by setting `default-features=false` in `Cargo.toml`:
`encode_unicode = {version="0.3.4", default-features=false}`
* Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate:
Convert `Utf8Char` and `Utf16Char` to and from
[ascii::`AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html).
The minimum supported version of Rust is 1.15,
older versions might work now but can break with a minor update.
[crates.io page](https://crates.io/crates/encode_unicode)
[github repository](https://github.com/tormol/encode_unicode)
*/
#![warn(missing_docs)]
#![cfg_attr(not(feature="std"), no_std)]
// either `cargo clippy` doesn't see these, or I get a warning when I build.
#![cfg_attr(feature="clippy", feature(plugin))]
#![cfg_attr(feature="clippy", plugin(clippy))]
#![cfg_attr(feature="clippy", allow(derive_hash_xor_eq))]// tested
#![cfg_attr(feature="clippy", allow(len_without_is_empty))]// UtfxChar is never empty
#![cfg_attr(feature="clippy", allow(match_same_arms))]// looks better IMO
#![cfg_attr(feature="clippy", allow(needless_return))]// `foo.bar(); foo` looks unfinished
#![cfg_attr(feature="clippy", allow(redundant_closure))]// keep it explicit
#![cfg_attr(feature="clippy", allow(redundant_closure_call))]// not redundant in macros
#![cfg_attr(feature="clippy", allow(cast_lossless))]// too much noise (and too verbose)
// precedence: I prefer spaces to parentheses, but it's nice to recheck.
mod errors;
mod traits;
mod utf8_char;
mod utf8_iterators;
mod utf16_char;
mod utf16_iterators;
mod decoding_iterators;
pub use traits::{CharExt, U8UtfExt, U16UtfExt, StrExt, IterExt, SliceExt};
pub use utf8_char::Utf8Char;
pub use utf16_char::Utf16Char;
pub use utf8_iterators::{Utf8Iterator, iter_bytes};
pub use utf16_iterators::{Utf16Iterator, iter_units};
pub mod error {// keeping the public interface in one file
//! Errors returned by various conversion methods in this crate.
// NOTE(review): `errors::NonAsciiError` and `errors::NonBMPError` are not
// re-exported here - confirm whether that is intentional.
pub use errors::{FromStrError, EmptyStrError};
pub use errors::{InvalidCodepoint, InvalidUtf8};
pub use errors::{InvalidUtf8FirstByte,InvalidUtf16FirstUnit};
pub use errors::{InvalidUtf8Slice,InvalidUtf16Slice};
pub use errors::{InvalidUtf8Array,InvalidUtf16Array,InvalidUtf16Tuple};
pub use errors::Utf16PairError;
}
pub mod iterator {
//! Iterator types that you should rarely need to name
// `Utf8Iterator` and `Utf16Iterator` are also re-exported at the crate root.
pub use utf8_iterators::{Utf8Iterator, Utf8CharSplitter, Utf8Chars, Utf8CharIndices};
pub use utf16_iterators::{Utf16Iterator, Utf16CharSplitter, Utf16Chars, Utf16CharIndices};
// the iterators that go the other way: from bytes or units to characters
pub use decoding_iterators::{Utf8CharMerger, Utf8CharDecoder};
pub use decoding_iterators::{Utf16CharMerger, Utf16CharDecoder};
}

1014
vendor/encode_unicode/src/traits.rs vendored Normal file

File diff suppressed because it is too large Load Diff

687
vendor/encode_unicode/src/utf16_char.rs vendored Normal file
View File

@ -0,0 +1,687 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use utf16_iterators::Utf16Iterator;
use traits::{CharExt, U16UtfExt};
use utf8_char::Utf8Char;
use errors::{InvalidUtf16Slice, InvalidUtf16Array, InvalidUtf16Tuple};
use errors::{NonBMPError, EmptyStrError, FromStrError};
extern crate core;
use self::core::{hash,fmt};
use self::core::cmp::Ordering;
use self::core::borrow::Borrow;
use self::core::ops::Deref;
use self::core::str::FromStr;
#[cfg(feature="std")]
use self::core::iter::FromIterator;
#[cfg(feature="std")]
#[allow(deprecated)]
use std::ascii::AsciiExt;
#[cfg(feature="ascii")]
use self::core::char;
#[cfg(feature="ascii")]
extern crate ascii;
#[cfg(feature="ascii")]
use self::ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
// I don't think there is any good default value for a character,
// but char implements Default, so this type does too.
#[derive(Default)]
// char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either.
// When it's a single unit, the second is zero, so Eq works.
// #[derive(Ord)] however, breaks on surrogate pairs.
#[derive(PartialEq,Eq)]
#[derive(Clone,Copy)]
/// An unicode codepoint stored as UTF-16.
///
/// It can be borrowed as an `u16` slice, and has the same size as `char`.
pub struct Utf16Char {
// The second unit is zero when the codepoint only needs one unit.
units: [u16; 2],
}
/////////////////////
//conversion traits//
/////////////////////
impl FromStr for Utf16Char {
    type Err = FromStrError;
    /// Parses a string slice that consists of exactly one codepoint.
    ///
    /// An empty string and a string with more than one codepoint are both
    /// errors. Combining marks are codepoints of their own.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::error::FromStrError::*;
    /// use encode_unicode::Utf16Char;
    /// use std::str::FromStr;
    ///
    /// assert_eq!(Utf16Char::from_str("a"), Ok(Utf16Char::from('a')));
    /// assert_eq!(Utf16Char::from_str("🂠"), Ok(Utf16Char::from('🂠')));
    /// assert_eq!(Utf16Char::from_str(""), Err(Empty));
    /// assert_eq!(Utf16Char::from_str("ab"), Err(MultipleCodepoints));
    /// assert_eq!(Utf16Char::from_str("e\u{301}"), Err(MultipleCodepoints));// 'e'+u301 combining mark
    /// ```
    fn from_str(s: &str) -> Result<Self, FromStrError> {
        let (u16c, consumed) = match Utf16Char::from_str_start(s) {
            Ok(decoded) => decoded,
            Err(EmptyStrError) => return Err(FromStrError::Empty),
        };
        // anything left over after the first codepoint is one too many
        if consumed == s.len() {
            Ok(u16c)
        } else {
            Err(FromStrError::MultipleCodepoints)
        }
    }
}
impl From<char> for Utf16Char {
    /// Encodes `c` as UTF-16; an unused second unit is stored as zero.
    fn from(c: char) -> Self {
        let units = match c.to_utf16_tuple() {
            (first, Some(second)) => [first, second],
            (first, None) => [first, 0],
        };
        Utf16Char{ units: units }
    }
}
impl From<Utf8Char> for Utf16Char {
    /// Converts directly from UTF-8 to UTF-16 without going through `char`.
    fn from(utf8: Utf8Char) -> Utf16Char {
        let (bytes, utf8_len) = utf8.to_array();
        let b0 = bytes[0] as u16;
        let b1 = bytes[1] as u16;
        let b2 = bytes[2] as u16;
        let b3 = bytes[3] as u16;
        let units = match utf8_len {
            1 => [b0, 0],
            4 => {// need surrogate
                // The leading surrogate holds the upper bits of the
                // codepoint minus 0x1_00_00; that subtraction is folded
                // into the base value.
                let base = 0xd800 - (0x01_00_00u32 >> 10) as u16;
                let first = base
                          + ((b0 & 0x07) << 8)
                          + ((b1 & 0x3f) << 2)
                          + ((b2 & 0x30) >> 4);
                // The trailing surrogate holds the lowest ten bits.
                let second = 0xdc00 | ((b2 & 0x0f) << 6) | (b3 & 0x3f);
                [first, second]
            },
            _ => { // 2 or 3
                let two_bytes = ((b0 & 0x1f) << 6) | (b1 & 0x3f);
                let unit = if utf8_len == 3 {
                    (two_bytes << 6) | (b2 & 0x3f)
                } else {
                    two_bytes
                };
                [unit, 0]
            },
        };
        Utf16Char{ units: units }
    }
}
impl From<Utf16Char> for char {
    /// Decodes the stored units back into a `char`.
    fn from(uc: Utf16Char) -> char {
        let units = uc.to_array();
        char::from_utf16_array_unchecked(units)
    }
}
impl IntoIterator for Utf16Char {
    type Item=u16;
    type IntoIter=Utf16Iterator;
    /// Iterate over the units of this codepoint.
    fn into_iter(self) -> Utf16Iterator {
        let units = Utf16Iterator::from(self);
        units
    }
}
#[cfg(feature="std")]
impl Extend<Utf16Char> for Vec<u16> {
    /// Appends the units of every character in `iter`.
    fn extend<I:IntoIterator<Item=Utf16Char>>(&mut self, iter: I) {
        let chars = iter.into_iter();
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        for Utf16Char{ units } in chars {
            self.push(units[0]);
            // a second unit of zero means there is no second unit
            if units[1] != 0 {
                self.push(units[1]);
            }
        }
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf16Char> for Vec<u16> {
    /// Appends the units of every referenced character in `iter`.
    fn extend<I:IntoIterator<Item=&'a Utf16Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().map(|&u16c| u16c );
        self.extend(owned)
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf16Char> for Vec<u16> {
    /// Collects the units of every character in `iter` into a new vector.
    fn from_iter<I:IntoIterator<Item=Utf16Char>>(iter: I) -> Self {
        let mut units: Vec<u16> = Vec::new();
        units.extend(iter);
        units
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf16Char> for Vec<u16> {
    /// Collects the units of every referenced character in `iter` into a new vector.
    fn from_iter<I:IntoIterator<Item=&'a Utf16Char>>(iter: I) -> Self {
        iter.into_iter().cloned().collect()
    }
}
#[cfg(feature="std")]
impl Extend<Utf16Char> for String {
    /// Appends every character in `iter`, converted to UTF-8.
    fn extend<I:IntoIterator<Item=Utf16Char>>(&mut self, iter: I) {
        let as_utf8 = iter.into_iter().map(|u16c| Utf8Char::from(u16c) );
        self.extend(as_utf8);
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf16Char> for String {
    /// Appends every referenced character in `iter`, converted to UTF-8.
    fn extend<I:IntoIterator<Item=&'a Utf16Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().map(|&u16c| u16c );
        self.extend(owned);
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf16Char> for String {
    /// Collects every character in `iter` into a new string.
    fn from_iter<I:IntoIterator<Item=Utf16Char>>(iter: I) -> Self {
        let mut collected = String::new();
        collected.extend(iter);
        collected
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf16Char> for String {
    /// Collects every referenced character in `iter` into a new string.
    fn from_iter<I:IntoIterator<Item=&'a Utf16Char>>(iter: I) -> Self {
        iter.into_iter().cloned().collect()
    }
}
/////////////////
//getter traits//
/////////////////
impl AsRef<[u16]> for Utf16Char {
    /// Borrows the units that are in use.
    #[inline]
    fn as_ref(&self) -> &[u16] {
        let used = self.len();
        &self.units[..used]
    }
}
impl Borrow<[u16]> for Utf16Char {
    /// Same slice as `as_ref()`.
    #[inline]
    fn borrow(&self) -> &[u16] {
        AsRef::<[u16]>::as_ref(self)
    }
}
impl Deref for Utf16Char {
    type Target = [u16];
    /// Same slice as `as_ref()`.
    #[inline]
    fn deref(&self) -> &[u16] {
        AsRef::<[u16]>::as_ref(self)
    }
}
////////////////
//ascii traits//
////////////////
#[cfg(feature="std")]
#[allow(deprecated)]
impl AsciiExt for Utf16Char {
    type Owned = Self;
    /// ASCII characters always fit in the first unit.
    fn is_ascii(&self) -> bool {
        self.units[0] <= 0x7f
    }
    fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
        let this = self.to_ascii_lowercase();
        let that = other.to_ascii_lowercase();
        this == that
    }
    fn to_ascii_uppercase(&self) -> Self {
        let unit = self.units[0];
        if (b'a' as u16) <= unit && unit <= (b'z' as u16) {
            Utf16Char{ units: [unit - (b'a' as u16) + (b'A' as u16), 0] }
        } else {
            *self
        }
    }
    fn to_ascii_lowercase(&self) -> Self {
        let unit = self.units[0];
        if (b'A' as u16) <= unit && unit <= (b'Z' as u16) {
            Utf16Char{ units: [unit - (b'A' as u16) + (b'a' as u16), 0] }
        } else {
            *self
        }
    }
    fn make_ascii_uppercase(&mut self) {
        let upper = self.to_ascii_uppercase();
        *self = upper;
    }
    fn make_ascii_lowercase(&mut self) {
        let lower = self.to_ascii_lowercase();
        *self = lower;
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl From<AsciiChar> for Utf16Char {
    /// An ASCII character is a single unit with the same value.
    #[inline]
    fn from(ac: AsciiChar) -> Self {
        let unit = ac.as_byte() as u16;
        Utf16Char{ units: [unit, 0] }
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl ToAsciiChar for Utf16Char {
    #[inline]
    fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
        // ToAsciiChar is not implemented for u16 in ascii 0.9.0, so go
        // through u8 and pass 255 for anything that isn't ASCII.
        let byte = if self.is_ascii() {
            self.units[0] as u8
        } else {
            255
        };
        byte.to_ascii_char()
    }
    #[inline]
    unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
        let byte = self.units[0] as u8;
        byte.to_ascii_char_unchecked()
    }
}
/////////////////////////////////////////////////////////
//Genaral traits that cannot be derived to emulate char//
/////////////////////////////////////////////////////////
impl hash::Hash for Utf16Char {
    /// Hashes the codepoint as a `char`.
    fn hash<H : hash::Hasher>(&self, state: &mut H) {
        let c = self.to_char();
        hash::Hash::hash(&c, state);
    }
}
impl fmt::Debug for Utf16Char {
    /// Formats the codepoint the way `char` does.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let c = self.to_char();
        fmt::Debug::fmt(&c, fmtr)
    }
}
impl fmt::Display for Utf16Char {
    /// Converts to `Utf8Char` and lets that do the formatting.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let u8c = Utf8Char::from(*self);
        fmt::Display::fmt(&u8c, fmtr)
    }
}
// These impls cannot be derived: a character made up of two units must
// always compare greater than one made up of a single unit.
impl PartialOrd for Utf16Char {
    /// The ordering is total, so this never returns `None`.
    #[inline]
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, rhs);
        Some(ordering)
    }
}
impl Ord for Utf16Char {
    #[inline]
    fn cmp(&self, rhs: &Self) -> Ordering {
        // Both characters are mapped to a u32 sort key:
        // the first unit is shifted left by 0xd if there is a second unit
        // (the top four bits of a trailing surrogate), and by 0 otherwise,
        // and then the second unit is added.
        // This puts surrogate pairs above 0xffff, and lets the second unit
        // decide only when the first units are equal.
        // A constant factor cannot do this as it would have to be both
        // greater than 1023 and smaller than 5.5, and the mapping is
        // less complicated than combine_surrogates().
        let lhs_first = self.units[0] as u32;
        let lhs_second = self.units[1] as u32;
        let rhs_first = rhs.units[0] as u32;
        let rhs_second = rhs.units[1] as u32;
        let lhs_key = (lhs_first << (lhs_second >> 12)) + lhs_second;
        let rhs_key = (rhs_first << (rhs_second >> 12)) + rhs_second;
        lhs_key.cmp(&rhs_key)
    }
}
////////////////////////////////
//Comparisons with other types//
////////////////////////////////
impl PartialEq<char> for Utf16Char {
    /// Compares after encoding the `char` as UTF-16.
    fn eq(&self, u32c: &char) -> bool {
        let other = Utf16Char::from(*u32c);
        self.units == other.units
    }
}
impl PartialEq<Utf16Char> for char {
    /// Compares after encoding the `char` as UTF-16.
    fn eq(&self, u16c: &Utf16Char) -> bool {
        let this = Utf16Char::from(*self);
        this.units == u16c.units
    }
}
impl PartialOrd<char> for Utf16Char {
    /// Orders after encoding the `char` as UTF-16; never returns `None`.
    fn partial_cmp(&self, u32c: &char) -> Option<Ordering> {
        let other = Utf16Char::from(*u32c);
        Some(Ord::cmp(self, &other))
    }
}
impl PartialOrd<Utf16Char> for char {
    /// Orders after encoding the `char` as UTF-16; never returns `None`.
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        let this = Utf16Char::from(*self);
        Some(Ord::cmp(&this, u16c))
    }
}
impl PartialEq<Utf8Char> for Utf16Char {
    /// Compares after converting the `Utf8Char` to UTF-16.
    fn eq(&self, u8c: &Utf8Char) -> bool {
        let other = Utf16Char::from(*u8c);
        self.units == other.units
    }
}
impl PartialOrd<Utf8Char> for Utf16Char {
    /// Orders after converting the `Utf8Char` to UTF-16; never returns `None`.
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        let other = Utf16Char::from(*u8c);
        Some(Ord::cmp(self, &other))
    }
}
// The other direction is implemented in utf8_char.rs
/// A `Utf16Char` made up of a surrogate pair never equals a single unit.
///
/// The impl only goes in this direction, because it is meant for comparing
/// `Utf16Char`s against constants.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert!(Utf16Char::from('6') == b'6' as u16);
/// assert!(Utf16Char::from('\u{FFFF}') == 0xffff_u16);
/// assert!(Utf16Char::from_tuple((0xd876, Some(0xdef9))).unwrap() != 0xd876_u16);
/// ```
impl PartialEq<u16> for Utf16Char {
    fn eq(&self, unit: &u16) -> bool {
        // a second unit of zero means there is no second unit
        let is_single_unit = self.units[1] == 0;
        is_single_unit && self.units[0] == *unit
    }
}
/// Only considers the byte equal if the codepoint of the `Utf16Char` is <= U+FF.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert!(Utf16Char::from('6') == b'6');
/// assert!(Utf16Char::from('\u{00FF}') == b'\xff');
/// assert!(Utf16Char::from('\u{0100}') != b'\0');
/// ```
impl PartialEq<u8> for Utf16Char {
fn eq(&self, byte: &u8) -> bool {
self.units[0] == *byte as u16
}
}
#[cfg(feature = "ascii")]
/// A `Utf16Char` that is not ASCII is never equal to an `AsciiChar`.
impl PartialEq<AsciiChar> for Utf16Char {
    #[inline]
    fn eq(&self, ascii: &AsciiChar) -> bool {
        let code = *ascii as u16;
        code == self.units[0]
    }
}
#[cfg(feature = "ascii")]
/// An `AsciiChar` is never equal to a non-ASCII `Utf16Char`.
impl PartialEq<Utf16Char> for AsciiChar {
    #[inline]
    fn eq(&self, u16c: &Utf16Char) -> bool {
        let code = *self as u16;
        code == u16c.units[0]
    }
}
#[cfg(feature = "ascii")]
/// A non-ASCII `Utf16Char` always orders after any `AsciiChar`.
impl PartialOrd<AsciiChar> for Utf16Char {
    #[inline]
    fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> {
        // Only the first unit takes part in the comparison.
        let code = *ascii as u16;
        self.units[0].partial_cmp(&code)
    }
}
#[cfg(feature = "ascii")]
/// An `AsciiChar` always orders before any non-ASCII `Utf16Char`.
impl PartialOrd<Utf16Char> for AsciiChar {
    #[inline]
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        // Only the first unit takes part in the comparison.
        let code = *self as u16;
        code.partial_cmp(&u16c.units[0])
    }
}
///////////////////////////////////////////////////////
//pub impls that should be together for nicer rustdoc//
///////////////////////////////////////////////////////
impl Utf16Char {
/// Create an `Utf16Char` from the first codepoint in a string slice,
/// converting from UTF-8 to UTF-16.
///
/// The returned `usize` is the number of UTF-8 bytes used from the str,
/// and not the number of UTF-16 units.
///
/// # Errors
///
/// Returns `EmptyStrError` if the `str` is empty.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf16Char;
///
/// assert_eq!(Utf16Char::from_str_start("a"), Ok((Utf16Char::from('a'),1)));
/// assert_eq!(Utf16Char::from_str_start("ab"), Ok((Utf16Char::from('a'),1)));
/// assert_eq!(Utf16Char::from_str_start("🂠 "), Ok((Utf16Char::from('🂠'),4)));
/// assert_eq!(Utf16Char::from_str_start("e\u{301}"), Ok((Utf16Char::from('e'),1)));// 'e'+u301 combining mark
/// assert!(Utf16Char::from_str_start("").is_err());
/// ```
pub fn from_str_start(s: &str) -> Result<(Self,usize), EmptyStrError> {
if s.is_empty() {
return Err(EmptyStrError);
}
let b = s.as_bytes();
// The first byte selects the sequence length. Within each branch the byte
// with the highest index is read first to reduce the number of unnecessary
// length checks.
match b[0] {
0...127 => {// 1 byte => 1 unit
let unit = b[0] as u16;// 0b0000_0000_0xxx_xxxx
Ok((Utf16Char{ units: [unit, 0] }, 1))
},
0b1000_0000...0b1101_1111 => {// 2 bytes => 1 unit
let unit = (((b[1] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
| (((b[0] & 0x1f) as u16) << 6);// 0b0000_0xxx_xx00_0000
Ok((Utf16Char{ units: [unit, 0] }, 2))
},
0b1110_0000...0b1110_1111 => {// 3 bytes => 1 unit
let unit = (((b[2] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
| (((b[1] & 0x3f) as u16) << 6) // 0b0000_xxxx_xx00_0000
| (((b[0] & 0x0f) as u16) << 12);// 0bxxxx_0000_0000_0000
Ok((Utf16Char{ units: [unit, 0] }, 3))
},
_ => {// 4 bytes => 2 units
// The trailing surrogate carries the low 10 bits of the codepoint.
let second = 0xdc00 // 0b1101_1100_0000_0000
| (((b[3] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
| (((b[2] & 0x0f) as u16) << 6);// 0b0000_00xx_xx00_0000
// The leading surrogate carries the remaining high bits; the 0x1_0000
// offset of the supplementary planes is folded into the base constant.
let first = 0xd800-(0x01_00_00u32>>10) as u16// 0b1101_0111_1100_0000
+ (((b[2] & 0x30) as u16) >> 4) // 0b0000_0000_0000_00xx
+ (((b[1] & 0x3f) as u16) << 2) // 0b0000_0000_xxxx_xx00
+ (((b[0] & 0x07) as u16) << 8); // 0b0000_0xxx_0000_0000
Ok((Utf16Char{ units: [first, second] }, 4))
}
}
}
/// Validate the first UTF-16 codepoint of the slice and store it.
///
/// On success the number of units that were consumed (1 or 2) is returned
/// alongside the character.
///
/// # Errors
///
/// Forwards the error from `char::from_utf16_slice_start()`.
pub fn from_slice_start(src: &[u16]) -> Result<(Self,usize), InvalidUtf16Slice> {
    match char::from_utf16_slice_start(src) {
        Ok((_, len)) => {
            // The unused second unit of a single-unit character is zero.
            let trailing = if len == 2 { src[1] } else { 0 };
            Ok((Utf16Char{ units: [src[0], trailing] }, len))
        },
        Err(error) => Err(error),
    }
}
/// Store the first UTF-16 codepoint of the slice without validating it.
///
/// Returns the character together with the number of units it occupies.
///
/// # Safety
///
/// The slice must be non-empty and start with a valid UTF-16 codepoint.
/// The length of the slice is never checked.
pub unsafe fn from_slice_start_unchecked(src: &[u16]) -> (Self,usize) {
    let lead = *src.get_unchecked(0);
    // A leading surrogate means a second unit follows; otherwise the
    // second slot is zeroed.
    let (trail, len) = if lead.is_utf16_leading_surrogate() {
        (*src.get_unchecked(1), 2)
    } else {
        (0, 1)
    };
    (Utf16Char{ units: [lead, trail] }, len)
}
/// Validate and store an UTF-16 array as returned from `char.to_utf16_array()`.
///
/// If the first unit is not a surrogate, the second unit is ignored and
/// stored as zero.
///
/// # Errors
///
/// Returns `FirstIsTrailingSurrogate` if the first unit is a trailing
/// surrogate, and `SecondIsNotTrailingSurrogate` if a leading surrogate is
/// not followed by a trailing one.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf16Char;
/// use encode_unicode::error::InvalidUtf16Array;
///
/// assert_eq!(Utf16Char::from_array(['x' as u16, 'y' as u16]), Ok(Utf16Char::from('x')));
/// assert_eq!(Utf16Char::from_array(['睷' as u16, 0]), Ok(Utf16Char::from('睷')));
/// assert_eq!(Utf16Char::from_array([0xda6f, 0xdcde]), Ok(Utf16Char::from('\u{abcde}')));
/// assert_eq!(Utf16Char::from_array([0xf111, 0xdbad]), Ok(Utf16Char::from('\u{f111}')));
/// assert_eq!(Utf16Char::from_array([0xdaaf, 0xdaaf]), Err(InvalidUtf16Array::SecondIsNotTrailingSurrogate));
/// assert_eq!(Utf16Char::from_array([0xdcac, 0x9000]), Err(InvalidUtf16Array::FirstIsTrailingSurrogate));
/// ```
pub fn from_array(units: [u16; 2]) -> Result<Self,InvalidUtf16Array> {
    let (first, second) = (units[0], units[1]);
    // Not a surrogate at all: a complete single-unit character.
    if (first & 0xf8_00) != 0xd8_00 {
        return Ok(Utf16Char { units: [first, 0] });
    }
    // From here on `first` is a surrogate; it must be a leading one.
    if first >= 0xdc_00 {
        return Err(InvalidUtf16Array::FirstIsTrailingSurrogate);
    }
    if (second & 0xfc_00) == 0xdc_00 {
        Ok(Utf16Char { units: units })
    } else {
        Err(InvalidUtf16Array::SecondIsNotTrailingSurrogate)
    }
}
/// Wrap an array as returned from `char.to_utf16_array()` without validating it.
///
/// # Safety
///
/// The units must form a valid codepoint, and the second unit must be 0
/// when a surrogate pair is not required.
/// Violating this can easily lead to undefined behavior, although unlike
/// `char` bad `Utf16Char`s simply existing is not immediately UB.
pub unsafe fn from_array_unchecked(units: [u16; 2]) -> Self {
    let stored = units;
    Utf16Char { units: stored }
}
/// Validate and store a UTF-16 pair as returned from `char.to_utf16_tuple()`.
///
/// # Errors
///
/// Forwards the error from `char::from_utf16_tuple()`.
pub fn from_tuple(utf16: (u16,Option<u16>)) -> Result<Self,InvalidUtf16Tuple> {
    match char::from_utf16_tuple(utf16) {
        // SAFETY: `char::from_utf16_tuple()` has just accepted this tuple.
        Ok(_) => Ok(unsafe { Self::from_tuple_unchecked(utf16) }),
        Err(error) => Err(error),
    }
}
/// Wrap a tuple as returned from `char.to_utf16_tuple()` without validating it.
///
/// A missing second unit is stored as zero.
///
/// # Safety
///
/// The units must form a valid codepoint with the second being 0 when a
/// surrogate pair is not required.
/// Violating this can easily lead to undefined behavior.
pub unsafe fn from_tuple_unchecked(utf16: (u16,Option<u16>)) -> Self {
    let units = match utf16 {
        (first, Some(second)) => [first, second],
        (first, None) => [first, 0],
    };
    Utf16Char { units: units }
}
/// Create an `Utf16Char` from a single unit.
///
/// Codepoints < '\u{1_0000}' (which fit in a `u16`) are part of the basic
/// multilingual plane unless they are reserved for surrogate pairs.
///
/// # Errors
///
/// Returns `NonBMPError` if the unit is in the range `0xd800..0xe000`
/// (which means that it's part of a surrogate pair).
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert_eq!(Utf16Char::from_bmp(0x40).unwrap(), '@');
/// assert_eq!(Utf16Char::from_bmp('ø' as u16).unwrap(), 'ø');
/// assert!(Utf16Char::from_bmp(0xdddd).is_err());
/// ```
pub fn from_bmp(bmp_codepoint: u16) -> Result<Self,NonBMPError> {
    let is_surrogate = bmp_codepoint >= 0xd800 && bmp_codepoint <= 0xdfff;
    if is_surrogate {
        return Err(NonBMPError);
    }
    Ok(Utf16Char{ units: [bmp_codepoint, 0] })
}
/// Create an `Utf16Char` from a single unit, skipping the check that the
/// unit is a valid codepoint on its own.
///
/// # Safety
///
/// The unit must be less than 0xd800 or greater than 0xdfff.
/// In other words, not part of a surrogate pair.
/// Violating this can easily lead to undefined behavior.
#[inline]
pub unsafe fn from_bmp_unchecked(bmp_codepoint: u16) -> Self {
    let units = [bmp_codepoint, 0];
    Utf16Char{ units: units }
}
/// Returns `true` if the codepoint is in the basic multilingual plane,
/// i.e. it is stored in a single unit.
///
/// # Examples
/// ```
/// # use encode_unicode::Utf16Char;
/// assert_eq!(Utf16Char::from('e').is_bmp(), true);
/// assert_eq!(Utf16Char::from('€').is_bmp(), true);
/// assert_eq!(Utf16Char::from('𝔼').is_bmp(), false);
/// ```
#[inline]
pub fn is_bmp(&self) -> bool {
    // The second unit is zero exactly when no surrogate pair is stored.
    match self.units[1] {
        0 => true,
        _ => false,
    }
}
/// The number of `u16` units this character is made up of.
///
/// Is either 1 or 2 and identical to `.to_char().len_utf16()`
/// or `.as_ref().len()`.
#[inline]
pub fn len(self) -> usize {
    // Only the top bit of the second unit is inspected: it is set for a
    // trailing surrogate and clear for the zero placeholder.
    if (self.units[1] & 0x8000) != 0 { 2 } else { 1 }
}
// There is no `.is_empty()` because it would always return false.
/// Returns `true` if the codepoint is an ASCII character (U+0000 to U+007F).
#[inline]
pub fn is_ascii(&self) -> bool {
    self.units[0] < 0x80
}
/// Returns `true` if the two characters match when ASCII case is ignored.
///
/// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
#[cfg(feature="std")]
pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
    let lhs = self.to_ascii_lowercase();
    let rhs = other.to_ascii_lowercase();
    lhs == rhs
}
/// Returns the ASCII upper case equivalent of the character.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but all other characters are returned unchanged.
#[cfg(feature="std")]
pub fn to_ascii_uppercase(&self) -> Self {
    let unit = self.units[0];
    let is_lower = unit >= (b'a' as u16) && unit <= (b'z' as u16);
    if is_lower {
        Utf16Char{ units: [unit - (b'a' as u16) + (b'A' as u16), 0] }
    } else {
        *self
    }
}
/// Returns the ASCII lower case equivalent of the character.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but all other characters are returned unchanged.
#[cfg(feature="std")]
pub fn to_ascii_lowercase(&self) -> Self {
    let unit = self.units[0];
    let is_upper = unit >= (b'A' as u16) && unit <= (b'Z' as u16);
    if is_upper {
        Utf16Char{ units: [unit - (b'A' as u16) + (b'a' as u16), 0] }
    } else {
        *self
    }
}
/// Replaces the character with its ASCII upper case equivalent.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but all other characters are left unchanged.
#[cfg(feature="std")]
pub fn make_ascii_uppercase(&mut self) {
    let upper = self.to_ascii_uppercase();
    *self = upper;
}
/// Replaces the character with its ASCII lower case equivalent.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but all other characters are left unchanged.
#[cfg(feature="std")]
pub fn make_ascii_lowercase(&mut self) {
    let lower = self.to_ascii_lowercase();
    *self = lower;
}
/// Convert from UTF-16 to UTF-32, producing a `char`.
pub fn to_char(self) -> char {
    let decoded: char = self.into();
    decoded
}
/// Write the units of the character to the start of a slice,
/// and return the number of `u16`s written.
///
/// # Panics
/// Will panic if the buffer is too small;
/// You can get the required length from `.len()`,
/// but a buffer of length two is always large enough.
pub fn to_slice(self, dst: &mut[u16]) -> usize {
    let is_pair = (self.units[1] & 0x8000) != 0;
    let last = if is_pair { 1 } else { 0 };
    // Check the highest index up front so that nothing is written when the
    // buffer turns out to be too small.
    if dst.len() <= last {
        panic!("The provided buffer is too small.")
    }
    dst[last] = self.units[last];
    if is_pair {
        dst[0] = self.units[0];
    }
    last + 1
}
/// Get the character represented as an array of two units.
///
/// The second `u16` is zero for codepoints that fit in one unit.
#[inline]
pub fn to_array(self) -> [u16;2] {
self.units
}
/// Return the character as a tuple of its units.
///
/// The second `u16` is only present (`Some`) for surrogate pairs.
#[inline]
pub fn to_tuple(self) -> (u16,Option<u16>) {
    let trailing = match self.units[1] {
        0 => None,
        unit => Some(unit),
    };
    (self.units[0], trailing)
}
}

View File

@ -0,0 +1,270 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use traits::CharExt;
use utf16_char::Utf16Char;
use errors::EmptyStrError;
extern crate core;
use self::core::fmt;
use self::core::borrow::Borrow;
// Invalid unit values that mark a field as consumed or empty.
const FIRST_USED: u16 = 0x_dc_00;
const SECOND_USED: u16 = 0;
/// Iterate over the units of the UTF-16 representation of a codepoint.
///
/// Yields one or two `u16`s, leading unit first.
#[derive(Clone)]
pub struct Utf16Iterator {
// The first unit, or `FIRST_USED` once it has been yielded.
first: u16,
// The second unit, or `SECOND_USED` if there is none or it has been yielded.
second: u16,
}
/// Encodes the `char` as UTF-16 and iterates over the resulting units.
impl From<char> for Utf16Iterator {
    fn from(c: char) -> Self {
        match c.to_utf16_tuple() {
            (first, Some(second)) => Utf16Iterator{ first: first, second: second },
            (first, None) => Utf16Iterator{ first: first, second: SECOND_USED },
        }
    }
}
/// Iterates over the units of an already encoded `Utf16Char`.
impl From<Utf16Char> for Utf16Iterator {
    fn from(uc: Utf16Char) -> Self {
        match uc.to_tuple() {
            (first, Some(second)) => Utf16Iterator{ first: first, second: second },
            (first, None) => Utf16Iterator{ first: first, second: SECOND_USED },
        }
    }
}
impl Iterator for Utf16Iterator {
    type Item=u16;
    /// Yields the first unit if it is still pending, then the second one.
    fn next(&mut self) -> Option<u16> {
        if self.first != FIRST_USED {
            let unit = self.first;
            self.first = FIRST_USED;
            Some(unit)
        } else if self.second != SECOND_USED {
            let unit = self.second;
            self.second = SECOND_USED;
            Some(unit)
        } else {
            None
        }
    }
    /// The remaining length is always known exactly.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len();
        (remaining, Some(remaining))
    }
}
impl ExactSizeIterator for Utf16Iterator {
    /// Counts the fields that do not hold their "used" marker.
    fn len(&self) -> usize {
        let mut remaining = 0;
        if self.first != FIRST_USED {
            remaining += 1;
        }
        if self.second != SECOND_USED {
            remaining += 1;
        }
        remaining
    }
}
/// Formats the units that have not been yielded yet as a list.
impl fmt::Debug for Utf16Iterator {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Drain a copy so that formatting does not advance the iterator.
        let mut rest = self.clone();
        match rest.next() {
            None => write!(fmtr, "[]"),
            Some(a) => match rest.next() {
                None => write!(fmtr, "[{}]", a),
                Some(b) => write!(fmtr, "[{}, {}]", a, b),
            },
        }
    }
}
/// Flattens an iterator of `Utf16Char` (or `&Utf16Char`)
/// into an iterator over their `u16` units.
/// Yields the same units as calling `.flat_map()` on the original iterator,
/// but the returned iterator is about twice as fast.
///
/// The exact number of units cannot be known in advance, but `size_hint()`
/// gives the possible range.
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{iter_units, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf16() );
/// let mut units = [0; 4];
/// for (u,dst) in iter_units(iterator).zip(&mut units) {*dst=u;}
/// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{iter_units, CharExt, Utf16Char};
///
/// // (💣 takes two units)
/// let chars: Vec<Utf16Char> = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect();
/// let units: Vec<u16> = iter_units(&chars).collect();
/// let flat_map: Vec<u16> = chars.iter().flat_map(|u16c| *u16c ).collect();
/// assert_eq!(units, flat_map);
/// ```
pub fn iter_units<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>>
(iterable: I) -> Utf16CharSplitter<U, I::IntoIter> {
    let source = iterable.into_iter();
    // No trailing surrogate is pending at the start.
    Utf16CharSplitter{ inner: source, prev_second: 0 }
}
/// The iterator type returned by `iter_units()`
///
/// Yields the `u16` units of every `Utf16Char` produced by the wrapped iterator.
#[derive(Clone)]
pub struct Utf16CharSplitter<U:Borrow<Utf16Char>, I:Iterator<Item=U>> {
// The source of characters.
inner: I,
// The second unit of the most recently fetched character if it still has
// to be yielded, or 0 when nothing is pending.
prev_second: u16,
}
impl<I:Iterator<Item=Utf16Char>> From<I> for Utf16CharSplitter<Utf16Char,I> {
/// A less generic constructor than `iter_units()`
fn from(iter: I) -> Self {
iter_units(iter)
}
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Utf16CharSplitter<U,I> {
/// Extracts the source iterator.
///
/// Note that `iter_units(iter.into_inner())` is not a no-op:
/// If the last returned unit from `next()` was a leading surrogate,
/// the trailing surrogate is lost.
pub fn into_inner(self) -> I {
self.inner
}
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Iterator for Utf16CharSplitter<U,I> {
    type Item = u16;
    /// Yields a pending trailing unit if there is one, and otherwise the
    /// first unit of the next character from the source iterator.
    fn next(&mut self) -> Option<Self::Item> {
        if self.prev_second == 0 {
            self.inner.next().map(|u16c| {
                let units = u16c.borrow().to_array();
                // Remember the second unit (0 if there is none) for the next call.
                self.prev_second = units[1];
                units[0]
            })
        } else {
            let prev_second = self.prev_second;
            self.prev_second = 0;
            Some(prev_second)
        }
    }
    /// Every remaining character yields one or two units, plus one unit if
    /// a trailing surrogate is pending.
    ///
    /// If the upper bound does not fit in a `usize`, `None` is reported
    /// instead of a wrapped (and therefore too small) value.
    fn size_hint(&self) -> (usize,Option<usize>) {
        let (min, max) = self.inner.size_hint();
        let add = if self.prev_second == 0 {0} else {1};
        let upper = max
            .and_then(|max| max.checked_mul(2))
            .and_then(|max| max.checked_add(add));
        (min.saturating_add(add), upper)
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`,
/// together with the byte index at which each codepoint starts.
#[derive(Clone)]
pub struct Utf16CharIndices<'a>{
// The source string; shortened from the end by `next_back()`.
str: &'a str,
// Byte offset into `str` of the next codepoint to yield from the front.
index: usize,
}
/// Starts iterating at the beginning of the string.
impl<'a> From<&'a str> for Utf16CharIndices<'a> {
    fn from(s: &'a str) -> Utf16CharIndices<'a> {
        Utf16CharIndices{ index: 0, str: s }
    }
}
impl<'a> Utf16CharIndices<'a> {
    /// Returns the part of the source `str` that has not been iterated over yet.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::{StrExt, Utf16Char};
    /// let mut iter = "abc".utf16char_indices();
    /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c'))));
    /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a'))));
    /// assert_eq!(iter.as_str(), "b");
    /// ```
    pub fn as_str(&self) -> &'a str {
        let source: &'a str = self.str;
        &source[self.index..]
    }
}
impl<'a> Iterator for Utf16CharIndices<'a> {
    type Item = (usize,Utf16Char);
    /// Decodes the codepoint at the current byte offset and advances past it.
    fn next(&mut self) -> Option<(usize,Utf16Char)> {
        let start = self.index;
        let decoded = Utf16Char::from_str_start(&self.str[start..]);
        if let Ok((u16c, used)) = decoded {
            self.index = start + used;
            Some((start, u16c))
        } else {
            // The only possible error is that the remainder is empty.
            None
        }
    }
    /// Every codepoint takes between one and four of the remaining bytes.
    fn size_hint(&self) -> (usize,Option<usize>) {
        let remaining = self.str.len() - self.index;
        // For remaining+3 to overflow, the slice must fill all but two bytes of
        // addressable memory, and size_hint() doesn't need to be correct.
        let lower = remaining.wrapping_add(3) / 4;
        (lower, Some(remaining))
    }
}
impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> {
    /// Decodes the last remaining codepoint and cuts it off the end of the
    /// stored `str`.
    fn next_back(&mut self) -> Option<(usize,Utf16Char)> {
        if self.index >= self.str.len() {
            return None;
        }
        // Walk backwards over continuation bytes (0b10xx_xxxx) to find the
        // byte that starts the last codepoint.
        let continuation_bytes = self.str.bytes().rev()
            .take_while(|b| b & 0b1100_0000 == 0b1000_0000 )
            .count();
        let starts = self.str.len() - (1 + continuation_bytes);
        let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap();
        self.str = &self.str[..starts];
        Some((starts, u16c))
    }
}
/// Shows the current byte index and the remaining part of the `str`.
impl<'a> fmt::Debug for Utf16CharIndices<'a> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let remaining = self.as_str();
        let mut tuple = fmtr.debug_tuple("Utf16CharIndices");
        tuple.field(&self.index);
        tuple.field(&remaining);
        tuple.finish()
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
///
/// Wraps an `Utf16CharIndices` and discards the byte indices.
#[derive(Clone)]
pub struct Utf16Chars<'a>(Utf16CharIndices<'a>);
/// Starts iterating at the beginning of the string.
impl<'a> From<&'a str> for Utf16Chars<'a> {
    fn from(s: &'a str) -> Utf16Chars<'a> {
        let indices = Utf16CharIndices::from(s);
        Utf16Chars(indices)
    }
}
impl<'a> Utf16Chars<'a> {
/// Extract the remainder of the source `str`.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf16Char};
/// let mut iter = "abc".utf16chars();
/// assert_eq!(iter.next(), Some(Utf16Char::from('a')));
/// assert_eq!(iter.next_back(), Some(Utf16Char::from('c')));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
self.0.as_str()
}
}
impl<'a> Iterator for Utf16Chars<'a> {
type Item = Utf16Char;
fn next(&mut self) -> Option<Utf16Char> {
self.0.next().map(|(_,u16c)| u16c )
}
fn size_hint(&self) -> (usize,Option<usize>) {
self.0.size_hint()
}
}
impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
    /// Forwards to the wrapped index iterator and drops the byte index.
    fn next_back(&mut self) -> Option<Utf16Char> {
        match self.0.next_back() {
            Some((_, u16c)) => Some(u16c),
            None => None,
        }
    }
}
/// Shows the remaining part of the `str`.
impl<'a> fmt::Debug for Utf16Chars<'a> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let remaining = self.as_str();
        let mut tuple = fmtr.debug_tuple("Utf16Chars");
        tuple.field(&remaining);
        tuple.finish()
    }
}

647
vendor/encode_unicode/src/utf8_char.rs vendored Normal file
View File

@ -0,0 +1,647 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use errors::{FromStrError, EmptyStrError, NonAsciiError, InvalidUtf8Slice, InvalidUtf8Array};
use utf8_iterators::Utf8Iterator;
use traits::{CharExt, U8UtfExt};
use utf16_char::Utf16Char;
extern crate core;
use self::core::{hash, fmt, str, ptr};
use self::core::cmp::Ordering;
use self::core::borrow::Borrow;
use self::core::ops::Deref;
use self::core::mem::transmute;
#[cfg(feature="std")]
use self::core::iter::FromIterator;
#[cfg(feature="std")]
#[allow(deprecated)]
use std::ascii::AsciiExt;
#[cfg(feature="ascii")]
extern crate ascii;
#[cfg(feature="ascii")]
use self::ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
// There is arguably no good default value for a character,
// but `char` implements `Default`, so this type does too.
#[derive(Default)]
// char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either.
// The derived Ord for the byte array works out because longer codepoints
// start with more ones, so if the compared bytes are equal, the length is the same.
// This breaks down for values above 0x1f_ff_ff, but those can only be created by unsafe code.
#[derive(PartialEq,Eq, PartialOrd,Ord)]
#[derive(Clone,Copy)]
/// An unicode codepoint stored as UTF-8.
///
/// It can be borrowed as a `str`, and has the same size as `char`.
pub struct Utf8Char {
// The UTF-8 sequence starting at index 0; the safe constructors
// zero the bytes that the sequence does not use.
bytes: [u8; 4],
}
/////////////////////
//conversion traits//
/////////////////////
impl str::FromStr for Utf8Char {
    type Err = FromStrError;
    /// Parse a string slice that consists of exactly one codepoint.
    ///
    /// # Errors
    ///
    /// Returns `Empty` for an empty string and `MultipleCodepoints` if
    /// the string is longer than its first codepoint.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::error::FromStrError::*;
    /// use encode_unicode::Utf8Char;
    /// use std::str::FromStr;
    ///
    /// assert_eq!(Utf8Char::from_str("a"), Ok(Utf8Char::from('a')));
    /// assert_eq!(Utf8Char::from_str("🂠"), Ok(Utf8Char::from('🂠')));
    /// assert_eq!(Utf8Char::from_str(""), Err(Empty));
    /// assert_eq!(Utf8Char::from_str("ab"), Err(MultipleCodepoints));
    /// assert_eq!(Utf8Char::from_str("e\u{301}"), Err(MultipleCodepoints));// 'e'+u301 combining mark
    /// ```
    fn from_str(s: &str) -> Result<Self, FromStrError> {
        let src = s.as_bytes();
        if src.is_empty() {
            return Err(FromStrError::Empty);
        }
        // The first byte determines how long the first codepoint is.
        let expected = 1 + src[0].extra_utf8_bytes_unchecked();
        if src.len() != expected {
            return Err(FromStrError::MultipleCodepoints);
        }
        let mut bytes = [0; 4];
        bytes[..expected].copy_from_slice(src);
        Ok(Utf8Char{bytes: bytes})
    }
}
/// Re-encodes a UTF-16 character as UTF-8 without going through `char`.
impl From<Utf16Char> for Utf8Char {
fn from(utf16: Utf16Char) -> Utf8Char {
// The arms are ordered from shortest to longest encoding; each range
// pattern relies on the smaller ranges having been matched before it.
match utf16.to_tuple() {
// U+0000..=U+007F: one byte, copied as is.
(a @ 0...0x00_7f, _) => {
Utf8Char{ bytes: [a as u8, 0, 0, 0] }
},
// U+0080..=U+07FF: two bytes, 5 + 6 payload bits.
(u @ 0...0x07_ff, _) => {
let b = 0x80 | (u & 0x00_3f) as u8;
let a = 0xc0 | ((u & 0x07_c0) >> 6) as u8;
Utf8Char{ bytes: [a, b, 0, 0] }
},
// Any other single unit: three bytes, 4 + 6 + 6 payload bits.
(u, None) => {
let c = 0x80 | (u & 0x00_3f) as u8;
let b = 0x80 | ((u & 0x0f_c0) >> 6) as u8;
let a = 0xe0 | ((u & 0xf0_00) >> 12) as u8;
Utf8Char{ bytes: [a, b, c, 0] }
},
// Surrogate pair: four bytes.
(f, Some(s)) => {
// Add the 0x1_0000 offset of the supplementary planes to the
// high bits carried by the leading surrogate.
let f = f + (0x01_00_00u32 >> 10) as u16;
let d = 0x80 | (s & 0x00_3f) as u8;
let c = 0x80 | ((s & 0x03_c0) >> 6) as u8
| ((f & 0x00_03) << 4) as u8;
let b = 0x80 | ((f & 0x00_fc) >> 2) as u8;
let a = 0xf0 | ((f & 0x07_00) >> 8) as u8;
Utf8Char{ bytes: [a, b, c, d] }
}
}
}
}
/// Encodes the `char` as UTF-8.
impl From<char> for Utf8Char {
    fn from(c: char) -> Self {
        let encoded = c.to_utf8_array();
        Utf8Char{ bytes: encoded.0 }
    }
}
/// Decodes the stored UTF-8 sequence into a `char`.
impl From<Utf8Char> for char {
    fn from(uc: Utf8Char) -> char {
        let used = uc.len();
        // Validation is skipped: this relies on `Utf8Char` only ever holding a
        // valid UTF-8 sequence.
        unsafe{ char::from_utf8_exact_slice_unchecked(&uc.bytes[..used]) }
    }
}
impl IntoIterator for Utf8Char {
type Item=u8;
type IntoIter=Utf8Iterator;
/// Iterate over the byte values.
fn into_iter(self) -> Utf8Iterator {
Utf8Iterator::from(self)
}
}
#[cfg(feature="std")]
/// Appends the UTF-8 bytes of every character to the vector.
impl Extend<Utf8Char> for Vec<u8> {
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) {
        let chars = iter.into_iter();
        // Every character contributes at least one byte.
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        for u8c in chars {
            let bytes = u8c.bytes;
            // twice as fast as self.extend_from_slice(u8c.as_bytes());
            self.push(bytes[0]);
            // Zero bytes after the first one are padding and are skipped.
            for &extra in bytes[1..].iter() {
                if extra != 0 {
                    self.push(extra);
                }
            }
        }
    }
}
#[cfg(feature="std")]
/// Appends the UTF-8 bytes of every referenced character to the vector.
impl<'a> Extend<&'a Utf8Char> for Vec<u8> {
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().cloned();
        self.extend(owned)
    }
}
#[cfg(feature="std")]
/// Appends every character to the string.
impl Extend<Utf8Char> for String {
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) {
        // Writing to the byte vector directly relies on every `Utf8Char`
        // contributing a complete, valid UTF-8 sequence.
        let bytes = unsafe { self.as_mut_vec() };
        bytes.extend(iter)
    }
}
#[cfg(feature="std")]
/// Appends every referenced character to the string.
impl<'a> Extend<&'a Utf8Char> for String {
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().cloned();
        self.extend(owned)
    }
}
#[cfg(feature="std")]
/// Collects the characters into a new `String`.
impl FromIterator<Utf8Char> for String {
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> String {
        let mut collected = String::new();
        collected.extend(iter);
        collected
    }
}
#[cfg(feature="std")]
/// Collects the referenced characters into a new `String`.
impl<'a> FromIterator<&'a Utf8Char> for String {
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> String {
        let owned = iter.into_iter().cloned();
        owned.collect()
    }
}
#[cfg(feature="std")]
/// Collects the UTF-8 bytes of the characters into a new vector.
impl FromIterator<Utf8Char> for Vec<u8> {
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> Self {
        let text: String = iter.into_iter().collect();
        text.into_bytes()
    }
}
#[cfg(feature="std")]
/// Collects the UTF-8 bytes of the referenced characters into a new vector.
impl<'a> FromIterator<&'a Utf8Char> for Vec<u8> {
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> Self {
        let text: String = iter.into_iter().cloned().collect();
        text.into_bytes()
    }
}
/////////////////
//getter traits//
/////////////////
/// Borrows the used bytes of the character, without the unused trailing ones.
impl AsRef<[u8]> for Utf8Char {
    fn as_ref(&self) -> &[u8] {
        let used = self.len();
        &self.bytes[..used]
    }
}
/// Borrows the character as a `str`.
impl AsRef<str> for Utf8Char {
    fn as_ref(&self) -> &str {
        let bytes = AsRef::<[u8]>::as_ref(self);
        // Validation is skipped: this relies on `Utf8Char` only ever holding a
        // valid UTF-8 sequence.
        unsafe{ str::from_utf8_unchecked(bytes) }
    }
}
/// Same bytes as `AsRef<[u8]>`.
impl Borrow<[u8]> for Utf8Char {
    fn borrow(&self) -> &[u8] {
        AsRef::<[u8]>::as_ref(self)
    }
}
/// Same `str` as `AsRef<str>`.
impl Borrow<str> for Utf8Char {
    fn borrow(&self) -> &str {
        AsRef::<str>::as_ref(self)
    }
}
/// Lets `str` methods be called directly on an `Utf8Char`.
impl Deref for Utf8Char {
    type Target = str;
    fn deref(&self) -> &Self::Target {
        AsRef::<str>::as_ref(self)
    }
}
////////////////
//ascii traits//
////////////////
#[cfg(feature="std")]
#[allow(deprecated)]
/// ASCII operations that only ever look at or change the first byte.
impl AsciiExt for Utf8Char {
    type Owned = Utf8Char;
    fn is_ascii(&self) -> bool {
        self.bytes[0].is_ascii()
    }
    fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
        // Non-ASCII characters have no ASCII case and must match exactly.
        if !self.is_ascii() {
            return self == other;
        }
        self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])
    }
    fn to_ascii_uppercase(&self) -> Self::Owned {
        let mut upper = *self;
        upper.make_ascii_uppercase();
        upper
    }
    fn to_ascii_lowercase(&self) -> Self::Owned {
        let mut lower = *self;
        lower.make_ascii_lowercase();
        lower
    }
    fn make_ascii_uppercase(&mut self) {
        self.bytes[0].make_ascii_uppercase();
    }
    fn make_ascii_lowercase(&mut self) {
        self.bytes[0].make_ascii_lowercase();
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
///
/// An ASCII character is stored as its single byte.
impl From<AsciiChar> for Utf8Char {
    fn from(ac: AsciiChar) -> Self {
        let byte = ac.as_byte();
        Utf8Char{ bytes: [byte, 0, 0, 0] }
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
///
/// Both conversions are delegated to the first byte.
impl ToAsciiChar for Utf8Char {
    fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
        let first = self.bytes[0];
        first.to_ascii_char()
    }
    unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
        let first = self.bytes[0];
        first.to_ascii_char_unchecked()
    }
}
/////////////////////////////////////////////////////////
//General traits that cannot be derived to emulate char//
/////////////////////////////////////////////////////////
/// Hashes the character through its `char` value.
impl hash::Hash for Utf8Char {
    fn hash<H : hash::Hasher>(&self, state: &mut H) {
        let as_char = self.to_char();
        hash::Hash::hash(&as_char, state);
    }
}
/// Formats exactly like the equivalent `char`.
impl fmt::Debug for Utf8Char {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let as_char = self.to_char();
        fmt::Debug::fmt(&as_char, fmtr)
    }
}
/// Writes the character as a `str`.
impl fmt::Display for Utf8Char {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let text = self.as_str();
        fmtr.write_str(text)
    }
}
////////////////////////////////
//Comparisons with other types//
////////////////////////////////
/// Equality with a `char`, decided after encoding the `char` as UTF-8.
impl PartialEq<char> for Utf8Char {
    fn eq(&self, u32c: &char) -> bool {
        let converted = Utf8Char::from(*u32c);
        *self == converted
    }
}
/// Equality with an `Utf8Char`, decided after encoding the `char` as UTF-8.
impl PartialEq<Utf8Char> for char {
    fn eq(&self, u8c: &Utf8Char) -> bool {
        let converted = Utf8Char::from(*self);
        converted == *u8c
    }
}
impl PartialOrd<char> for Utf8Char {
fn partial_cmp(&self, u32c: &char) -> Option<Ordering> {
self.partial_cmp(&Self::from(*u32c))
}
}
/// Ordering against an `Utf8Char`, decided after encoding the `char` as UTF-8.
impl PartialOrd<Utf8Char> for char {
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        let converted = Utf8Char::from(*self);
        converted.partial_cmp(u8c)
    }
}
impl PartialEq<Utf16Char> for Utf8Char {
fn eq(&self, u16c: &Utf16Char) -> bool {
*self == Self::from(*u16c)
}
}
impl PartialOrd<Utf16Char> for Utf8Char {
fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
self.partial_cmp(&Self::from(*u16c))
}
}
// The other direction is implemented in utf16_char.rs
/// An `Utf8Char` equals a byte only if both represent the same ASCII character.
///
/// The opposite direction is intentionally not implemented: this impl is
/// meant for comparing `Utf8Char`s against constants.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf8Char;
/// assert!(Utf8Char::from('8') == b'8');
/// assert!(Utf8Char::from_array([0xf1,0x80,0x80,0x80]).unwrap() != 0xf1);
/// assert!(Utf8Char::from('\u{ff}') != 0xff);
/// assert!(Utf8Char::from('\u{80}') != 0x80);
/// ```
impl PartialEq<u8> for Utf8Char {
    fn eq(&self, byte: &u8) -> bool {
        // The first byte must match and the second byte must be unused (zero).
        let is_single_byte = self.bytes[1] == 0;
        self.bytes[0] == *byte && is_single_byte
    }
}
#[cfg(feature = "ascii")]
/// A non-ASCII `Utf8Char` is never equal to an `AsciiChar`.
impl PartialEq<AsciiChar> for Utf8Char {
    #[inline]
    fn eq(&self, ascii: &AsciiChar) -> bool {
        let code = *ascii as u8;
        code == self.bytes[0]
    }
}
#[cfg(feature = "ascii")]
/// An `AsciiChar` is never equal to a non-ASCII `Utf8Char`.
impl PartialEq<Utf8Char> for AsciiChar {
    #[inline]
    fn eq(&self, u8c: &Utf8Char) -> bool {
        // Delegates to the comparison in the other direction.
        *u8c == *self
    }
}
#[cfg(feature = "ascii")]
/// A non-ASCII `Utf8Char` always orders after any `AsciiChar`.
impl PartialOrd<AsciiChar> for Utf8Char {
    #[inline]
    fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> {
        // Only the first byte takes part in the comparison.
        let first = self.bytes[0];
        first.partial_cmp(ascii)
    }
}
#[cfg(feature = "ascii")]
/// An `AsciiChar` always orders before any non-ASCII `Utf8Char`.
impl PartialOrd<Utf8Char> for AsciiChar {
    #[inline]
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        // Only the first byte takes part in the comparison.
        let first = &u8c.bytes[0];
        self.partial_cmp(first)
    }
}
///////////////////////////////////////////////////////
//pub impls that should be together for nicer rustdoc//
///////////////////////////////////////////////////////
impl Utf8Char {
/// Create an `Utf8Char` from the first codepoint in a `str`.
///
/// The returned `usize` is the number of bytes the codepoint occupies.
///
/// # Errors
///
/// Returns `EmptyStrError` if the `str` is empty.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
///
/// assert_eq!(Utf8Char::from_str_start("a"), Ok((Utf8Char::from('a'),1)));
/// assert_eq!(Utf8Char::from_str_start("ab"), Ok((Utf8Char::from('a'),1)));
/// assert_eq!(Utf8Char::from_str_start("🂠 "), Ok((Utf8Char::from('🂠'),4)));
/// assert_eq!(Utf8Char::from_str_start("e\u{301}"), Ok((Utf8Char::from('e'),1)));// 'e'+u301 combining mark
/// assert!(Utf8Char::from_str_start("").is_err());
/// ```
pub fn from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError> {
    if src.is_empty() {
        return Err(EmptyStrError);
    }
    // SAFETY: `src` is a non-empty `str`, so it starts with a complete and
    // valid UTF-8 sequence.
    Ok(unsafe { Utf8Char::from_slice_start_unchecked(src.as_bytes()) })
}
/// Validate the first codepoint in an UTF-8 slice and store it as an `Utf8Char`.
/// Also returns the length of the UTF-8 sequence for the codepoint.
///
/// If the slice is from a `str`, use `::from_str_start()` to skip UTF-8 validation.
///
/// # Errors
///
/// Returns an `Err` if the slice is empty, doesn't start with a valid
/// UTF-8 sequence or is too short for the sequence.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
/// use encode_unicode::error::InvalidUtf8Slice::*;
/// use encode_unicode::error::InvalidUtf8::*;
///
/// assert_eq!(Utf8Char::from_slice_start(&[b'A', b'B', b'C']), Ok((Utf8Char::from('A'),1)));
/// assert_eq!(Utf8Char::from_slice_start(&[0xdd, 0xbb]), Ok((Utf8Char::from('\u{77b}'),2)));
///
/// assert_eq!(Utf8Char::from_slice_start(&[]), Err(TooShort(1)));
/// assert_eq!(Utf8Char::from_slice_start(&[0xf0, 0x99]), Err(TooShort(4)));
/// assert_eq!(Utf8Char::from_slice_start(&[0xee, b'F', 0x80]), Err(Utf8(NotAContinuationByte(1))));
/// assert_eq!(Utf8Char::from_slice_start(&[0xee, 0x99, 0x0f]), Err(Utf8(NotAContinuationByte(2))));
/// ```
pub fn from_slice_start(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice> {
    match char::from_utf8_slice_start(src) {
        Ok((_, len)) => {
            // Copy only the validated sequence; the rest stays zero.
            let mut bytes = [0; 4];
            bytes[..len].copy_from_slice(&src[..len]);
            Ok((Utf8Char{ bytes: bytes }, len))
        },
        Err(error) => Err(error),
    }
}
/// A `from_slice_start()` that doesn't validate the codepoint.
///
/// Returns the character together with the length of its UTF-8 sequence,
/// as determined from the first byte alone.
///
/// # Safety
///
/// The slice must be non-empty and start with a valid UTF-8 codepoint.
/// Invalid or incomplete values might cause reads past the end of the slice.
pub unsafe fn from_slice_start_unchecked(src: &[u8]) -> (Self,usize) {
    let len = 1+src.get_unchecked(0).extra_utf8_bytes_unchecked();
    let mut bytes = [0u8; 4];
    // Take the destination pointer from the whole array rather than from a
    // borrow of its first element, so that writing up to four bytes through
    // it stays within what the pointer was derived from.
    ptr::copy_nonoverlapping(src.as_ptr(), bytes.as_mut_ptr(), len);
    (Utf8Char{ bytes: bytes }, len)
}
/// Create an `Utf8Char` from a byte array after validating it.
///
/// The codepoint must start at the first byte.
/// Unused bytes are set to zero by this function and so can be anything.
///
/// # Errors
///
/// Returns an `Err` if the array doesn't start with a valid UTF-8 sequence.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
/// use encode_unicode::error::InvalidUtf8Array::*;
/// use encode_unicode::error::InvalidUtf8::*;
/// use encode_unicode::error::InvalidCodepoint::*;
///
/// assert_eq!(Utf8Char::from_array([b'A', 0, 0, 0]), Ok(Utf8Char::from('A')));
/// assert_eq!(Utf8Char::from_array([0xf4, 0x8b, 0xbb, 0xbb]), Ok(Utf8Char::from('\u{10befb}')));
/// assert_eq!(Utf8Char::from_array([b'A', b'B', b'C', b'D']), Ok(Utf8Char::from('A')));
/// assert_eq!(Utf8Char::from_array([0, 0, 0xcc, 0xbb]), Ok(Utf8Char::from('\0')));
///
/// assert_eq!(Utf8Char::from_array([0xef, b'F', 0x80, 0x80]), Err(Utf8(NotAContinuationByte(1))));
/// assert_eq!(Utf8Char::from_array([0xc1, 0x80, 0, 0]), Err(Utf8(OverLong)));
/// assert_eq!(Utf8Char::from_array([0xf7, 0xaa, 0x99, 0x88]), Err(Codepoint(TooHigh)));
/// ```
pub fn from_array(utf8: [u8;4]) -> Result<Self,InvalidUtf8Array> {
unsafe {
// perform all validation
try!(char::from_utf8_array(utf8));
// number of bytes after the first one that belong to the sequence
let extra = utf8[0].extra_utf8_bytes_unchecked() as u32;
// zero unused bytes in one operation by transmuting the array to
// u32, apply an endian-corrected mask and transmute back.
// The shift keeps the low (extra+1) bytes set; `from_le()` swaps the
// mask on big-endian targets so that it covers the first bytes in memory.
let mask = u32::from_le(0xff_ff_ff_ff >> 8*(3-extra));
let unused_zeroed = mask & transmute::<_,u32>(utf8);
Ok(Utf8Char{ bytes: transmute(unused_zeroed) })
}
}
/// Zero-cost constructor.
///
/// Wraps the array as-is: nothing is validated and no byte is modified.
///
/// # Safety
///
/// Must contain a valid codepoint starting at the first byte, with the
/// unused bytes zeroed.
/// Bad values can easily lead to undefined behavior.
#[inline]
pub unsafe fn from_array_unchecked(utf8: [u8;4]) -> Self {
Utf8Char{ bytes: utf8 }
}
/// Create an `Utf8Char` from a single byte.
///
/// The byte must be an ASCII character.
///
/// # Errors
///
/// Returns `NonAsciiError` if the byte is greater than 127.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf8Char;
/// assert_eq!(Utf8Char::from_ascii(b'a').unwrap(), 'a');
/// assert!(Utf8Char::from_ascii(128).is_err());
/// ```
pub fn from_ascii(ascii: u8) -> Result<Self,NonAsciiError> {
    // ASCII bytes are exactly those with the most significant bit clear.
    if ascii & 0x80 != 0 {
        return Err(NonAsciiError);
    }
    Ok(Utf8Char{ bytes: [ascii, 0, 0, 0] })
}
/// Create an `Utf8Char` from a single byte without checking that it's a
/// valid codepoint on its own, which is only true for ASCII characters.
///
/// The byte becomes the first byte of the internal array and the other
/// three are set to zero.
///
/// # Safety
///
/// The byte must be less than 128.
#[inline]
pub unsafe fn from_ascii_unchecked(ascii: u8) -> Self {
Utf8Char{ bytes: [ascii, 0, 0, 0] }
}
/// The number of bytes this character needs.
///
/// Is between 1 and 4 (inclusive) and identical to `.as_ref().len()` or
/// `.as_char().len_utf8()`.
#[inline]
pub fn len(self) -> usize {
// Invariants of the extra bytes enables algorithms that
// `u8.extra_utf8_bytes_unchecked()` cannot use.
// Some of them turned out to require fewer x86 instructions:
// Exploits that unused bytes are zero and calculates the number of
// trailing zero bytes.
// Setting a bit in the first byte prevents the function from returning
// 0 for '\0' (which has 32 leading zeros).
// trailing and leading is swapped below to optimize for little-endian
// architectures.
(4 - (u32::to_le(unsafe{transmute(self.bytes)})|1).leading_zeros()/8) as usize
// Alternative implementation, kept for reference:
// Exploits that the extra bytes have their most significant bit set if
// in use.
// Takes fewer instructions than the one above if popcnt can be used,
// (which it cannot by default,
// set RUSTFLAGS='-C target-cpu=native' to enable)
//let all: u32 = unsafe{transmute(self.bytes)};
//let msb_mask = u32::from_be(0x00808080);
//let add_one = u32::from_be(0x80000000);
//((all & msb_mask) | add_one).count_ones() as usize
}
// There is no .is_empty() because this type is never empty.
/// Checks that the codepoint is an ASCII character.
///
/// Only the first byte is inspected: it is below 128 exactly when the
/// character is ASCII.
pub fn is_ascii(&self) -> bool {
    self.bytes[0] < 0x80
}
/// Checks that two characters are an ASCII case-insensitive match.
///
/// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
#[cfg(feature="std")]
pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
    // Case only matters for ASCII; anything else must match exactly.
    if !self.is_ascii() {
        return self == other;
    }
    // For an ASCII `self` the comparison only needs the first bytes.
    self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])
}
/// Converts the character to its ASCII upper case equivalent.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
#[cfg(feature="std")]
pub fn to_ascii_uppercase(&self) -> Self {
    // Work on a copy so that `self` is left untouched.
    let mut upper = *self;
    upper.make_ascii_uppercase();
    upper
}
/// Converts the character to its ASCII lower case equivalent.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
#[cfg(feature="std")]
pub fn to_ascii_lowercase(&self) -> Self {
    // Work on a copy so that `self` is left untouched.
    let mut lower = *self;
    lower.make_ascii_lowercase();
    lower
}
/// Converts the character to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
#[inline]
#[cfg(feature="std")]
pub fn make_ascii_uppercase(&mut self) {
// Only the first byte is touched; the conversion is delegated to `u8`.
self.bytes[0].make_ascii_uppercase()
}
/// Converts the character to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
#[inline]
#[cfg(feature="std")]
pub fn make_ascii_lowercase(&mut self) {
// Only the first byte is touched; the conversion is delegated to `u8`.
self.bytes[0].make_ascii_lowercase();
}
/// Convert from UTF-8 to UTF-32
///
/// Delegates to the `Into<char>` conversion of this type.
pub fn to_char(self) -> char {
self.into()
}
/// Write the internal representation to a slice,
/// and then returns the number of bytes written.
///
/// # Panics
///
/// Will panic if the buffer is too small;
/// You can get the required length from `.len()`,
/// but a buffer of length four is always large enough.
pub fn to_slice(self, dst: &mut[u8]) -> usize {
    let needed = self.len();
    if dst.len() < needed {
        panic!("The provided buffer is too small.");
    }
    // Only the used bytes are copied; the rest of `dst` is left as it was.
    dst[..needed].copy_from_slice(&self.bytes[..needed]);
    needed
}
/// Expose the internal array and the number of used bytes.
///
/// The array is returned as stored; the second element equals `.len()`.
pub fn to_array(self) -> ([u8;4],usize) {
(self.bytes, self.len())
}
/// Return a `str` view of the array the codepoint is stored as.
///
/// Is an unambiguous version of `.as_ref()`.
pub fn as_str(&self) -> &str {
// Goes through this type's `Deref` implementation.
self.deref()
}
}

View File

@ -0,0 +1,352 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use utf8_char::Utf8Char;
use errors::EmptyStrError;
extern crate core;
use self::core::{mem, u32, u64};
use self::core::ops::Not;
use self::core::fmt;
use self::core::borrow::Borrow;
#[cfg(feature="std")]
use std::io::{Read, Error as ioError};
/// Read or iterate over the bytes of the UTF-8 representation of a codepoint.
// The not yet returned bytes are packed into the `u32` with the next byte
// in the least significant position; positions without a byte hold 0xff.
#[derive(Clone)]
pub struct Utf8Iterator (u32);
impl From<Utf8Char> for Utf8Iterator {
/// Creates an iterator over the bytes of `uc`.
fn from(uc: Utf8Char) -> Self {
// Interpret the array as a little-endian integer so that the first byte
// ends up in the least significant position regardless of architecture.
let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) });
// uses u64 because shifting an u32 by 32 bits is a no-op.
// Sets every bit above the used bytes, so those positions read as 0xff.
let unused_set = (u64::MAX << uc.len() as u64*8) as u32;
Utf8Iterator(used | unused_set)
}
}
impl From<char> for Utf8Iterator {
    /// Encodes `c` as UTF-8 and iterates over the resulting bytes.
    fn from(c: char) -> Self {
        let encoded = Utf8Char::from(c);
        encoded.into()
    }
}
impl Iterator for Utf8Iterator {
    type Item=u8;
    /// Returns the next byte, or `None` once all bytes have been returned.
    fn next(&mut self) -> Option<u8> {
        let current = self.0 as u8;
        // 0xff in the lowest position marks the iterator as exhausted.
        if current == 0xff {
            return None;
        }
        // Drop the returned byte and refill the top with the 0xff marker.
        self.0 = (self.0 >> 8) | 0xff_00_00_00;
        Some(current)
    }
    /// The exact number of remaining bytes, as both lower and upper bound.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len();
        (remaining, Some(remaining))
    }
}
impl ExactSizeIterator for Utf8Iterator {
    /// The number of bytes that have not been returned yet.
    fn len(&self) -> usize {// not straightforward, but possible
        // Inverting turns the 0xff markers in the upper positions into
        // zero bytes, which can then be counted as leading zero bits.
        let inverted = !self.0;
        let exhausted_positions = (inverted.leading_zeros() / 8) as usize;
        4 - exhausted_positions
    }
}
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Always returns Ok
    ///
    /// Fills `buf` from the front with the remaining bytes and returns how
    /// many were written.
    fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> {
        let mut written = 0;
        // Iterating over the destination first guarantees that a byte is
        // only taken from `self` when there is room to store it.
        for slot in buf.iter_mut() {
            if let Some(byte) = self.next() {
                *slot = byte;
                written += 1;
            } else {
                break;
            }
        }
        Ok(written)
    }
}
impl fmt::Debug for Utf8Iterator {
    /// Formats the remaining bytes as a slice, without advancing `self`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let mut content = [0; 4];
        let mut used = 0;
        // Iterate over a clone so that formatting doesn't consume anything.
        for (slot, byte) in content.iter_mut().zip(self.clone()) {
            *slot = byte;
            used += 1;
        }
        write!(fmtr, "{:?}", &content[..used])
    }
}
/// Converts an iterator of `Utf8Char` (or `&Utf8Char`)
/// to an iterator of `u8`s.
/// Is equivalent to calling `.flat_map()` on the original iterator,
/// but the returned iterator is ~40% faster.
///
/// The iterator also implements `Read` (if the `std` feature isn't disabled).
/// Reading will never produce an error, and calls to `.read()` and `.next()`
/// can be mixed.
///
/// The exact number of bytes cannot be known in advance, but `size_hint()`
/// gives the possible range.
/// (min: all remaining characters are ASCII, max: all require four bytes)
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{iter_bytes, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf8() );
/// let mut bytes = [0; 4];
/// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;}
/// assert_eq!(&bytes, b"foo\0");
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{iter_bytes, CharExt, Utf8Char};
///
/// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect();
/// let bytes: Vec<u8> = iter_bytes(&chars).collect();
/// let flat_map: Vec<u8> = chars.iter().flat_map(|u8c| *u8c ).collect();
/// assert_eq!(bytes, flat_map);
/// ```
///
/// `Read`ing from it:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{iter_bytes, CharExt};
/// use std::io::Read;
///
/// let s = "Ååh‽";
/// assert_eq!(s.len(), 8);
/// let mut buf = [b'E'; 9];
/// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() ));
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8);
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
/// assert_eq!(&buf[..8], s.as_bytes());
/// assert_eq!(buf[8], b'E');
/// ```
pub fn iter_bytes<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>>
(iterable: I) -> Utf8CharSplitter<U, I::IntoIter> {
// `prev: 0` means that no bytes of a previous character are pending.
Utf8CharSplitter{ inner: iterable.into_iter(), prev: 0 }
}
/// The iterator type returned by `iter_bytes()`
///
/// See its documentation for details.
#[derive(Clone)]
pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> {
// The source iterator of characters.
inner: I,
// The not yet returned bytes of the most recently fetched character, with
// the next byte in the least significant position; zero if none remain.
prev: u32,
}
impl<I:Iterator<Item=Utf8Char>> From<I> for Utf8CharSplitter<Utf8Char,I> {
/// A less generic constructor than `iter_bytes()`
///
/// Only accepts iterators that yield `Utf8Char` by value, and simply
/// forwards to `iter_bytes()`.
fn from(iter: I) -> Self {
iter_bytes(iter)
}
}
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> {
/// Extracts the source iterator.
///
/// Note that `iter_bytes(iter.into_inner())` is not a no-op:
/// If the last returned byte from `next()` was not an ASCII byte,
/// the remaining bytes of that codepoint are lost.
pub fn into_inner(self) -> I {
// The pending bytes in `prev` are dropped together with `self`.
self.inner
}
}
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> {
type Item = u8;
/// Returns a pending byte of the current character if there is one,
/// otherwise the first byte of the next character from the source.
fn next(&mut self) -> Option<Self::Item> {
if self.prev == 0 {
// Nothing pending: fetch the next character, return its first byte
// and keep the following bytes in `prev`.
self.inner.next().map(|u8c| {
let array = u8c.borrow().to_array().0;
self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8;
array[0]
})
} else {
// Hand out the lowest pending byte and shift the rest down.
let next = self.prev as u8;
self.prev >>= 8;
Some(next)
}
}
/// Lower bound: every remaining character takes one byte;
/// upper bound: every remaining character takes four bytes.
/// Both include the bytes still pending in `prev`.
fn size_hint(&self) -> (usize,Option<usize>) {
// Doesn't need to handle unlikely overflows correctly because
// size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
let (min, max) = self.inner.size_hint();
// Number of bytes up to and including the highest non-zero one in `prev`.
let add = 4 - (self.prev.leading_zeros() / 8) as usize;
(min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) ))
}
}
#[cfg(feature="std")]
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Read for Utf8CharSplitter<U,I> {
/// Always returns `Ok`
///
/// Writes as many bytes as fit into `buf` and returns that count.
/// A character that only partially fits is split: the bytes that didn't
/// fit are kept in `prev` for the next call to `read()` or `next()`.
fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> {
let mut i = 0;
// write remaining bytes of previous codepoint
while self.prev != 0 && i < buf.len() {
buf[i] = self.prev as u8;
self.prev >>= 8;
i += 1;
}
// write whole characters
while i < buf.len() {
let bytes = match self.inner.next() {
Some(u8c) => u8c.borrow().to_array().0,
None => break
};
// There is room for at least one byte, so the first is always written.
buf[i] = bytes[0];
i += 1;
// A non-zero second byte means the character has more than one byte.
if bytes[1] != 0 {
// The number of leading one bits in the first byte is the total length.
let len = bytes[0].not().leading_zeros() as usize;
let mut written = 1;
while written < len {
if i < buf.len() {
buf[i] = bytes[written];
i += 1;
written += 1;
} else {
// Buffer is full mid-character: stash the unwritten bytes in `prev`.
let bytes_as_u32 = unsafe{ u32::from_le(mem::transmute(bytes)) };
self.prev = bytes_as_u32 >> (8*written);
return Ok(i);
}
}
}
}
Ok(i)
}
}
/// An iterator over the `Utf8Char` of a string slice, and their positions.
///
/// This struct is created by the `utf8char_indices()` method from [`StrExt`] trait. See its documentation for more.
#[derive(Clone)]
pub struct Utf8CharIndices<'a>{
// The source string; shortened from the end when iterating backwards.
str: &'a str,
// Byte offset into `str` of the next character to return from the front.
index: usize,
}
impl<'a> From<&'a str> for Utf8CharIndices<'a> {
    /// Creates an iterator positioned at the start of `s`.
    fn from(s: &'a str) -> Self {
        Utf8CharIndices{ str: s, index: 0 }
    }
}
impl<'a> Utf8CharIndices<'a> {
/// Extract the remainder of the source `str`.
///
/// The returned slice excludes the characters that have already been
/// returned from either end.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf8Char};
/// let mut iter = "abc".utf8char_indices();
/// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c'))));
/// assert_eq!(iter.next(), Some((0, Utf8Char::from('a'))));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
&self.str[self.index..]
}
}
impl<'a> Iterator for Utf8CharIndices<'a> {
    type Item = (usize,Utf8Char);
    /// Returns the byte offset and value of the next character from the
    /// front, or `None` when the remainder is empty.
    fn next(&mut self) -> Option<(usize,Utf8Char)> {
        let remainder = &self.str[self.index..];
        // The only possible error is the remainder being empty.
        if let Ok((u8c, len)) = Utf8Char::from_str_start(remainder) {
            let start = self.index;
            self.index = start + len;
            Some((start, u8c))
        } else {
            None
        }
    }
    /// Lower bound: all remaining characters take four bytes;
    /// upper bound: all remaining characters take one byte.
    fn size_hint(&self) -> (usize,Option<usize>) {
        let remaining_bytes = self.str.len() - self.index;
        // For len+3 to overflow, the slice must fill all but two bytes of
        // addressable memory, and size_hint() doesn't need to be correct.
        let at_least = remaining_bytes.wrapping_add(3) / 4;
        (at_least, Some(remaining_bytes))
    }
}
impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
/// Returns the byte offset and value of the last remaining character,
/// and shortens the stored string so it is not returned again.
fn next_back(&mut self) -> Option<(usize,Utf8Char)> {
// Cannot refactor out the unwrap without switching to ::from_slice()
// since slicing the str panics if not on a boundary.
if self.index < self.str.len() {
let rev = self.str.bytes().rev();
// Trailing continuation bytes (`10xx_xxxx`) plus one leading byte
// make up the last character.
let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
let starts = self.str.len() - len;
let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap();
// Cut the returned character off the end of the source.
self.str = &self.str[..starts];
Some((starts, u8c))
} else {
None
}
}
}
impl<'a> fmt::Debug for Utf8CharIndices<'a> {
/// Formats as a tuple struct with the current front offset and the
/// remaining part of the source string.
fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
fmtr.debug_tuple("Utf8CharIndices")
.field(&self.index)
.field(&self.as_str())
.finish()
}
}
/// An iterator over the codepoints in a `str` represented as `Utf8Char`.
// Thin wrapper around `Utf8CharIndices` that discards the positions.
#[derive(Clone)]
pub struct Utf8Chars<'a>(Utf8CharIndices<'a>);
impl<'a> From<&'a str> for Utf8Chars<'a> {
    /// Creates an iterator positioned at the start of `s`.
    fn from(s: &'a str) -> Self {
        let indices = Utf8CharIndices::from(s);
        Utf8Chars(indices)
    }
}
impl<'a> Utf8Chars<'a> {
/// Extract the remainder of the source `str`.
///
/// Forwards to the wrapped `Utf8CharIndices`.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf8Char};
/// let mut iter = "abc".utf8chars();
/// assert_eq!(iter.next(), Some(Utf8Char::from('a')));
/// assert_eq!(iter.next_back(), Some(Utf8Char::from('c')));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
self.0.as_str()
}
}
impl<'a> Iterator for Utf8Chars<'a> {
    type Item = Utf8Char;
    /// Returns the next character from the front, without its position.
    fn next(&mut self) -> Option<Utf8Char> {
        match self.0.next() {
            Some((_, u8c)) => Some(u8c),
            None => None,
        }
    }
    /// Same bounds as the wrapped `Utf8CharIndices`.
    fn size_hint(&self) -> (usize,Option<usize>) {
        self.0.size_hint()
    }
}
impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
    /// Returns the next character from the back, without its position.
    fn next_back(&mut self) -> Option<Utf8Char> {
        match self.0.next_back() {
            Some((_, u8c)) => Some(u8c),
            None => None,
        }
    }
}
impl<'a> fmt::Debug for Utf8Chars<'a> {
    /// Formats as a tuple struct named after this type, showing the
    /// remaining part of the source string.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Use this type's own name, so the output cannot be mistaken for
        // that of a `Utf8CharIndices`.
        fmtr.debug_tuple("Utf8Chars")
            .field(&self.as_str())
            .finish()
    }
}

197
vendor/encode_unicode/tests/errs.rs vendored Normal file
View File

@ -0,0 +1,197 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Test that methods gives the correct error.
//! Some also test a bit more because it's easy.
extern crate core;
use core::char;
extern crate encode_unicode;
use encode_unicode::*;
use encode_unicode::error::*;
use encode_unicode::error::InvalidUtf8Array as a;
use encode_unicode::error::InvalidUtf8Slice as s;
use encode_unicode::error::InvalidCodepoint::*;
use encode_unicode::error::InvalidUtf8::*;
use encode_unicode::error::InvalidUtf8FirstByte::*;
#[test]
fn from_u32() {
    // Every value in the surrogate range must be reported as reserved.
    for surrogate in 0xd800..0xe000 {
        assert_eq!(char::from_u32_detailed(surrogate), Err(Utf16Reserved));
    }
    // Sample values from 0x11_00_00 upwards until the addition overflows.
    // Don't test every value. (Range.step_by() is unstable)
    let mut candidate = Some(0x11_00_00);
    while let Some(c) = candidate {
        assert_eq!(char::from_u32_detailed(c), Err(TooHigh));
        candidate = c.checked_add(0x10_11_11);
    }
}
// Checks `extra_utf8_bytes()` against the expected result for every
// possible byte value.
#[test]
fn utf8_extra_bytes() {
for c in 0..256 {
assert_eq!( (c as u8).extra_utf8_bytes(), match c {
0b_1000_0000...0b_1011_1111 => Err(ContinuationByte),
0b_1111_1000...0b_1111_1111 => Err(TooLongSeqence),
0b_0000_0000...0b_0111_1111 => Ok(0),
0b_1100_0000...0b_1101_1111 => Ok(1),
0b_1110_0000...0b_1110_1111 => Ok(2),
0b_1111_0000...0b_1111_0111 => Ok(3),
// `c` never leaves 0..256, which the arms above cover completely.
_ => unreachable!(),
});
}
}
// Checks `utf16_needs_extra_unit()` against the expected result for every
// possible `u16` value.
#[test]
fn utf16_extra_unit() {
for c in 0..0x1_00_00 {
assert_eq!( (c as u16).utf16_needs_extra_unit(), match c {
0b_0000_0000_0000_0000...0b_1101_0111_1111_1111 => Ok(false),
0b_1101_1000_0000_0000...0b_1101_1011_1111_1111 => Ok(true),
0b_1101_1100_0000_0000...0b_1101_1111_1111_1111 => Err(InvalidUtf16FirstUnit),
0b_1110_0000_0000_0000...0b_1111_1111_1111_1111 => Ok(false),
// `c` never leaves 0..0x1_00_00, which the arms above cover completely.
_ => unreachable!(),
});
}
}
// Checks that `from_utf16_tuple()` reports the expected error variant
// for each kind of invalid tuple.
#[test]
fn from_utf16_tuple() {
use encode_unicode::error::InvalidUtf16Tuple::*;
// First unit in 0xdc00..0xe000: rejected whether or not a second unit is given.
for u in 0xdc00..0xe000 {
let close = if u%3==0 {u-100} else {u+100};
let doesnt_matter = if u%2==0 {Some(close)} else {None};
assert_eq!(char::from_utf16_tuple((u,doesnt_matter)), Err(FirstIsTrailingSurrogate));
}
// First unit outside 0xd800..0xe000: a second unit must not be present.
for u in (0..0xd800).chain(0xe000..0x10000) {
assert_eq!(
char::from_utf16_tuple((u as u16, Some((0x100+u) as u16))),
Err(SuperfluousSecond)
);
}
// First unit in 0xd800..0xdc00: a second unit is required, and
// `u - 0x2ff` is not accepted as one.
for u in 0xd800..0xdc00 {
assert_eq!(char::from_utf16_tuple((u,None)), Err(MissingSecond));
assert_eq!(char::from_utf16_tuple((u,Some(u - 0x2ff))), Err(InvalidSecond));
}
}
// Checks that `from_utf16_slice_start()` reports the expected error variant
// for empty slices and for slices starting with an unpaired surrogate.
#[test]
fn from_utf16_slice_start() {
use encode_unicode::error::InvalidUtf16Slice::*;
assert_eq!(char::from_utf16_slice_start(&[]), Err(EmptySlice));
let mut buf = [0; 6];
// First unit in 0xd800..0xdc00, alone or followed by the same value again.
for u in 0xd800..0xdc00 {
buf[0] = u;
assert_eq!(char::from_utf16_slice_start(&buf[..1]), Err(MissingSecond));
buf[1] = u;
// Vary the slice length between 2 and 5 units.
let pass = 2 + (u as usize % (buf.len()-2));
assert_eq!(char::from_utf16_slice_start(&buf[..pass]), Err(SecondNotLowSurrogate));
}
// First unit in 0xdc00..0xe000.
for u in 0xdc00..0xe000 {
buf[0] = u;
let close = if u%3==0 {u-100} else {u+100};
// Vary the slice length between 1 and 5 units.
let pass = 1 + (u as usize % (buf.len()-1));
// NOTE(review): index `pass` is one past the end of `&buf[..pass]`, so
// `close` is not part of the slice tested below; confirm this is intended.
buf[pass] = close;
assert_eq!(char::from_utf16_slice_start(&buf[..pass]), Err(FirstLowSurrogate));
}
}
// Checks that all four decoding entry points report `OverLong` for each
// listed pair of starting bytes.
#[test]
fn utf8_overlong() {
let overlongs = [
[0xf0,0x8f], [0xf0,0x87], [0xf0,0x80], // 4-byte
[0xe0,0x9f], [0xe0,0x8f], [0xe0,0x80], // 3-byte
[0xc1,0xbf], [0xc1,0x92], [0xc1,0x80], // 2-byte
[0xc0,0xbf], [0xc0,0x9f], [0xc0,0x80], // 2-byte
];
for o in overlongs.iter() {
// Fill the last two bytes with the lowest and highest continuation byte.
for &last in &[0x80, 0xbf] {
let arr = [o[0], o[1], last, last];
assert_eq!(char::from_utf8_slice_start(&arr), Err(InvalidUtf8Slice::Utf8(OverLong)));
assert_eq!(char::from_utf8_array(arr), Err(InvalidUtf8Array::Utf8(OverLong)));
assert_eq!(Utf8Char::from_slice_start(&arr), Err(InvalidUtf8Slice::Utf8(OverLong)));
assert_eq!(Utf8Char::from_array(arr), Err(InvalidUtf8Array::Utf8(OverLong)));
}
}
}
// Checks that both character types reject an empty string with `EmptyStrError`.
#[test]
fn from_str_start() {
assert_eq!(Utf8Char::from_str_start(""), Err(EmptyStrError));
assert_eq!(Utf16Char::from_str_start(""), Err(EmptyStrError));
}
// Checks that all four decoding entry points report `TooHigh` for
// sequences starting with 0xf4 0x90 or with 0xf5.
#[test] fn utf8_codepoint_is_too_high() {
assert_eq!(Utf8Char::from_array([0xf4, 0x90, 0x80, 0x80]), Err(a::Codepoint(TooHigh)));
assert_eq!(char::from_utf8_array([0xf4, 0x90, 0x80, 0x80]), Err(a::Codepoint(TooHigh)));
assert_eq!(Utf8Char::from_slice_start(&[0xf4, 0x90, 0x80, 0x80]), Err(s::Codepoint(TooHigh)));
assert_eq!(char::from_utf8_slice_start(&[0xf4, 0x90, 0x80, 0x80]), Err(s::Codepoint(TooHigh)));
assert_eq!(Utf8Char::from_array([0xf5, 0x88, 0x99, 0xaa]), Err(a::Codepoint(TooHigh)));
assert_eq!(char::from_utf8_array([0xf5, 0xaa, 0xbb, 0x88]), Err(a::Codepoint(TooHigh)));
assert_eq!(Utf8Char::from_slice_start(&[0xf5, 0x99, 0xaa, 0xbb]), Err(s::Codepoint(TooHigh)));
assert_eq!(char::from_utf8_slice_start(&[0xf5, 0xbb, 0x88, 0x99]), Err(s::Codepoint(TooHigh)));
}
// Checks that all four decoding entry points report `Utf16Reserved` for
// sequences starting with 0xed 0xa0 or 0xed 0xbf.
#[test] fn utf8_codepoint_is_utf16_reserved() {
assert_eq!(Utf8Char::from_array([0xed, 0xa0, 0x80, 0xff]), Err(a::Codepoint(Utf16Reserved)));
assert_eq!(char::from_utf8_array([0xed, 0xa0, 0x8f, 0x00]), Err(a::Codepoint(Utf16Reserved)));
assert_eq!(Utf8Char::from_slice_start(&[0xed, 0xa0, 0xbe, 0xa5]), Err(s::Codepoint(Utf16Reserved)));
assert_eq!(char::from_utf8_slice_start(&[0xed, 0xa0, 0xbf]), Err(s::Codepoint(Utf16Reserved)));
assert_eq!(Utf8Char::from_array([0xed, 0xbf, 0x80, 0xff]), Err(a::Codepoint(Utf16Reserved)));
assert_eq!(char::from_utf8_array([0xed, 0xbf, 0x8f, 0x00]), Err(a::Codepoint(Utf16Reserved)));
assert_eq!(Utf8Char::from_slice_start(&[0xed, 0xbf, 0xbe, 0xa5]), Err(s::Codepoint(Utf16Reserved)));
assert_eq!(char::from_utf8_slice_start(&[0xed, 0xbf, 0xbf]), Err(s::Codepoint(Utf16Reserved)));
}
// Checks that every first byte in 0x80..0xc0 is rejected as a continuation
// byte, whatever follows it and whatever the slice length.
#[test] fn utf8_first_is_continuation_byte() {
for first in 0x80..0xc0 {
// The following bytes are just varying filler derived from `first`.
let arr = [first, first<<2, first<<4, first<<6];
assert_eq!(Utf8Char::from_array(arr), Err(a::Utf8(FirstByte(ContinuationByte))));
assert_eq!(char::from_utf8_array(arr), Err(a::Utf8(FirstByte(ContinuationByte))));
// Vary the slice length between 1 and 3 bytes.
let len = (1 + first%3) as usize;
assert_eq!(Utf8Char::from_slice_start(&arr[..len]), Err(s::Utf8(FirstByte(ContinuationByte))));
assert_eq!(char::from_utf8_slice_start(&arr[..len]), Err(s::Utf8(FirstByte(ContinuationByte))));
}
}
// Checks that every first byte from 0xf8 to 0xff is rejected with
// `TooLongSeqence`, for arrays as well as for short and long slices.
#[test] fn utf8_too_long() {
for first in 0xf8..0x100 {
let arr = [first as u8, 0x88, 0x80, 0x80];
assert_eq!(Utf8Char::from_array(arr), Err(a::Utf8(FirstByte(TooLongSeqence))));
assert_eq!(char::from_utf8_array(arr), Err(a::Utf8(FirstByte(TooLongSeqence))));
let arr = [first as u8, 0x88, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80];
// Alternate between a one-byte slice and the full eight bytes.
let slice = &arr[..if first&1 == 0 {1} else {8}];
assert_eq!(Utf8Char::from_slice_start(slice), Err(s::Utf8(FirstByte(TooLongSeqence))));
assert_eq!(char::from_utf8_slice_start(slice), Err(s::Utf8(FirstByte(TooLongSeqence))));
}
}
// Checks that a corrupted byte after the first one is reported as
// `NotAContinuationByte` with the index of the first bad byte.
#[test] fn utf8_not_continuation_byte() {
for first in 0xc2..0xf4 {
let mut arr = [first, 0x90, 0xa0, 0xb0];
let extra = first.extra_utf8_bytes().unwrap();
// Going backwards keeps the later bytes corrupted, so the expected
// index is always the lowest corrupted one.
// NOTE(review): `1..extra` excludes index `extra`, so the last byte of
// a sequence is never corrupted and the loop body does not run at all
// when `extra` is 1 — confirm this is intended.
for corrupt in (1..extra).rev() {
let expected = NotAContinuationByte(corrupt);
for &bad in &[0x00, 0x3f, 0x40, 0x7f, 0xc0, 0xff] {
arr[corrupt] = bad;
assert_eq!(Utf8Char::from_array(arr), Err(a::Utf8(expected)), "{:?}", arr);
assert_eq!(char::from_utf8_array(arr), Err(a::Utf8(expected)));
// Alternate between an exact-length slice and the whole array.
let slice = if first&1 == 0 {&arr[..1+extra]} else {&arr};
assert_eq!(Utf8Char::from_slice_start(slice), Err(s::Utf8(expected)), "{:?}", slice);
assert_eq!(char::from_utf8_slice_start(slice), Err(s::Utf8(expected)));
}
}
}
}

View File

@ -0,0 +1,34 @@
/* Copyright 2018 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Tests that try all possible values for at least one parameter / byte / unit
//! of the tested function.
use std::char;
extern crate encode_unicode;
use encode_unicode::*;
#[test]
fn from_ascii() {
    // Try every byte value: exactly those with the top bit clear must be
    // accepted, and they must equal the conversion from `char`.
    for cp in 0u32..256 {
        let byte = cp as u8;
        let result = Utf8Char::from_ascii(byte);
        assert_eq!(result.is_ok(), cp & 0x80 == 0);
        if let Ok(u8c) = result {
            assert_eq!(u8c, Utf8Char::from(byte as char));
        }
    }
}
#[test]
fn from_bmp() {
    // Try every `u16` value: `from_bmp()` must succeed exactly when the
    // value is a valid `char`, and then agree with the conversion from it.
    for cp in 0u32..0x1_00_00 {
        let actual = Utf16Char::from_bmp(cp as u16).ok();
        let expected = char::from_u32(cp).map(Utf16Char::from);
        assert_eq!(actual, expected);
    }
}

182
vendor/encode_unicode/tests/iterators.rs vendored Normal file
View File

@ -0,0 +1,182 @@
/* Copyright 2018 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Iterator tests
#![cfg(feature="std")]
extern crate encode_unicode;
use encode_unicode::{IterExt, SliceExt, CharExt};
use encode_unicode::iterator::Utf8CharSplitter;
use encode_unicode::error::InvalidUtf8Slice::*;
use encode_unicode::error::InvalidUtf8::*;
use encode_unicode::error::InvalidUtf8FirstByte::*;
use encode_unicode::error::InvalidCodepoint::*;
use encode_unicode::error::Utf16PairError::*;
use std::io::Read;
use std::cmp::min;
// Checks size hints, Debug output and error reporting of the iterator
// returned by `to_utf8chars()` on a byte sequence with an invalid character.
#[test] fn utf8charmerger() {
let slice = b"\xf0\xa1\x92X\xcc\xbb";
let mut iter = slice.iter().to_utf8chars();
assert_eq!(iter.size_hint(), (1, Some(6)));
assert_eq!(format!("{:?}", &iter),
format!("Utf8CharMerger {{ buffered: [], inner: {:?} }}", slice.iter()));
// `X` sits where the third continuation byte of the first sequence is expected.
assert_eq!(iter.next(), Some(Err(Utf8(NotAContinuationByte(3)))));
assert_eq!(iter.size_hint(), (0, Some(5)));
// The three bytes after the first one show up as buffered.
assert_eq!(
format!("{:?}", &iter),
format!("Utf8CharMerger {{ buffered: [161, 146, 88], inner: {:?} }}", slice[4..].iter())
);
assert_eq!(iter.next(), Some(Err(Utf8(FirstByte(ContinuationByte)))));
assert_eq!(iter.into_inner().next(), Some(&b'\xcc'));
}
// Checks size hints, Debug output and error reporting of the iterator
// returned by `utf8char_indices()` on a byte slice.
#[test] fn utf8chardecoder() {
let slice = b"\xf4\xbf\x80\x80XY\xcc\xbbZ_";
let mut iter = slice.utf8char_indices();
assert_eq!(iter.size_hint(), (2, Some(10)));
assert_eq!(
format!("{:?}", &iter),
format!("Utf8CharDecoder {{ bytes[0..]: {:?} }}", &slice)
);
// The error item reports offset 0 and a length of one byte.
assert_eq!(iter.next(), Some((0, Err(Codepoint(TooHigh)), 1)));
assert_eq!(
format!("{:?}", &iter),
format!("Utf8CharDecoder {{ bytes[1..]: {:?} }}", &slice[1..])
);
assert_eq!(iter.size_hint(), (2, Some(9)));
assert_eq!(iter.count(), 8);
}
// Checks size hints, Debug output and error reporting of the iterator
// returned by `to_utf16chars()` on units with an unmatched surrogate.
#[test] fn utf16charmerger() {
let slice = [0xd800, 'x' as u16, 0xd900, 0xdfff, 'λ' as u16];
let mut iter = slice.iter().to_utf16chars();
assert_eq!(iter.size_hint(), (2, Some(5)));
assert_eq!(format!("{:?}", &iter),
format!("Utf16CharMerger {{ buffered: None, inner: {:?} }}", slice.iter()));
assert_eq!(iter.next(), Some(Err(UnmatchedLeadingSurrogate)));
assert_eq!(iter.size_hint(), (1, Some(4)));
// The unit after the unmatched surrogate ('x' == 120) shows up as buffered.
assert_eq!(
format!("{:?}", &iter),
format!("Utf16CharMerger {{ buffered: Some(120), inner: {:?} }}", slice[2..].iter())
);
assert_eq!(iter.into_inner().next(), Some(&0xd900));
}
// Checks size hints, Debug output and error reporting of the iterator
// returned by `utf16char_indices()` on a unit slice.
#[test] fn utf16chardecoder() {
let slice = [0xd800, 'x' as u16, 0xd900, 0xdfff, 'λ' as u16];
let mut iter = slice.utf16char_indices();
assert_eq!(iter.size_hint(), (2, Some(5)));
assert_eq!(
format!("{:?}", &iter),
format!("Utf16CharDecoder {{ units[0..]: {:?} }}", &slice)
);
// The error item reports offset 0 and a length of one unit.
assert_eq!(iter.next(), Some((0, Err(UnmatchedLeadingSurrogate), 1)));
assert_eq!(
format!("{:?}", &iter),
format!("Utf16CharDecoder {{ units[1..]: {:?} }}", &slice[1..])
);
assert_eq!(iter.size_hint(), (2, Some(4)));
assert_eq!(iter.count(), 3);
}
/// Tests for ensuring that iterators which also implement Read support
/// interleaving calls of `read()` and `next()`, and that they implement Read
/// correctly (support any buffer size at any time).
#[test] fn read_single_ascii() {
let uc = 'a'.to_utf8();
assert_eq!(uc.len(), 1);
// Repeat with read buffers of 1 to 4 bytes.
for chunk in 1..5 {
let mut buf = [b'E'; 6];
let mut iter = uc.into_iter();
let mut written = 0;
for _ in 0..4 {
// Reading into an empty buffer must not consume anything.
assert_eq!(iter.read(&mut buf[..0]).unwrap(), 0);
let wrote = iter.read(&mut buf[written..written+chunk]).unwrap();
assert_eq!(wrote, min(1-written, chunk));
written += wrote;
// Everything after the written bytes must still hold the filler.
for &b in &buf[written..] {assert_eq!(b, b'E');}
assert_eq!(buf[..written], AsRef::<[u8]>::as_ref(&uc)[..written]);
}
assert_eq!(written, 1);
}
}
// Same as `read_single_ascii`, but for a character that takes two bytes.
#[test] fn read_single_nonascii() {
let uc = 'ä'.to_utf8();
assert_eq!(uc.len(), 2);
// Repeat with read buffers of 1 to 4 bytes.
for chunk in 1..5 {
let mut buf = [b'E'; 6];
let mut iter = uc.into_iter();
let mut written = 0;
for _ in 0..4 {
// Reading into an empty buffer must not consume anything.
assert_eq!(iter.read(&mut buf[..0]).unwrap(), 0);
let wrote = iter.read(&mut buf[written..written+chunk]).unwrap();
assert_eq!(wrote, min(2-written, chunk));
written += wrote;
// Everything after the written bytes must still hold the filler.
for &b in &buf[written..] {assert_eq!(b, b'E');}
assert_eq!(buf[..written], AsRef::<[u8]>::as_ref(&uc)[..written]);
}
assert_eq!(written, 2);
}
}
// Reads a mixed-width string through `Utf8CharSplitter` in fixed-size
// chunks and checks that each read returns exactly the expected bytes.
#[test] fn utf8charsplitter_read_all_sizes() {
let s = "1111\u{104444}\u{222}1\u{833}1111\u{100004}";
assert!(s.len()%3 == 1);
let mut buf = vec![b'E'; s.len()+6];
for size in 2..6 {//s.len()+4 {
let mut reader = Utf8CharSplitter::from(s.chars().map(|c| c.to_utf8() ));
for (offset, part) in s.as_bytes().chunks(size).enumerate() {
// For the final, shorter chunk offer the rest of the buffer instead.
let read_to = if part.len() == size {(offset+1)*size} else {buf.len()};
assert_eq!(reader.read(&mut buf[offset*size..read_to]).unwrap(), part.len());
assert_eq!(&buf[..offset*size+part.len()], &s.as_bytes()[..offset*size+part.len()]);
}
// Once exhausted, reading returns 0 and leaves the filler untouched.
assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
assert!(buf[s.len()..].iter().all(|&b| b==b'E' ));
}
}
// Alternates between `next()` and one-byte `read()` calls on a
// `Utf8CharSplitter` and checks that every byte arrives in order.
#[test] fn utf8charsplitter_alternate_iter_read() {
let s = "1111\u{104444}\u{222}1\u{833}1111\u{100004}";
let mut buf = [b'0'; 10];
// Run twice so that each byte position is fetched once by each method.
for n in 0..2 {
// need to collect to test size_hint()
// because chars().size_hint() returns ((bytes+3)/4, Some(bytes))
let u8chars = s.chars().map(|c| c.to_utf8() ).collect::<Vec<_>>();
let mut iter: Utf8CharSplitter<_,_> = u8chars.into_iter().into();
for (i, byte) in s.bytes().enumerate() {
// These two values are only used in the diagnostic output below.
let until_next = s.as_bytes()[i..].iter().take_while(|&b| (b>>6)==0b10u8 ).count();
let remaining_chars = s[i+until_next..].chars().count();
println!("{}. run: byte {:02} of {}, remaining: {:02}+{}: 0b{:08b} = {:?}",
n, i, s.len(), remaining_chars, until_next, byte, byte as char);
// Reading into an empty buffer must not consume anything.
assert_eq!(iter.read(&mut[][..]).unwrap(), 0);
if i % 2 == n {
assert_eq!(iter.next(), Some(byte));
} else {
assert_eq!(iter.read(&mut buf[..1]).unwrap(), 1);
assert_eq!(buf[0], byte);
}
}
assert_eq!(iter.size_hint(), (0, Some(0)));
assert_eq!(iter.next(), None);
assert_eq!(iter.read(&mut buf[..]).unwrap(), 0);
}
}

284
vendor/encode_unicode/tests/oks.rs vendored Normal file
View File

@ -0,0 +1,284 @@
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Test that every method gives the correct result for valid values.
//! Except iterators, which are stateful.
use std::char;
use std::str::{self,FromStr};
use std::cmp::Ordering;
use std::hash::{Hash,Hasher};
use std::collections::hash_map::DefaultHasher;
#[allow(deprecated,unused)]
use std::ascii::AsciiExt;
use std::iter::FromIterator;
extern crate encode_unicode;
use encode_unicode::*;
// The default value of both character types must convert to the default `char`.
#[test]
fn equal_defaults() {
assert_eq!(Utf8Char::default().to_char(), char::default());
assert_eq!(Utf16Char::default().to_char(), char::default());
}
// Both character types must take exactly as much memory as a `char`.
#[test]
fn same_size_as_char() {
use std::mem::size_of;
assert_eq!(size_of::<Utf8Char>(), size_of::<char>());
assert_eq!(size_of::<Utf16Char>(), size_of::<char>());
}
// Checks that a `String` can be collected from and extended with
// `Utf16Char`s, both by reference and by value.
#[test]
fn utf16chars_to_string() {
let s = "\u{10ffff}\u{100000}\u{fee1}";
let u16cs = s.chars().map(|c| Utf16Char::from(c) ).collect::<Vec<Utf16Char>>();
// From references.
let mut from_refs: String = u16cs.iter().collect();
assert_eq!(&from_refs, s);
from_refs.extend(&u16cs);
assert_eq!(&from_refs[s.len()..], s);
// From values.
let mut from_vals: String = u16cs.iter().cloned().collect();
assert_eq!(&from_vals, s);
from_vals.extend(u16cs);
assert_eq!(&from_vals[s.len()..], s);
}
// Characters at the boundaries of the ASCII letter ranges and of each
// encoded length, plus one value between each pair, in ascending order.
const EDGES_AND_BETWEEN: [char;19] = [
'\u{0}',// min
'\u{3b}',// middle ASCII
'A',// min ASCII uppercase
'N',// middle ASCII uppercase
'Z',// max ASCII uppercase
'a',// min ASCII lowercase
'm',// middle ASCII lowercase
'z',// max ASCII lowercase
'\u{7f}',// max ASCII and 1-byte UTF-8
'\u{80}',// min 2-byte UTF-8
'\u{111}',// middle
'\u{7ff}',// max 2-byte UTF-8
'\u{800}',// min 3-byte UTF-8
'\u{d7ff}',// before reserved
'\u{e000}',// after reserved
'\u{ffff}',// max UTF-16 single and 3-byte UTF-8
'\u{10000}',// min UTF-16 surrogate and 4-byte UTF-8
'\u{abcde}',// middle
'\u{10ffff}',// max
];
/// Checks equality, ordering, hashing and ASCII-case-insensitive comparison for the
/// UTF-8 and UTF-16 encodings of `c`: against themselves, against `c`, against each
/// other, and against every codepoint in `EDGES_AND_BETWEEN`.
///
/// Returns both encodings of `c` so the caller can keep using them.
fn eq_cmp_hash(c: char) -> (Utf8Char, Utf16Char) {
    /// Hashes `value` with a fresh `DefaultHasher`.
    fn hash_of<T: Hash>(value: T) -> u64 {
        #[allow(deprecated)]
        let mut state = DefaultHasher::new();
        value.hash(&mut state);
        state.finish()
    }

    let same = Some(Ordering::Equal);

    // UTF-8 value against itself.
    let u8c = c.to_utf8();
    assert_eq!(u8c.to_char(), c);
    assert_eq!(u8c, u8c);
    // Hashing must at least be deterministic.
    assert_eq!(hash_of(u8c), hash_of(u8c));
    assert_eq!(u8c.cmp(&u8c), Ordering::Equal);
    assert!(u8c.eq_ignore_ascii_case(&u8c));

    // UTF-16 value against itself; its hash must match that of the `char`.
    let u16c = c.to_utf16();
    assert_eq!(u16c.to_char(), c);
    assert_eq!(u16c, u16c);
    assert_eq!(hash_of(u16c), hash_of(c));
    assert_eq!(u16c.cmp(&u16c), Ordering::Equal);
    assert!(u16c.eq_ignore_ascii_case(&u16c));

    // Equality across the three representations, in both directions.
    assert_eq!(u8c, c);
    assert_eq!(c, u8c);
    assert_eq!(u16c, c);
    assert_eq!(c, u16c);
    assert_eq!(u8c, u16c);
    assert_eq!(u16c, u8c);

    // Equality with a bare code unit only holds when the codepoint fits in that unit.
    assert_eq!(u8c == c as u8, c <= '\u{7F}');
    assert_eq!(u16c == c as u8, c <= '\u{FF}');
    assert_eq!(u16c == c as u16, c <= '\u{FFFF}');

    // Ordering across the three representations, in both directions.
    assert_eq!(u8c.partial_cmp(&c), same);
    assert_eq!(c.partial_cmp(&u8c), same);
    assert_eq!(u16c.partial_cmp(&c), same);
    assert_eq!(c.partial_cmp(&u16c), same);
    assert_eq!(u8c.partial_cmp(&u16c), same);
    assert_eq!(u16c.partial_cmp(&u8c), same);

    for &other in EDGES_AND_BETWEEN.iter() {
        // Reference results, computed on plain `char`s and integers only.
        let chars_equal = c == other;
        let hashes_equal = hash_of(c) == hash_of(other);
        let total_order = c.cmp(&other);
        let c_vs_other = c.partial_cmp(&other);
        let other_vs_c = other.partial_cmp(&c);
        let equal_ignoring_case = c.eq_ignore_ascii_case(&other);
        let other_byte = other as u8; // truncating on purpose
        let other_unit = other as u16; // truncating on purpose

        // UTF-8 against UTF-8, `char` and `u8`.
        let u8other = other.to_utf8();
        assert_eq!(u8c == u8other, chars_equal);
        assert_eq!(hash_of(u8c) == hash_of(u8other), hashes_equal);
        assert_eq!(u8c.cmp(&u8other), total_order);
        assert_eq!(u8c.eq_ignore_ascii_case(&u8other), equal_ignoring_case);
        assert_eq!(u8c.partial_cmp(&other), c_vs_other);
        assert_eq!(c.partial_cmp(&u8other), c_vs_other);
        assert_eq!(u8other.partial_cmp(&c), other_vs_c);
        assert_eq!(other.partial_cmp(&u8c), other_vs_c);
        assert_eq!(u8c == other_byte, other_byte <= 127 && c == other_byte as char);

        // UTF-16 against UTF-16, `char`, `u8` and `u16`.
        let u16other = other.to_utf16();
        assert_eq!(u16c == u16other, chars_equal);
        assert_eq!(hash_of(u16c) == hash_of(u16other), hashes_equal);
        assert_eq!(u16c.cmp(&u16other), total_order);
        assert_eq!(u16c.eq_ignore_ascii_case(&u16other), equal_ignoring_case);
        assert_eq!(u16c.partial_cmp(&other), c_vs_other);
        assert_eq!(c.partial_cmp(&u16other), c_vs_other);
        assert_eq!(u16other.partial_cmp(&c), other_vs_c);
        assert_eq!(other.partial_cmp(&u16c), other_vs_c);
        assert_eq!(u16c == other_byte, c == other_byte as char);
        assert_eq!(u16c == other_unit, c as u32 == other_unit as u32);

        // UTF-8 against UTF-16.
        assert_eq!(u8c == u16other, chars_equal);
        assert_eq!(u16c == u8other, chars_equal);
        assert_eq!(u8c.partial_cmp(&u16other), c_vs_other);
        assert_eq!(u16c.partial_cmp(&u8other), c_vs_other);
        assert_eq!(u8other.partial_cmp(&u16c), other_vs_c);
        assert_eq!(u16other.partial_cmp(&u8c), other_vs_c);
    }

    (u8c, u16c)
}
/// Steps the byte and unit iterators of `c` in lockstep with iterators over the slices
/// written by the standard library's encoders, comparing `size_hint()`, the `Debug`
/// output and the yielded item at every step.
fn iterators(c: char) {
    // UTF-8: six steps is more than the four-byte buffer can hold, so the exhausted
    // state is compared as well.
    {
        let mut storage = [0; 4];
        let mut expected = c.encode_utf8(&mut storage[..]).as_bytes().iter();
        let mut actual = c.iter_utf8_bytes();
        for _ in 0..6 {
            assert_eq!(actual.size_hint(), expected.size_hint());
            assert_eq!(format!("{:?}", actual), format!("{:?}", expected.as_slice()));
            assert_eq!(actual.next(), expected.next().cloned());
        }
    }

    // UTF-16: four steps is more than the two-unit buffer can hold.
    {
        let mut storage = [0; 2];
        let mut expected = c.encode_utf16(&mut storage[..]).iter();
        let mut actual = c.iter_utf16_units();
        for _ in 0..4 {
            assert_eq!(actual.size_hint(), expected.size_hint());
            assert_eq!(format!("{:?}", actual), format!("{:?}", expected.as_slice()));
            assert_eq!(actual.next(), expected.next().cloned());
        }
    }
}
fn test(c: char) {
assert_eq!(char::from_u32(c as u32), Some(c));
assert_eq!(char::from_u32_detailed(c as u32), Ok(c));
assert_eq!(unsafe{ char::from_u32_unchecked(c as u32) }, c);
let (u8c, u16c) = eq_cmp_hash(c);
iterators(c);
assert_eq!(Utf16Char::from(u8c), u16c);
assert_eq!(Utf8Char::from(u16c), u8c);
let utf8_len = c.len_utf8();
let utf16_len = c.len_utf16();
let mut as_str = c.to_string();
// UTF-8
let mut buf = [0; 4];
let reference = c.encode_utf8(&mut buf[..]).as_bytes();
let len = reference.len(); // short name because it is used in many places.
assert_eq!(len, utf8_len);
assert_eq!(reference[0].extra_utf8_bytes(), Ok(len-1));
assert_eq!(reference[0].extra_utf8_bytes_unchecked(), len-1);
assert_eq!(AsRef::<[u8]>::as_ref(&u8c), reference);
let (arr,arrlen) = u8c.to_array();
assert_eq!(arrlen, len);
assert_eq!(Utf8Char::from_array(arr), Ok(u8c));
assert_eq!(c.to_utf8_array(), (arr, len));
let str_ = str::from_utf8(reference).unwrap();
let ustr = Utf8Char::from_str(str_).unwrap();
assert_eq!(ustr.to_array().0, arr);// bitwise equality
assert_eq!(char::from_utf8_array(arr), Ok(c));
let mut longer = [0xff; 5]; // 0xff is never valid
longer[..len].copy_from_slice(reference);
assert_eq!(char::from_utf8_slice_start(reference), Ok((c,len)));
assert_eq!(char::from_utf8_slice_start(&longer), Ok((c,len)));
assert_eq!(Utf8Char::from_slice_start(reference), Ok((u8c,len)));
assert_eq!(Utf8Char::from_slice_start(&longer), Ok((u8c,len)));
for other in &mut longer[len..] {*other = b'?'}
assert_eq!(Utf8Char::from_str(str_), Ok(u8c));
assert_eq!(Utf8Char::from_str_start(str_), Ok((u8c,len)));
assert_eq!(Utf8Char::from_str_start(str::from_utf8(&longer).unwrap()), Ok((u8c,len)));
unsafe {
// Hopefully make bugs easier to catch by making reads into unallocated memory by filling
// a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes.
// I have no idea whether this works.
let mut boxed = Box::new([0xffu8; 16]);
let start = boxed.len()-len; // reach the end
boxed[start..].copy_from_slice(reference);
let slice = &boxed[start..start]; // length of slice should be ignored.
assert_eq!(Utf8Char::from_slice_start_unchecked(slice), (u8c,len));
}
assert_eq!(&Vec::<u8>::from_iter(Some(u8c))[..], reference);
assert_eq!(&String::from_iter(Some(u8c))[..], str_);
assert_eq!(format!("{:?}", u8c), format!("{:?}", c));
assert_eq!(format!("{}", u8c), format!("{}", c));
assert_eq!(u8c.is_ascii(), c.is_ascii());
assert_eq!(u8c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase());
assert_eq!(u8c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase());
// UTF-16
let mut buf = [0; 2];
let reference = c.encode_utf16(&mut buf[..]);
let len = reference.len();
assert_eq!(len, utf16_len);
assert_eq!(reference[0].utf16_needs_extra_unit(), Ok(len==2));
assert_eq!(reference[0].is_utf16_leading_surrogate(), len==2);
assert_eq!(u16c.as_ref(), reference);
let mut longer = [0; 3];
longer[..len].copy_from_slice(reference);
assert_eq!(char::from_utf16_slice_start(reference), Ok((c,len)));
assert_eq!(char::from_utf16_slice_start(&longer), Ok((c,len)));
assert_eq!(Utf16Char::from_slice_start(reference), Ok((u16c,len)));
assert_eq!(Utf16Char::from_slice_start(&longer), Ok((u16c,len)));
assert_eq!(Utf16Char::from_str(&as_str), Ok(u16c));
as_str.push(c);
assert_eq!(Utf16Char::from_str_start(&as_str), Ok((u16c,utf8_len)));
unsafe {
// Hopefully make bugs easier to catch by making reads into unallocated memory by filling
// a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes.
// I have no idea whether this works.
let mut boxed = Box::new([0u16; 8]);
let start = boxed.len()-len; // reach the end
boxed[start..].copy_from_slice(reference);
let slice = &boxed[start..start]; // length of slice should be ignored.
assert_eq!(Utf16Char::from_slice_start_unchecked(slice), (u16c,len));
}
let array = c.to_utf16_array();
let tuple = c.to_utf16_tuple();
assert_eq!(&array[..reference.len()], reference);
assert_eq!(tuple, (reference[0],reference.get(1).cloned()));
assert_eq!(char::from_utf16_array(array), Ok(c));
assert_eq!(char::from_utf16_tuple(tuple), Ok(c));
assert_eq!(c.to_utf16().to_char(), c);
assert_eq!(&Vec::<u16>::from_iter(Some(u16c))[..], reference);
assert_eq!(format!("{:?}", u16c), format!("{:?}", c));
assert_eq!(format!("{}", u16c), format!("{}", c));
assert_eq!(u16c.is_ascii(), c.is_ascii());
assert_eq!(u16c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase());
assert_eq!(u16c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase());
}
/// Runs the full set of checks on every codepoint in `EDGES_AND_BETWEEN`.
#[test]
fn edges_middle() {
    for c in EDGES_AND_BETWEEN.iter().cloned() {
        test(c);
    }
}
/// Runs the full set of checks on every codepoint outside the range
/// `0xd800..0xe000`, which the loop skips. Ignored by default.
#[test]
#[ignore]
fn all() {
    let below_gap = 0..0xd800;
    let above_gap = 0xe000..0x110000;
    for cp in below_gap.chain(above_gap) {
        test(char::from_u32(cp).expect("not a valid char"));
    }
}