use crate::arch::all::{
    packedpair::{HeuristicFrequencyRank, Pair},
    rabinkarp, twoway,
};

#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::neon::packedpair as neon;
#[cfg(target_arch = "wasm32")]
use crate::arch::wasm32::simd128::packedpair as simd128;
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
use crate::arch::x86_64::{
    avx2::packedpair as avx2, sse2::packedpair as sse2,
};

/// A "meta" substring searcher.
|
|
///
|
|
/// To a first approximation, this chooses what it believes to be the "best"
|
|
/// substring search implemnetation based on the needle at construction time.
|
|
/// Then, every call to `find` will execute that particular implementation. To
|
|
/// a second approximation, multiple substring search algorithms may be used,
|
|
/// depending on the haystack. For example, for supremely short haystacks,
|
|
/// Rabin-Karp is typically used.
|
|
///
|
|
/// See the documentation on `Prefilter` for an explanation of the dispatching
|
|
/// mechanism. The quick summary is that an enum has too much overhead and
|
|
/// we can't use dynamic dispatch via traits because we need to work in a
|
|
/// core-only environment. (Dynamic dispatch works in core-only, but you
|
|
/// need `&dyn Trait` and we really need a `Box<dyn Trait>` here. The latter
|
|
/// requires `alloc`.) So instead, we use a union and an appropriately paired
|
|
/// free function to read from the correct field on the union and execute the
|
|
/// chosen substring search implementation.
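///
/// The pattern in miniature, as a sketch for exposition only (`AlgoA`,
/// `AlgoB` and `call_a` are hypothetical; the real types, fields and
/// signatures are the ones defined below):
///
/// ```ignore
/// union Kind { a: AlgoA, b: AlgoB }
/// type KindFn =
///     unsafe fn(&Searcher, &mut PrefilterState, &[u8], &[u8]) -> Option<usize>;
///
/// // Paired at construction time: when `kind.a` is set, `call` must point
/// // at the function that reads `kind.a`, and likewise for `kind.b`.
/// unsafe fn call_a(
///     s: &Searcher,
///     _prestate: &mut PrefilterState,
///     haystack: &[u8],
///     needle: &[u8],
/// ) -> Option<usize> {
///     s.kind.a.find(haystack, needle)
/// }
/// ```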
#[derive(Clone)]
pub(crate) struct Searcher {
    /// The function to call to execute the chosen substring search
    /// implementation.
    call: SearcherKindFn,
    /// The union of implementations, paired with `call` at construction.
    kind: SearcherKind,
    /// A Rabin-Karp fallback used for very short haystacks.
    rabinkarp: rabinkarp::Finder,
}

impl Searcher {
    /// Creates a new "meta" substring searcher that attempts to choose the
    /// best algorithm based on the needle, heuristics and what the current
    /// target supports.
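    ///
    /// A usage sketch (`ranker` here stands for any value implementing
    /// `HeuristicFrequencyRank`; it is a placeholder, not a real binding):
    ///
    /// ```ignore
    /// let searcher = Searcher::new(PrefilterConfig::Auto, ranker, b"needle");
    /// let mut prestate = PrefilterState::new();
    /// let at = searcher.find(&mut prestate, b"a haystack with a needle", b"needle");
    /// assert_eq!(Some(18), at);
    /// ```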
    #[inline]
    pub(crate) fn new<R: HeuristicFrequencyRank>(
        prefilter: PrefilterConfig,
        ranker: R,
        needle: &[u8],
    ) -> Searcher {
        let rabinkarp = rabinkarp::Finder::new(needle);
        if needle.len() <= 1 {
            return if needle.is_empty() {
                trace!("building empty substring searcher");
                Searcher {
                    call: searcher_kind_empty,
                    kind: SearcherKind { empty: () },
                    rabinkarp,
                }
            } else {
                trace!("building one-byte substring searcher");
                debug_assert_eq!(1, needle.len());
                Searcher {
                    call: searcher_kind_one_byte,
                    kind: SearcherKind { one_byte: needle[0] },
                    rabinkarp,
                }
            };
        }
        let pair = match Pair::with_ranker(needle, &ranker) {
            Some(pair) => pair,
            None => return Searcher::twoway(needle, rabinkarp, None),
        };
        debug_assert_ne!(
            pair.index1(),
            pair.index2(),
            "pair offsets should not be equivalent"
        );
        #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
        {
            if let Some(pp) = avx2::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building x86_64 AVX2 substring searcher");
                    let kind = SearcherKind { avx2: pp };
                    Searcher { call: searcher_kind_avx2, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::avx2(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if let Some(pp) = sse2::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building x86_64 SSE2 substring searcher");
                    let kind = SearcherKind { sse2: pp };
                    Searcher { call: searcher_kind_sse2, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::sse2(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                // We're pretty unlikely to get to this point, but it is
                // possible to be running on x86_64 without SSE2. Namely, it's
                // really up to the OS whether it wants to support vector
                // registers or not.
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
        #[cfg(target_arch = "wasm32")]
        {
            if let Some(pp) = simd128::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building wasm32 simd128 substring searcher");
                    let kind = SearcherKind { simd128: pp };
                    Searcher { call: searcher_kind_simd128, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::simd128(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            if let Some(pp) = neon::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building aarch64 neon substring searcher");
                    let kind = SearcherKind { neon: pp };
                    Searcher { call: searcher_kind_neon, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::neon(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
        #[cfg(not(any(
            all(target_arch = "x86_64", target_feature = "sse2"),
            target_arch = "wasm32",
            target_arch = "aarch64"
        )))]
        {
            if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
    }

    /// Creates a new searcher that always uses the Two-Way algorithm. This is
    /// typically used when vector algorithms are unavailable or inappropriate.
    /// (For example, when the needle is "too long.")
    ///
    /// If a prefilter is given, then the searcher returned will be accelerated
    /// by the prefilter.
    #[inline]
    fn twoway(
        needle: &[u8],
        rabinkarp: rabinkarp::Finder,
        prestrat: Option<Prefilter>,
    ) -> Searcher {
        let finder = twoway::Finder::new(needle);
        match prestrat {
            None => {
                trace!("building scalar two-way substring searcher");
                let kind = SearcherKind { two_way: finder };
                Searcher { call: searcher_kind_two_way, kind, rabinkarp }
            }
            Some(prestrat) => {
                trace!(
                    "building scalar two-way \
                     substring searcher with a prefilter"
                );
                let two_way_with_prefilter =
                    TwoWayWithPrefilter { finder, prestrat };
                let kind = SearcherKind { two_way_with_prefilter };
                Searcher {
                    call: searcher_kind_two_way_with_prefilter,
                    kind,
                    rabinkarp,
                }
            }
        }
    }

    /// Searches the given haystack for the given needle. The needle given
    /// should be the same as the needle that this finder was initialized
    /// with.
    ///
    /// Inlining this can lead to big wins for latency, and `#[inline]`
    /// doesn't seem to be enough in some cases.
    #[inline(always)]
    pub(crate) fn find(
        &self,
        prestate: &mut PrefilterState,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        if haystack.len() < needle.len() {
            None
        } else {
            // SAFETY: By construction, we've ensured that the function
            // in `self.call` is properly paired with the union used in
            // `self.kind`.
            unsafe { (self.call)(self, prestate, haystack, needle) }
        }
    }
}

impl core::fmt::Debug for Searcher {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_struct("Searcher")
            .field("call", &"<searcher function>")
            .field("kind", &"<searcher kind union>")
            .field("rabinkarp", &self.rabinkarp)
            .finish()
    }
}

/// A union indicating one of several possible substring search implementations
/// that are in active use.
///
/// This union should only be read by one of the functions prefixed with
/// `searcher_kind_`. Namely, the correct function is meant to be paired with
/// the union by the caller, such that the function always reads from the
/// designated union field.
#[derive(Clone, Copy)]
union SearcherKind {
    empty: (),
    one_byte: u8,
    two_way: twoway::Finder,
    two_way_with_prefilter: TwoWayWithPrefilter,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    sse2: crate::arch::x86_64::sse2::packedpair::Finder,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    avx2: crate::arch::x86_64::avx2::packedpair::Finder,
    #[cfg(target_arch = "wasm32")]
    simd128: crate::arch::wasm32::simd128::packedpair::Finder,
    #[cfg(target_arch = "aarch64")]
    neon: crate::arch::aarch64::neon::packedpair::Finder,
}

/// A two-way substring searcher with a prefilter.
#[derive(Copy, Clone, Debug)]
struct TwoWayWithPrefilter {
    finder: twoway::Finder,
    prestrat: Prefilter,
}

/// The type of a substring search function.
///
/// # Safety
///
/// When using a function of this type, callers must ensure that the correct
/// function is paired with the value populated in the `SearcherKind` union.
type SearcherKindFn = unsafe fn(
    searcher: &Searcher,
    prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize>;

/// Reads from the `empty` field of `SearcherKind` to handle the case of
/// searching for the empty needle. Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.empty` union field is set.
unsafe fn searcher_kind_empty(
    _searcher: &Searcher,
    _prestate: &mut PrefilterState,
    _haystack: &[u8],
    _needle: &[u8],
) -> Option<usize> {
    Some(0)
}

/// Reads from the `one_byte` field of `SearcherKind` to handle the case of
/// searching for a single byte needle. Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.one_byte` union field is set.
unsafe fn searcher_kind_one_byte(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    _needle: &[u8],
) -> Option<usize> {
    let needle = searcher.kind.one_byte;
    crate::memchr(needle, haystack)
}

/// Reads from the `two_way` field of `SearcherKind` to handle the case of
/// searching for an arbitrary needle without prefilter acceleration. Works on
/// all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.two_way` union field is set.
unsafe fn searcher_kind_two_way(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    // For very short haystacks, Rabin-Karp is expected to be faster than
    // Two-Way.
    if rabinkarp::is_fast(haystack, needle) {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        searcher.kind.two_way.find(haystack, needle)
    }
}

/// Reads from the `two_way_with_prefilter` field of `SearcherKind` to handle
/// the case of searching for an arbitrary needle with prefilter acceleration.
/// Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.two_way_with_prefilter` union
/// field is set.
unsafe fn searcher_kind_two_way_with_prefilter(
    searcher: &Searcher,
    prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    if rabinkarp::is_fast(haystack, needle) {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        let TwoWayWithPrefilter { ref finder, ref prestrat } =
            searcher.kind.two_way_with_prefilter;
        let pre = Pre { prestate, prestrat };
        finder.find_with_prefilter(Some(pre), haystack, needle)
    }
}

/// Reads from the `sse2` field of `SearcherKind` to execute the x86_64 SSE2
/// vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.sse2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn searcher_kind_sse2(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.sse2;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// Reads from the `avx2` field of `SearcherKind` to execute the x86_64 AVX2
/// vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.avx2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn searcher_kind_avx2(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.avx2;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// Reads from the `simd128` field of `SearcherKind` to execute the wasm32
/// simd128 vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.simd128` union field is set.
#[cfg(target_arch = "wasm32")]
unsafe fn searcher_kind_simd128(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.simd128;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// Reads from the `neon` field of `SearcherKind` to execute the aarch64 neon
/// vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.neon` union field is set.
#[cfg(target_arch = "aarch64")]
unsafe fn searcher_kind_neon(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.neon;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// A reverse substring searcher.
#[derive(Clone, Debug)]
pub(crate) struct SearcherRev {
    kind: SearcherRevKind,
    rabinkarp: rabinkarp::FinderRev,
}

/// The kind of the reverse searcher.
///
/// For the reverse case, we don't do any SIMD acceleration or prefilters.
/// There is no specific technical reason why we don't; rather, it's just not
/// clear that it's worth the extra code. If you have a use case for it,
/// please file an issue.
///
/// We also don't do the union trick as we do with the forward case and
/// prefilters. Basically for the same reason we don't have prefilters or
/// vector algorithms for reverse searching: it's not clear it's worth doing.
/// Please file an issue if you have a compelling use case for fast reverse
/// substring search.
#[derive(Clone, Debug)]
enum SearcherRevKind {
    Empty,
    OneByte { needle: u8 },
    TwoWay { finder: twoway::FinderRev },
}

impl SearcherRev {
    /// Creates a new searcher for finding occurrences of the given needle in
    /// reverse. That is, it reports the last (instead of the first) occurrence
    /// of a needle in a haystack.
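    ///
    /// A usage sketch:
    ///
    /// ```ignore
    /// let searcher = SearcherRev::new(b"na");
    /// // "banana" contains "na" at offsets 2 and 4; the last one is reported.
    /// assert_eq!(Some(4), searcher.rfind(b"banana", b"na"));
    /// ```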
    #[inline]
    pub(crate) fn new(needle: &[u8]) -> SearcherRev {
        let kind = if needle.len() <= 1 {
            if needle.is_empty() {
                trace!("building empty reverse substring searcher");
                SearcherRevKind::Empty
            } else {
                trace!("building one-byte reverse substring searcher");
                debug_assert_eq!(1, needle.len());
                SearcherRevKind::OneByte { needle: needle[0] }
            }
        } else {
            trace!("building scalar two-way reverse substring searcher");
            let finder = twoway::FinderRev::new(needle);
            SearcherRevKind::TwoWay { finder }
        };
        let rabinkarp = rabinkarp::FinderRev::new(needle);
        SearcherRev { kind, rabinkarp }
    }

    /// Searches the given haystack for the last occurrence of the given
    /// needle. The needle given should be the same as the needle that this
    /// finder was initialized with.
    #[inline]
    pub(crate) fn rfind(
        &self,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        if haystack.len() < needle.len() {
            return None;
        }
        match self.kind {
            SearcherRevKind::Empty => Some(haystack.len()),
            SearcherRevKind::OneByte { needle } => {
                crate::memrchr(needle, haystack)
            }
            SearcherRevKind::TwoWay { ref finder } => {
                if rabinkarp::is_fast(haystack, needle) {
                    self.rabinkarp.rfind(haystack, needle)
                } else {
                    finder.rfind(haystack, needle)
                }
            }
        }
    }
}

/// Prefilter controls whether heuristics are used to accelerate searching.
///
/// A prefilter refers to the idea of detecting candidate matches very quickly,
/// and then confirming whether those candidates are full matches. This
/// idea can be quite effective since it's often the case that looking for
/// candidates can be a lot faster than running a complete substring search
/// over the entire input. Namely, looking for candidates can be done with
/// extremely fast vectorized code.
///
/// The downside of a prefilter is that it assumes false positives (which are
/// candidates generated by a prefilter that aren't matches) are somewhat rare
/// relative to the frequency of full matches. That is, if a lot of false
/// positives are generated, then it's possible for search time to be worse
/// than if the prefilter wasn't enabled in the first place.
///
/// Another downside of a prefilter is that it can result in highly variable
/// performance, where some cases are extraordinarily fast and others aren't.
/// Typically, variable performance isn't a problem, but it may be for your use
/// case.
///
/// The prefilters in this implementation use a heuristic to detect when a
/// prefilter might not be carrying its weight, and will dynamically disable
/// themselves if so. Nevertheless, this configuration option gives callers
/// the ability to disable prefilters entirely if they know prefilters won't
/// be useful.
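///
/// # Example
///
/// A sketch of selecting a configuration (`Auto` is the default):
///
/// ```ignore
/// let config = PrefilterConfig::default();
/// assert!(matches!(config, PrefilterConfig::Auto));
/// ```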
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
pub enum PrefilterConfig {
    /// Never use a prefilter in substring search.
    None,
    /// Automatically detect whether a heuristic prefilter should be used. If
    /// it is used, then heuristics will be used to dynamically disable the
    /// prefilter if it is believed to not be carrying its weight.
    Auto,
}

impl Default for PrefilterConfig {
    fn default() -> PrefilterConfig {
        PrefilterConfig::Auto
    }
}

impl PrefilterConfig {
    /// Returns true when this prefilter is set to the `None` variant.
    fn is_none(&self) -> bool {
        matches!(*self, PrefilterConfig::None)
    }
}

/// The implementation of a prefilter.
///
/// This type encapsulates dispatch to one of several possible choices for a
/// prefilter. Generally speaking, all prefilters have the same approximate
/// algorithm: they choose a couple of bytes from the needle that are believed
/// to be rare, use a fast vector algorithm to look for those bytes and return
/// positions as candidates for some substring search algorithm (currently only
/// Two-Way) to confirm as a match or not.
///
/// The differences between the algorithms are actually at the vector
/// implementation level. Namely, we need different routines based on both
/// which target architecture we're on and what CPU features are supported.
///
/// The most obvious approach here is to use an enum, and make
/// `Prefilter::find` do case analysis to determine which algorithm was
/// selected and invoke it. However, I've observed that this leads to poor
/// codegen in some cases, especially in latency sensitive benchmarks. That is,
/// this approach comes with overhead that I wasn't able to eliminate.
///
/// The second obvious approach is to use dynamic dispatch with traits. Doing
/// that in this context where `Prefilter` owns the selection generally
/// requires heap allocation, and this code is designed to run in core-only
/// environments.
///
/// So we settle on using a union (that's `PrefilterKind`) and a function
/// pointer (that's `PrefilterKindFn`). We select the right function pointer
/// based on which field in the union we set, and that function in turn
/// knows which field of the union to access. The downside of this approach
/// is that it forces us to think about safety, but the upside is that
/// there are some nice latency improvements to benchmarks. (Especially the
/// `memmem/sliceslice/short` benchmark.)
///
/// In cases where we've selected a vector algorithm and the haystack given
/// is too short, we fall back to the scalar version of `memchr` on the
/// `rarest_byte`. (The scalar version of `memchr` is still better than a naive
/// byte-at-a-time loop because it will read in `usize`-sized chunks at a
/// time.)
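///
/// That short-haystack fallback is the same guard used by each of the vector
/// `prefilter_kind_*` functions below:
///
/// ```ignore
/// if haystack.len() < finder.min_haystack_len() {
///     strat.find_simple(haystack) // scalar memchr on the rarest byte
/// } else {
///     finder.find_prefilter(haystack) // vectorized candidate search
/// }
/// ```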
#[derive(Clone, Copy)]
struct Prefilter {
    /// The function to call to execute the chosen prefilter.
    call: PrefilterKindFn,
    /// The union of prefilter implementations, paired with `call`.
    kind: PrefilterKind,
    /// The rarest byte in the needle, according to the frequency ranker.
    rarest_byte: u8,
    /// The offset of the rarest byte within the needle.
    rarest_offset: u8,
}

impl Prefilter {
    /// Return a "fallback" prefilter, but only if it is believed to be
    /// effective.
    #[inline]
    fn fallback<R: HeuristicFrequencyRank>(
        ranker: R,
        pair: Pair,
        needle: &[u8],
    ) -> Option<Prefilter> {
        /// The maximum frequency rank permitted for the fallback prefilter.
        /// If the rarest byte in the needle has a frequency rank above this
        /// value, then no prefilter is used if the fallback prefilter would
        /// otherwise be selected.
        const MAX_FALLBACK_RANK: u8 = 250;

trace!("building fallback prefilter");
|
|
let rarest_offset = pair.index1();
|
|
let rarest_byte = needle[usize::from(rarest_offset)];
|
|
let rarest_rank = ranker.rank(rarest_byte);
|
|
if rarest_rank > MAX_FALLBACK_RANK {
|
|
None
|
|
} else {
|
|
let finder = crate::arch::all::packedpair::Finder::with_pair(
|
|
needle,
|
|
pair.clone(),
|
|
)?;
|
|
let call = prefilter_kind_fallback;
|
|
let kind = PrefilterKind { fallback: finder };
|
|
Some(Prefilter { call, kind, rarest_byte, rarest_offset })
|
|
}
|
|
}
|
|
|
|
    /// Return a prefilter using an x86_64 SSE2 vector algorithm.
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    #[inline]
    fn sse2(finder: sse2::Finder, needle: &[u8]) -> Prefilter {
        trace!("building x86_64 SSE2 prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_sse2,
            kind: PrefilterKind { sse2: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a prefilter using an x86_64 AVX2 vector algorithm.
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    #[inline]
    fn avx2(finder: avx2::Finder, needle: &[u8]) -> Prefilter {
        trace!("building x86_64 AVX2 prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_avx2,
            kind: PrefilterKind { avx2: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a prefilter using a wasm32 simd128 vector algorithm.
    #[cfg(target_arch = "wasm32")]
    #[inline]
    fn simd128(finder: simd128::Finder, needle: &[u8]) -> Prefilter {
        trace!("building wasm32 simd128 prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_simd128,
            kind: PrefilterKind { simd128: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a prefilter using an aarch64 neon vector algorithm.
    #[cfg(target_arch = "aarch64")]
    #[inline]
    fn neon(finder: neon::Finder, needle: &[u8]) -> Prefilter {
        trace!("building aarch64 neon prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_neon,
            kind: PrefilterKind { neon: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a *candidate* position for a match.
    ///
    /// When this returns an offset, it implies that a match could begin at
    /// that offset, but it may not. That is, it is possible for a false
    /// positive to be returned.
    ///
    /// When `None` is returned, then it is guaranteed that there are no
    /// matches for the needle in the given haystack. That is, it is impossible
    /// for a false negative to be returned.
    ///
    /// The purpose of this routine is to look for candidate matching positions
    /// as quickly as possible before running a (likely) slower confirmation
    /// step.
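    ///
    /// A sketch of how a caller might drive this routine (hypothetical; the
    /// real confirmation loop lives in the Two-Way implementation):
    ///
    /// ```ignore
    /// let mut at = 0;
    /// while let Some(candidate) = prefilter.find(&haystack[at..]) {
    ///     let start = at + candidate;
    ///     if haystack[start..].starts_with(needle) {
    ///         return Some(start);
    ///     }
    ///     at = start + 1;
    /// }
    /// return None;
    /// ```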
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        // SAFETY: By construction, we've ensured that the function in
        // `self.call` is properly paired with the union used in `self.kind`.
        unsafe { (self.call)(self, haystack) }
    }

/// A "simple" prefilter that just looks for the occurrence of the rarest
|
|
/// byte from the needle. This is generally only used for very small
|
|
/// haystacks.
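    ///
    /// Note the offset adjustment at the end of the routine below: if the
    /// rarest byte sits at offset `rarest_offset` within the needle, then
    /// finding it at haystack position `i` means a match could only begin at
    /// `i - rarest_offset`. For example, if the rarest byte of the needle
    /// `b"syzygy"` is `b'z'` (at offset 2), then finding `b'z'` at haystack
    /// position 7 yields the candidate position `7 - 2 = 5`. The
    /// `saturating_sub` clamps the result to 0 when the byte is found too
    /// close to the start of the haystack for a full match to fit.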
    #[inline]
    fn find_simple(&self, haystack: &[u8]) -> Option<usize> {
        // We don't use crate::memchr here because the haystack should be small
        // enough that memchr won't be able to use vector routines anyway. So
        // we just skip straight to the fallback implementation which is likely
        // faster. (A byte-at-a-time loop is only used when the haystack is
        // smaller than `size_of::<usize>()`.)
        crate::arch::all::memchr::One::new(self.rarest_byte)
            .find(haystack)
            .map(|i| i.saturating_sub(usize::from(self.rarest_offset)))
    }
}

impl core::fmt::Debug for Prefilter {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_struct("Prefilter")
            .field("call", &"<prefilter function>")
            .field("kind", &"<prefilter kind union>")
            .field("rarest_byte", &self.rarest_byte)
            .field("rarest_offset", &self.rarest_offset)
            .finish()
    }
}

/// A union indicating one of several possible prefilters that are in active
/// use.
///
/// This union should only be read by one of the functions prefixed with
/// `prefilter_kind_`. Namely, the correct function is meant to be paired with
/// the union by the caller, such that the function always reads from the
/// designated union field.
#[derive(Clone, Copy)]
union PrefilterKind {
    fallback: crate::arch::all::packedpair::Finder,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    sse2: crate::arch::x86_64::sse2::packedpair::Finder,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    avx2: crate::arch::x86_64::avx2::packedpair::Finder,
    #[cfg(target_arch = "wasm32")]
    simd128: crate::arch::wasm32::simd128::packedpair::Finder,
    #[cfg(target_arch = "aarch64")]
    neon: crate::arch::aarch64::neon::packedpair::Finder,
}

/// The type of a prefilter function.
///
/// # Safety
///
/// When using a function of this type, callers must ensure that the correct
/// function is paired with the value populated in the `PrefilterKind` union.
type PrefilterKindFn =
    unsafe fn(strat: &Prefilter, haystack: &[u8]) -> Option<usize>;

/// Reads from the `fallback` field of `PrefilterKind` to execute the fallback
/// prefilter. Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.fallback` union field is set.
unsafe fn prefilter_kind_fallback(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    strat.kind.fallback.find_prefilter(haystack)
}

/// Reads from the `sse2` field of `PrefilterKind` to execute the x86_64 SSE2
/// prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.sse2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn prefilter_kind_sse2(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.sse2;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// Reads from the `avx2` field of `PrefilterKind` to execute the x86_64 AVX2
/// prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.avx2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn prefilter_kind_avx2(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.avx2;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// Reads from the `simd128` field of `PrefilterKind` to execute the wasm32
/// simd128 prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.simd128` union field is set.
#[cfg(target_arch = "wasm32")]
unsafe fn prefilter_kind_simd128(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.simd128;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// Reads from the `neon` field of `PrefilterKind` to execute the aarch64 neon
/// prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.neon` union field is set.
#[cfg(target_arch = "aarch64")]
unsafe fn prefilter_kind_neon(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.neon;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator is treated as a single search.) A prefilter state should only be
/// created from a `Freqy`. e.g., an inert `Freqy` will produce an inert
/// `PrefilterState`.
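///
/// A sketch of the intended lifecycle, one state per search (`prefilter`
/// stands for a previously constructed `Prefilter`):
///
/// ```ignore
/// let mut prestate = PrefilterState::new();
/// let mut pre = Pre { prestate: &mut prestate, prestrat: &prefilter };
/// if pre.is_effective() {
///     let candidate = pre.find(haystack);
/// }
/// ```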
#[derive(Clone, Copy, Debug)]
pub(crate) struct PrefilterState {
    /// The number of skips that have been executed. This is always 1 greater
    /// than the actual number of skips. The special sentinel value of 0
    /// indicates that the prefilter is inert. This is useful to avoid
    /// additional checks to determine whether the prefilter is still
    /// "effective." Once a prefilter becomes inert, it should no longer be
    /// used (according to our heuristics).
    skips: u32,
    /// The total number of bytes that have been skipped.
    skipped: u32,
}

impl PrefilterState {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: u32 = 50;

    /// The minimum amount of bytes that skipping must average.
    ///
    /// This value was chosen based on varying it and checking
    /// the microbenchmarks. In particular, this can impact the
    /// pathological/repeated-{huge,small} benchmarks quite a bit if it's set
    /// too low.
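    ///
    /// Concretely, once at least `MIN_SKIPS` skips have occurred, the
    /// prefilter stays enabled only while `skipped >= MIN_SKIP_BYTES *
    /// skips` holds. For example, 50 skips require at least `50 * 8 = 400`
    /// skipped bytes in total.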
    const MIN_SKIP_BYTES: u32 = 8;

    /// Create a fresh prefilter state.
    #[inline]
    pub(crate) fn new() -> PrefilterState {
        PrefilterState { skips: 1, skipped: 0 }
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    #[inline]
    fn update(&mut self, skipped: usize) {
        self.skips = self.skips.saturating_add(1);
        // We need to do this dance since it's technically possible for
        // `skipped` to overflow a `u32`. (And we use a `u32` to reduce the
        // size of a prefilter state.)
        self.skipped = match u32::try_from(skipped) {
            Err(_) => core::u32::MAX,
            Ok(skipped) => self.skipped.saturating_add(skipped),
        };
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective.
    #[inline]
    fn is_effective(&mut self) -> bool {
        if self.is_inert() {
            return false;
        }
        if self.skips() < PrefilterState::MIN_SKIPS {
            return true;
        }
        if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips() {
            return true;
        }

        // We're inert.
        self.skips = 0;
        false
    }

    /// Returns true if the prefilter this state represents should no longer
    /// be used.
    #[inline]
    fn is_inert(&self) -> bool {
        self.skips == 0
    }

    /// Returns the total number of times the prefilter has been used.
    #[inline]
    fn skips(&self) -> u32 {
        // Remember, `0` is a sentinel value indicating inertness, so we
        // always need to subtract `1` to get our actual number of skips.
        self.skips.saturating_sub(1)
    }
}

/// A combination of prefilter effectiveness state and the prefilter itself.
#[derive(Debug)]
pub(crate) struct Pre<'a> {
    /// State that tracks the effectiveness of a prefilter.
    prestate: &'a mut PrefilterState,
    /// The actual prefilter.
    prestrat: &'a Prefilter,
}

impl<'a> Pre<'a> {
    /// Run this prefilter on the given haystack and record how many bytes
    /// were skipped in the prefilter state.
    #[inline]
    pub(crate) fn find(&mut self, haystack: &[u8]) -> Option<usize> {
        let result = self.prestrat.find(haystack);
        // If no candidate was found, then the entire haystack was skipped.
        self.prestate.update(result.unwrap_or(haystack.len()));
        result
    }

    /// Return true if and only if this prefilter should be used.
    #[inline]
    pub(crate) fn is_effective(&mut self) -> bool {
        self.prestate.is_effective()
    }
}

/// Returns true if the needle has the right characteristics for a vector
/// algorithm to handle the entirety of substring search.
///
/// Vector algorithms can be used for prefilters for other substring search
/// algorithms (like Two-Way), but they can also be used for substring search
/// on their own. When used for substring search, vector algorithms will
/// quickly identify candidate match positions (just like in the prefilter
/// case), but instead of returning the candidate position they will try to
/// confirm the match themselves. Confirmation happens via `memcmp`. This
/// works well for short needles, but can break down when many false candidate
/// positions are generated for large needles. Thus, we only permit vector
/// algorithms to own substring search when the needle is of a certain length.
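///
/// Concretely, given the constants below, this returns true exactly when
/// `2 <= needle.len() && needle.len() <= 32`.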
#[inline]
fn do_packed_search(needle: &[u8]) -> bool {
    /// The minimum length of a needle required for this algorithm. The minimum
    /// is 2 since a length of 1 should just use memchr and a length of 0 isn't
    /// a case handled by this searcher.
    const MIN_LEN: usize = 2;

    /// The maximum length of a needle permitted for this algorithm.
    ///
    /// In reality, there is no hard max here. The code below can handle any
    /// length needle. (Perhaps that suggests there are missing optimizations.)
    /// Instead, this is a heuristic and a bound guaranteeing our linear time
    /// complexity.
    ///
    /// It is a heuristic because when a candidate match is found, memcmp is
    /// run. For very large needles with lots of false positives, memcmp can
    /// make the code run quite slow.
    ///
    /// It is a bound because the worst case behavior with memcmp is
    /// multiplicative in the size of the needle and haystack, and we want
    /// to keep that additive. This cap ensures we still meet that bound
    /// theoretically, since it's just a constant. We aren't acting in bad
    /// faith here, memcmp on tiny needles is so fast that even in pathological
    /// cases (see pathological vector benchmarks), this is still just as fast
    /// or faster in practice.
    ///
    /// This specific number was chosen by tweaking a bit and running
    /// benchmarks. The rare-medium-needle, for example, gets about 5% faster
    /// by using this algorithm instead of a prefilter-accelerated Two-Way.
    /// There's also a theoretical desire to keep this number reasonably
    /// low, to mitigate the impact of pathological cases. I did try 64, and
    /// some benchmarks got a little better, and others (particularly the
    /// pathological ones), got a lot worse. So... 32 it is?
    const MAX_LEN: usize = 32;
    MIN_LEN <= needle.len() && needle.len() <= MAX_LEN
}