wu_manber/
lib.rs

1// Copyright 2015 Joe Neeman.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms
8
9//! This crate gives an implementation of Wu and Manber's algorithm for finding one of several
10//! strings (which we will call "needles") in a much larger string (the "haystack").  This is not
11//! to be confused with Wu and Manber's algorithm for fuzzy matching.
12//!
13//! The Wu-Manber algorithm is very efficient when all of the strings to be matched are long. It
14//! requires a pre-processing step with a fair amount of memory overhead -- currently about 32kb in
15//! this implementation, but future improvements may reduce that when there are not too many
16//! needles.
17//!
18//! This implementation supports a maximum of 65536 needles, each of which can be at most 65536
19//! bytes long. These requirements may be relaxed in the future.
20//!
21//! # Example
22//! ```
23//! use wu_manber::{Match, TwoByteWM};
24//! let needles = vec!["quick", "brown", "lazy", "wombat"];
25//! let haystack = "The quick brown fox jumps over the lazy dog.";
26//! let searcher = TwoByteWM::new(&needles);
27//! let mat = searcher.find(haystack).next().unwrap();
28//! assert_eq!(mat, Match { start: 4, end: 9, pat_idx: 0 });
29//! ```
30
31use std::cmp::min;
32
33#[cfg(test)]
34extern crate aho_corasick;
35
36/// This is the type for indexing into the bytes of the needles.  Its size determines the maximum
37/// length of a needle.
38type NByteIdx = u16;
39
40/// This is the type for indexing into the list of needles.  Its size determines the maximum number
41/// of needles.
42type NeedleIdx = u16;
43
44/// `TwoByteWM` stores the precomputed tables needed for a two-byte-wide implementation of the
45/// Wu-Manber algorithm.
46///
47/// "Two-byte-wide" means that the search phase in the Wu-Manber algorithm uses spans of two bytes
48/// to look for potential matches.  This is suitable for moderately sized sets of needles; if there
49/// are too many needles then it might be faster to use spans of three bytes (but that isn't yet
50/// implemented by this crate).
51#[derive(Debug)]
52pub struct TwoByteWM {
53    /// The needles that we are trying to match against, and their indices.
54    ///
55    /// Each of the needles has length (in bytes) at least 2.  They are sorted in increasing order
56    /// of the hash value of their two critical bytes.
57    needles: Vec<(usize, Vec<u8>)>,
58
59    /// For each of the needles above, this contains the first two bytes, concatenated into a
60    /// `u16`.
61    ///
62    /// This `Vec` is indexed in the same way as `needles`.
63    prefix: Vec<u16>,
64
65    /// The minimimum length of any needle.
66    pat_len: NByteIdx,
67
68    /// If `shift[HashFn(a, b)] = i` then no needle contains the two-byte string `ab` starting
69    /// anywhere between positions `pat_len - 2 - i` and `pat_len - 2`.
70    ///
71    /// Note that because this `Vec` can be quite long, we might save a substantial amount of space
72    /// by shrinking the size of `NByteIdx`.
73    shift: Vec<NByteIdx>,
74
75    /// If `hash[HashFn(a, b)] = i` then the needles whose critical bytes hash to `HashFn(a, b)`
76    /// begin at `needles[i]`.
77    ///
78    /// Note that because this `Vec` can be quite long, we might save a substantial amount of space
79    /// by shrinking the size of `NeedleIdx`.
80    hash: Vec<NeedleIdx>,
81}
82
83#[derive(Debug, PartialEq)]
84pub struct Match {
85    pub start: usize,
86    pub end: usize,
87    pub pat_idx: usize,
88}
89
90pub struct Matches<'a, P: AsRef<[u8]>> {
91    wm: &'a TwoByteWM,
92    haystack: P,
93    cur_pos: usize,
94}
95impl<'a, P> Iterator for Matches<'a, P>
96where
97    P: AsRef<[u8]>,
98{
99    type Item = Match;
100    fn next(&mut self) -> Option<Match> {
101        self.wm
102            .find_from(self.haystack.as_ref(), self.cur_pos)
103            .map(|m| {
104                self.cur_pos = m.end;
105                m
106            })
107    }
108}
109
110/// For now, we default to this hash function (which is the one from the original paper of Wu and
111/// Manber). In the future, we may want to look for a better one depending on the needles.
112fn hash_fn(a: u8, b: u8) -> NeedleIdx {
113    ((a as NeedleIdx) << 5) + (b as NeedleIdx)
114}
115
116const HASH_MAX: usize = (0xFFusize << 5) + 0xFF;
117
118impl TwoByteWM {
119    fn pat(&self, p_idx: NeedleIdx) -> &[u8] {
120        &self.needles[p_idx as usize].1
121    }
122
123    fn pat_idx(&self, p_idx: NeedleIdx) -> usize {
124        self.needles[p_idx as usize].0
125    }
126
127    /// Creates lookup tables to efficiently search for the given needles.
128    ///
129    /// The order of `needles` is significant, since all `Match`es returned from this `TwoByteWM`
130    /// will include an index into `needles` saying which needle matched.
131    pub fn new<I, P>(needles: I) -> TwoByteWM
132    where
133        P: AsRef<[u8]>,
134        I: IntoIterator<Item = P>,
135    {
136        let needles: Vec<_> = needles.into_iter().map(|s| s.as_ref().to_vec()).collect();
137        if needles.is_empty() {
138            panic!("cannot create TwoByteWM from an empty set of needles");
139        } else if needles.len() > NeedleIdx::max_value() as usize {
140            panic!("too many needles");
141        }
142
143        let pat_len = needles.iter().map(|p| p.len()).min().unwrap();
144        if pat_len < 2 {
145            panic!("all needles must have length (in bytes) at least 2");
146        } else if pat_len > NByteIdx::max_value() as usize {
147            panic!("these needles are too long");
148        }
149        let pat_len = pat_len as NByteIdx;
150
151        let h = |p: &[u8]| hash_fn(p[(pat_len-2) as usize], p[(pat_len-1) as usize]);
152        let mut needles: Vec<_> = needles.into_iter().enumerate().collect();
153        needles.sort_by(|p, q| h(&p.1).cmp(&h(&q.1)));
154        let needles = needles;
155        let prefix: Vec<_> = needles.iter()
156            .map(|p| ((p.1[0] as u16) << 8) + (p.1[1] as u16))
157            .collect();
158
159        let mut hash = vec![0; HASH_MAX + 2];
160        for (p_idx, &(_, ref p)) in needles.iter().enumerate().rev() {
161            let h_idx = h(&p) as usize;
162            hash[h_idx] = p_idx as NeedleIdx;
163            if hash[h_idx + 1] == 0 {
164                hash[h_idx + 1] = p_idx as NeedleIdx + 1;
165            }
166        }
167
168        let mut shift = vec![pat_len - 1; HASH_MAX + 1];
169        for &(_, ref p) in &needles {
170            for p_pos in 0..(pat_len - 1) {
171                let h = hash_fn(p[p_pos as usize], p[(p_pos + 1) as usize]);
172                shift[h as usize] = min(shift[h as usize], pat_len - p_pos - 2);
173            }
174        }
175
176        TwoByteWM {
177            needles: needles,
178            prefix: prefix,
179            pat_len: pat_len,
180            shift: shift,
181            hash: hash,
182        }
183    }
184
185    /// Searches for a single match, starting from the given byte offset.
186    pub fn find_from<P>(&self, haystack: P, offset: usize) -> Option<Match>
187    where
188        P: AsRef<[u8]>,
189    {
190        // `pos` points to the index in `haystack` that we are trying to align against the index
191        // `pat_len - 1` of the needles.
192        let pat_len = self.pat_len as usize;
193        let mut pos = offset + pat_len - 1;
194        let haystack = haystack.as_ref();
195        while pos <= haystack.len() - 1 {
196            let h = hash_fn(haystack[pos - 1], haystack[pos]) as usize;
197            let shift = self.shift[h] as usize;
198            if shift == 0 {
199                // We might have matched the end of some needle.  Iterate over all the needles
200                // that we might have matched, and see if they match the beginning.
201                let a = haystack[1 + pos - pat_len];
202                let b = haystack[2 + pos - pat_len];
203                let prefix = ((a as u16) << 8) + (b as u16);
204                let mut found: Option<NeedleIdx> = None;
205                for p_idx in self.hash[h]..self.hash[h+1] {
206                    if self.prefix[p_idx as usize] == prefix {
207                        // The prefix matches too, so now check for the full match.
208                        let p = self.pat(p_idx);
209                        if haystack[(1 + pos - pat_len)..].starts_with(&p) {
210                            found = match found {
211                                None => Some(p_idx),
212                                Some(q_idx) => {
213                                    let q = self.pat(q_idx);
214                                    Some(if p.len() < q.len() { p_idx } else { q_idx })
215                                }
216                            }
217                        }
218                    }
219                }
220                if let Some(p_idx) = found {
221                    return Some(Match {
222                        start: 1 + pos - pat_len,
223                        end: 1 + pos - pat_len + self.pat(p_idx).len(),
224                        pat_idx: self.pat_idx(p_idx),
225                    })
226                }
227
228                pos += 1;
229            } else {
230                pos += shift;
231            }
232        }
233
234        None
235    }
236
237    /// Returns an iterator over non-overlapping matches.
238    pub fn find<'a, 'b, P>(&'a self, haystack: P) -> Matches<'a, P>
239    where
240        P: AsRef<[u8]> + 'b,
241    {
242        Matches {
243            wm: &self,
244            haystack,
245            cur_pos: 0,
246        }
247    }
248}
249
250#[cfg(test)]
251mod tests {
252    use ::{Match, TwoByteWM};
253    use aho_corasick::{AcAutomaton, Automaton};
254
255    #[test]
256    fn examples() {
257        let needles = vec![
258            "fox",
259            "brown",
260            "vwxyz",
261            "yz",
262            "ijk",
263            "ijklm",
264        ];
265        let haystacks = vec![
266            "The quick brown fox jumped over the lazy dog.",
267            "abcdefghijklmnopqrstuvwxyz",
268        ];
269
270        let wm = TwoByteWM::new(&needles);
271        let ac = AcAutomaton::new(&needles);
272        for hay in &haystacks {
273            let wm_answer: Vec<Match> = wm.find(hay).collect();
274            let ac_answer: Vec<Match> = ac.find(hay)
275                .map(|m| Match { start: m.start, end: m.end, pat_idx: m.pati })
276                .collect();
277            assert_eq!(wm_answer, ac_answer);
278        }
279    }
280
281    #[test]
282    fn binary() {
283        let mut needles: Vec<Vec<u8>> = Vec::new();
284        needles.push("foo".into());
285        needles.push("\x00\x0f".into());
286        let haystack: Vec<u8> = "--foo-b\x00\x0f".into();
287
288        let wm = TwoByteWM::new(&needles);
289        let results = wm.find(haystack).collect::<Vec<_>>();
290
291        assert_eq!(results.len(), 2, "expected 2 results");
292
293        assert_eq!(results[0].pat_idx, 0);
294        assert_eq!(results[0].start, 2);
295        assert_eq!(results[0].end, 5);
296
297        assert_eq!(results[1].pat_idx, 1);
298        assert_eq!(results[1].start, 7);
299        assert_eq!(results[1].end, 9);
300    }
301
302    #[test]
303    fn match_at_beginning() {
304        let needles = vec![
305            "Hello world",
306            "it is a beautiful day",
307            "somewhere."
308        ];
309
310        let haystack = "it is a beautiful day in Cairo";
311        let wm = TwoByteWM::new(&needles);
312        let results = wm.find(haystack).collect::<Vec<_>>();
313
314        assert_eq!(results.len(), 1);
315        assert_eq!(results[0].pat_idx, 1);
316    }
317}
318
319