wu_manber/lib.rs
1// Copyright 2015 Joe Neeman.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms
8
9//! This crate gives an implementation of Wu and Manber's algorithm for finding one of several
10//! strings (which we will call "needles") in a much larger string (the "haystack"). This is not
11//! to be confused with Wu and Manber's algorithm for fuzzy matching.
12//!
13//! The Wu-Manber algorithm is very efficient when all of the strings to be matched are long. It
14//! requires a pre-processing step with a fair amount of memory overhead -- currently about 32kb in
15//! this implementation, but future improvements may reduce that when there are not too many
16//! needles.
17//!
18//! This implementation supports a maximum of 65536 needles, each of which can be at most 65536
19//! bytes long. These requirements may be relaxed in the future.
20//!
21//! # Example
22//! ```
23//! use wu_manber::{Match, TwoByteWM};
24//! let needles = vec!["quick", "brown", "lazy", "wombat"];
25//! let haystack = "The quick brown fox jumps over the lazy dog.";
26//! let searcher = TwoByteWM::new(&needles);
27//! let mat = searcher.find(haystack).next().unwrap();
28//! assert_eq!(mat, Match { start: 4, end: 9, pat_idx: 0 });
29//! ```
30
31use std::cmp::min;
32
33#[cfg(test)]
34extern crate aho_corasick;
35
36/// This is the type for indexing into the bytes of the needles. Its size determines the maximum
37/// length of a needle.
38type NByteIdx = u16;
39
40/// This is the type for indexing into the list of needles. Its size determines the maximum number
41/// of needles.
42type NeedleIdx = u16;
43
44/// `TwoByteWM` stores the precomputed tables needed for a two-byte-wide implementation of the
45/// Wu-Manber algorithm.
46///
47/// "Two-byte-wide" means that the search phase in the Wu-Manber algorithm uses spans of two bytes
48/// to look for potential matches. This is suitable for moderately sized sets of needles; if there
49/// are too many needles then it might be faster to use spans of three bytes (but that isn't yet
50/// implemented by this crate).
51#[derive(Debug)]
52pub struct TwoByteWM {
53 /// The needles that we are trying to match against, and their indices.
54 ///
55 /// Each of the needles has length (in bytes) at least 2. They are sorted in increasing order
56 /// of the hash value of their two critical bytes.
57 needles: Vec<(usize, Vec<u8>)>,
58
59 /// For each of the needles above, this contains the first two bytes, concatenated into a
60 /// `u16`.
61 ///
62 /// This `Vec` is indexed in the same way as `needles`.
63 prefix: Vec<u16>,
64
65 /// The minimimum length of any needle.
66 pat_len: NByteIdx,
67
68 /// If `shift[HashFn(a, b)] = i` then no needle contains the two-byte string `ab` starting
69 /// anywhere between positions `pat_len - 2 - i` and `pat_len - 2`.
70 ///
71 /// Note that because this `Vec` can be quite long, we might save a substantial amount of space
72 /// by shrinking the size of `NByteIdx`.
73 shift: Vec<NByteIdx>,
74
75 /// If `hash[HashFn(a, b)] = i` then the needles whose critical bytes hash to `HashFn(a, b)`
76 /// begin at `needles[i]`.
77 ///
78 /// Note that because this `Vec` can be quite long, we might save a substantial amount of space
79 /// by shrinking the size of `NeedleIdx`.
80 hash: Vec<NeedleIdx>,
81}
82
83#[derive(Debug, PartialEq)]
84pub struct Match {
85 pub start: usize,
86 pub end: usize,
87 pub pat_idx: usize,
88}
89
90pub struct Matches<'a, P: AsRef<[u8]>> {
91 wm: &'a TwoByteWM,
92 haystack: P,
93 cur_pos: usize,
94}
95impl<'a, P> Iterator for Matches<'a, P>
96where
97 P: AsRef<[u8]>,
98{
99 type Item = Match;
100 fn next(&mut self) -> Option<Match> {
101 self.wm
102 .find_from(self.haystack.as_ref(), self.cur_pos)
103 .map(|m| {
104 self.cur_pos = m.end;
105 m
106 })
107 }
108}
109
110/// For now, we default to this hash function (which is the one from the original paper of Wu and
111/// Manber). In the future, we may want to look for a better one depending on the needles.
112fn hash_fn(a: u8, b: u8) -> NeedleIdx {
113 ((a as NeedleIdx) << 5) + (b as NeedleIdx)
114}
115
116const HASH_MAX: usize = (0xFFusize << 5) + 0xFF;
117
118impl TwoByteWM {
119 fn pat(&self, p_idx: NeedleIdx) -> &[u8] {
120 &self.needles[p_idx as usize].1
121 }
122
123 fn pat_idx(&self, p_idx: NeedleIdx) -> usize {
124 self.needles[p_idx as usize].0
125 }
126
127 /// Creates lookup tables to efficiently search for the given needles.
128 ///
129 /// The order of `needles` is significant, since all `Match`es returned from this `TwoByteWM`
130 /// will include an index into `needles` saying which needle matched.
131 pub fn new<I, P>(needles: I) -> TwoByteWM
132 where
133 P: AsRef<[u8]>,
134 I: IntoIterator<Item = P>,
135 {
136 let needles: Vec<_> = needles.into_iter().map(|s| s.as_ref().to_vec()).collect();
137 if needles.is_empty() {
138 panic!("cannot create TwoByteWM from an empty set of needles");
139 } else if needles.len() > NeedleIdx::max_value() as usize {
140 panic!("too many needles");
141 }
142
143 let pat_len = needles.iter().map(|p| p.len()).min().unwrap();
144 if pat_len < 2 {
145 panic!("all needles must have length (in bytes) at least 2");
146 } else if pat_len > NByteIdx::max_value() as usize {
147 panic!("these needles are too long");
148 }
149 let pat_len = pat_len as NByteIdx;
150
151 let h = |p: &[u8]| hash_fn(p[(pat_len-2) as usize], p[(pat_len-1) as usize]);
152 let mut needles: Vec<_> = needles.into_iter().enumerate().collect();
153 needles.sort_by(|p, q| h(&p.1).cmp(&h(&q.1)));
154 let needles = needles;
155 let prefix: Vec<_> = needles.iter()
156 .map(|p| ((p.1[0] as u16) << 8) + (p.1[1] as u16))
157 .collect();
158
159 let mut hash = vec![0; HASH_MAX + 2];
160 for (p_idx, &(_, ref p)) in needles.iter().enumerate().rev() {
161 let h_idx = h(&p) as usize;
162 hash[h_idx] = p_idx as NeedleIdx;
163 if hash[h_idx + 1] == 0 {
164 hash[h_idx + 1] = p_idx as NeedleIdx + 1;
165 }
166 }
167
168 let mut shift = vec![pat_len - 1; HASH_MAX + 1];
169 for &(_, ref p) in &needles {
170 for p_pos in 0..(pat_len - 1) {
171 let h = hash_fn(p[p_pos as usize], p[(p_pos + 1) as usize]);
172 shift[h as usize] = min(shift[h as usize], pat_len - p_pos - 2);
173 }
174 }
175
176 TwoByteWM {
177 needles: needles,
178 prefix: prefix,
179 pat_len: pat_len,
180 shift: shift,
181 hash: hash,
182 }
183 }
184
185 /// Searches for a single match, starting from the given byte offset.
186 pub fn find_from<P>(&self, haystack: P, offset: usize) -> Option<Match>
187 where
188 P: AsRef<[u8]>,
189 {
190 // `pos` points to the index in `haystack` that we are trying to align against the index
191 // `pat_len - 1` of the needles.
192 let pat_len = self.pat_len as usize;
193 let mut pos = offset + pat_len - 1;
194 let haystack = haystack.as_ref();
195 while pos <= haystack.len() - 1 {
196 let h = hash_fn(haystack[pos - 1], haystack[pos]) as usize;
197 let shift = self.shift[h] as usize;
198 if shift == 0 {
199 // We might have matched the end of some needle. Iterate over all the needles
200 // that we might have matched, and see if they match the beginning.
201 let a = haystack[1 + pos - pat_len];
202 let b = haystack[2 + pos - pat_len];
203 let prefix = ((a as u16) << 8) + (b as u16);
204 let mut found: Option<NeedleIdx> = None;
205 for p_idx in self.hash[h]..self.hash[h+1] {
206 if self.prefix[p_idx as usize] == prefix {
207 // The prefix matches too, so now check for the full match.
208 let p = self.pat(p_idx);
209 if haystack[(1 + pos - pat_len)..].starts_with(&p) {
210 found = match found {
211 None => Some(p_idx),
212 Some(q_idx) => {
213 let q = self.pat(q_idx);
214 Some(if p.len() < q.len() { p_idx } else { q_idx })
215 }
216 }
217 }
218 }
219 }
220 if let Some(p_idx) = found {
221 return Some(Match {
222 start: 1 + pos - pat_len,
223 end: 1 + pos - pat_len + self.pat(p_idx).len(),
224 pat_idx: self.pat_idx(p_idx),
225 })
226 }
227
228 pos += 1;
229 } else {
230 pos += shift;
231 }
232 }
233
234 None
235 }
236
237 /// Returns an iterator over non-overlapping matches.
238 pub fn find<'a, 'b, P>(&'a self, haystack: P) -> Matches<'a, P>
239 where
240 P: AsRef<[u8]> + 'b,
241 {
242 Matches {
243 wm: &self,
244 haystack,
245 cur_pos: 0,
246 }
247 }
248}
249
250#[cfg(test)]
251mod tests {
252 use ::{Match, TwoByteWM};
253 use aho_corasick::{AcAutomaton, Automaton};
254
255 #[test]
256 fn examples() {
257 let needles = vec![
258 "fox",
259 "brown",
260 "vwxyz",
261 "yz",
262 "ijk",
263 "ijklm",
264 ];
265 let haystacks = vec![
266 "The quick brown fox jumped over the lazy dog.",
267 "abcdefghijklmnopqrstuvwxyz",
268 ];
269
270 let wm = TwoByteWM::new(&needles);
271 let ac = AcAutomaton::new(&needles);
272 for hay in &haystacks {
273 let wm_answer: Vec<Match> = wm.find(hay).collect();
274 let ac_answer: Vec<Match> = ac.find(hay)
275 .map(|m| Match { start: m.start, end: m.end, pat_idx: m.pati })
276 .collect();
277 assert_eq!(wm_answer, ac_answer);
278 }
279 }
280
281 #[test]
282 fn binary() {
283 let mut needles: Vec<Vec<u8>> = Vec::new();
284 needles.push("foo".into());
285 needles.push("\x00\x0f".into());
286 let haystack: Vec<u8> = "--foo-b\x00\x0f".into();
287
288 let wm = TwoByteWM::new(&needles);
289 let results = wm.find(haystack).collect::<Vec<_>>();
290
291 assert_eq!(results.len(), 2, "expected 2 results");
292
293 assert_eq!(results[0].pat_idx, 0);
294 assert_eq!(results[0].start, 2);
295 assert_eq!(results[0].end, 5);
296
297 assert_eq!(results[1].pat_idx, 1);
298 assert_eq!(results[1].start, 7);
299 assert_eq!(results[1].end, 9);
300 }
301
302 #[test]
303 fn match_at_beginning() {
304 let needles = vec![
305 "Hello world",
306 "it is a beautiful day",
307 "somewhere."
308 ];
309
310 let haystack = "it is a beautiful day in Cairo";
311 let wm = TwoByteWM::new(&needles);
312 let results = wm.find(haystack).collect::<Vec<_>>();
313
314 assert_eq!(results.len(), 1);
315 assert_eq!(results[0].pat_idx, 1);
316 }
317}
318
319