| 1 | /*! |
| 2 | Provides routines for extracting literal prefixes and suffixes from an `Hir`. |
| 3 | */ |
| 4 | |
| 5 | use std::cmp; |
| 6 | use std::fmt; |
| 7 | use std::iter; |
| 8 | use std::mem; |
| 9 | use std::ops; |
| 10 | |
| 11 | use crate::hir::{self, Hir, HirKind}; |
| 12 | |
| 13 | /// A set of literal byte strings extracted from a regular expression. |
| 14 | /// |
| 15 | /// Every member of the set is a `Literal`, which is represented by a |
| 16 | /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is |
| 17 | /// said to be either *complete* or *cut*. A complete literal means that |
| 18 | /// it extends until the beginning (or end) of the regular expression. In |
| 19 | /// some circumstances, this can be used to indicate a match in the regular |
| 20 | /// expression. |
| 21 | /// |
| 22 | /// A key aspect of literal extraction is knowing when to stop. It is not |
| 23 | /// feasible to blindly extract all literals from a regular expression, even if |
| 24 | /// there are finitely many. For example, the regular expression `[0-9]{10}` |
| 25 | /// has `10^10` distinct literals. For this reason, literal extraction is |
| 26 | /// bounded to some low number by default using heuristics, but the limits can |
| 27 | /// be tweaked. |
| 28 | /// |
| 29 | /// **WARNING**: Literal extraction uses stack space proportional to the size |
| 30 | /// of the `Hir` expression. At some point, this drawback will be eliminated. |
| 31 | /// To protect yourself, set a reasonable |
| 32 | /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). |
| 33 | /// This is done for you by default. |
| 34 | #[derive (Clone, Eq, PartialEq)] |
| 35 | pub struct Literals { |
| 36 | lits: Vec<Literal>, |
| 37 | limit_size: usize, |
| 38 | limit_class: usize, |
| 39 | } |
| 40 | |
| 41 | /// A single member of a set of literals extracted from a regular expression. |
| 42 | /// |
| 43 | /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice |
| 44 | /// and `Vec` operations are available. |
| 45 | #[derive (Clone, Eq, Ord)] |
| 46 | pub struct Literal { |
| 47 | v: Vec<u8>, |
| 48 | cut: bool, |
| 49 | } |
| 50 | |
| 51 | impl Literals { |
| 52 | /// Returns a new empty set of literals using default limits. |
| 53 | pub fn empty() -> Literals { |
| 54 | Literals { lits: vec![], limit_size: 250, limit_class: 10 } |
| 55 | } |
| 56 | |
| 57 | /// Returns a set of literal prefixes extracted from the given `Hir`. |
| 58 | pub fn prefixes(expr: &Hir) -> Literals { |
| 59 | let mut lits = Literals::empty(); |
| 60 | lits.union_prefixes(expr); |
| 61 | lits |
| 62 | } |
| 63 | |
| 64 | /// Returns a set of literal suffixes extracted from the given `Hir`. |
| 65 | pub fn suffixes(expr: &Hir) -> Literals { |
| 66 | let mut lits = Literals::empty(); |
| 67 | lits.union_suffixes(expr); |
| 68 | lits |
| 69 | } |
| 70 | |
| 71 | /// Get the approximate size limit (in bytes) of this set. |
| 72 | pub fn limit_size(&self) -> usize { |
| 73 | self.limit_size |
| 74 | } |
| 75 | |
| 76 | /// Set the approximate size limit (in bytes) of this set. |
| 77 | /// |
| 78 | /// If extracting a literal would put the set over this limit, then |
| 79 | /// extraction stops. |
| 80 | /// |
| 81 | /// The new limits will only apply to additions to this set. Existing |
| 82 | /// members remain unchanged, even if the set exceeds the new limit. |
| 83 | pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { |
| 84 | self.limit_size = size; |
| 85 | self |
| 86 | } |
| 87 | |
| 88 | /// Get the character class size limit for this set. |
| 89 | pub fn limit_class(&self) -> usize { |
| 90 | self.limit_class |
| 91 | } |
| 92 | |
| 93 | /// Limits the size of character(or byte) classes considered. |
| 94 | /// |
| 95 | /// A value of `0` prevents all character classes from being considered. |
| 96 | /// |
| 97 | /// This limit also applies to case insensitive literals, since each |
| 98 | /// character in the case insensitive literal is converted to a class, and |
| 99 | /// then case folded. |
| 100 | /// |
| 101 | /// The new limits will only apply to additions to this set. Existing |
| 102 | /// members remain unchanged, even if the set exceeds the new limit. |
| 103 | pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { |
| 104 | self.limit_class = size; |
| 105 | self |
| 106 | } |
| 107 | |
| 108 | /// Returns the set of literals as a slice. Its order is unspecified. |
| 109 | pub fn literals(&self) -> &[Literal] { |
| 110 | &self.lits |
| 111 | } |
| 112 | |
| 113 | /// Returns the length of the smallest literal. |
| 114 | /// |
| 115 | /// Returns None is there are no literals in the set. |
| 116 | pub fn min_len(&self) -> Option<usize> { |
| 117 | let mut min = None; |
| 118 | for lit in &self.lits { |
| 119 | match min { |
| 120 | None => min = Some(lit.len()), |
| 121 | Some(m) if lit.len() < m => min = Some(lit.len()), |
| 122 | _ => {} |
| 123 | } |
| 124 | } |
| 125 | min |
| 126 | } |
| 127 | |
| 128 | /// Returns true if all members in this set are complete. |
| 129 | pub fn all_complete(&self) -> bool { |
| 130 | !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) |
| 131 | } |
| 132 | |
| 133 | /// Returns true if any member in this set is complete. |
| 134 | pub fn any_complete(&self) -> bool { |
| 135 | self.lits.iter().any(|lit| !lit.is_cut()) |
| 136 | } |
| 137 | |
| 138 | /// Returns true if this set contains an empty literal. |
| 139 | pub fn contains_empty(&self) -> bool { |
| 140 | self.lits.iter().any(|lit| lit.is_empty()) |
| 141 | } |
| 142 | |
| 143 | /// Returns true if this set is empty or if all of its members is empty. |
| 144 | pub fn is_empty(&self) -> bool { |
| 145 | self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) |
| 146 | } |
| 147 | |
| 148 | /// Returns a new empty set of literals using this set's limits. |
| 149 | pub fn to_empty(&self) -> Literals { |
| 150 | let mut lits = Literals::empty(); |
| 151 | lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); |
| 152 | lits |
| 153 | } |
| 154 | |
| 155 | /// Returns the longest common prefix of all members in this set. |
| 156 | pub fn longest_common_prefix(&self) -> &[u8] { |
| 157 | if self.is_empty() { |
| 158 | return &[]; |
| 159 | } |
| 160 | let lit0 = &*self.lits[0]; |
| 161 | let mut len = lit0.len(); |
| 162 | for lit in &self.lits[1..] { |
| 163 | len = cmp::min( |
| 164 | len, |
| 165 | lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), |
| 166 | ); |
| 167 | } |
| 168 | &self.lits[0][..len] |
| 169 | } |
| 170 | |
| 171 | /// Returns the longest common suffix of all members in this set. |
| 172 | pub fn longest_common_suffix(&self) -> &[u8] { |
| 173 | if self.is_empty() { |
| 174 | return &[]; |
| 175 | } |
| 176 | let lit0 = &*self.lits[0]; |
| 177 | let mut len = lit0.len(); |
| 178 | for lit in &self.lits[1..] { |
| 179 | len = cmp::min( |
| 180 | len, |
| 181 | lit.iter() |
| 182 | .rev() |
| 183 | .zip(lit0.iter().rev()) |
| 184 | .take_while(|&(a, b)| a == b) |
| 185 | .count(), |
| 186 | ); |
| 187 | } |
| 188 | &self.lits[0][self.lits[0].len() - len..] |
| 189 | } |
| 190 | |
| 191 | /// Returns a new set of literals with the given number of bytes trimmed |
| 192 | /// from the suffix of each literal. |
| 193 | /// |
| 194 | /// If any literal would be cut out completely by trimming, then None is |
| 195 | /// returned. |
| 196 | /// |
| 197 | /// Any duplicates that are created as a result of this transformation are |
| 198 | /// removed. |
| 199 | pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { |
| 200 | if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { |
| 201 | return None; |
| 202 | } |
| 203 | let mut new = self.to_empty(); |
| 204 | for mut lit in self.lits.iter().cloned() { |
| 205 | let new_len = lit.len() - num_bytes; |
| 206 | lit.truncate(new_len); |
| 207 | lit.cut(); |
| 208 | new.lits.push(lit); |
| 209 | } |
| 210 | new.lits.sort(); |
| 211 | new.lits.dedup(); |
| 212 | Some(new) |
| 213 | } |
| 214 | |
| 215 | /// Returns a new set of prefixes of this set of literals that are |
| 216 | /// guaranteed to be unambiguous. |
| 217 | /// |
| 218 | /// Any substring match with a member of the set is returned is guaranteed |
| 219 | /// to never overlap with a substring match of another member of the set |
| 220 | /// at the same starting position. |
| 221 | /// |
| 222 | /// Given any two members of the returned set, neither is a substring of |
| 223 | /// the other. |
| 224 | pub fn unambiguous_prefixes(&self) -> Literals { |
| 225 | if self.lits.is_empty() { |
| 226 | return self.to_empty(); |
| 227 | } |
| 228 | let mut old = self.lits.to_vec(); |
| 229 | let mut new = self.to_empty(); |
| 230 | 'OUTER: while let Some(mut candidate) = old.pop() { |
| 231 | if candidate.is_empty() { |
| 232 | continue; |
| 233 | } |
| 234 | if new.lits.is_empty() { |
| 235 | new.lits.push(candidate); |
| 236 | continue; |
| 237 | } |
| 238 | for lit2 in &mut new.lits { |
| 239 | if lit2.is_empty() { |
| 240 | continue; |
| 241 | } |
| 242 | if &candidate == lit2 { |
| 243 | // If the literal is already in the set, then we can |
| 244 | // just drop it. But make sure that cut literals are |
| 245 | // infectious! |
| 246 | candidate.cut = candidate.cut || lit2.cut; |
| 247 | lit2.cut = candidate.cut; |
| 248 | continue 'OUTER; |
| 249 | } |
| 250 | if candidate.len() < lit2.len() { |
| 251 | if let Some(i) = position(&candidate, &lit2) { |
| 252 | candidate.cut(); |
| 253 | let mut lit3 = lit2.clone(); |
| 254 | lit3.truncate(i); |
| 255 | lit3.cut(); |
| 256 | old.push(lit3); |
| 257 | lit2.clear(); |
| 258 | } |
| 259 | } else if let Some(i) = position(&lit2, &candidate) { |
| 260 | lit2.cut(); |
| 261 | let mut new_candidate = candidate.clone(); |
| 262 | new_candidate.truncate(i); |
| 263 | new_candidate.cut(); |
| 264 | old.push(new_candidate); |
| 265 | candidate.clear(); |
| 266 | } |
| 267 | // Oops, the candidate is already represented in the set. |
| 268 | if candidate.is_empty() { |
| 269 | continue 'OUTER; |
| 270 | } |
| 271 | } |
| 272 | new.lits.push(candidate); |
| 273 | } |
| 274 | new.lits.retain(|lit| !lit.is_empty()); |
| 275 | new.lits.sort(); |
| 276 | new.lits.dedup(); |
| 277 | new |
| 278 | } |
| 279 | |
| 280 | /// Returns a new set of suffixes of this set of literals that are |
| 281 | /// guaranteed to be unambiguous. |
| 282 | /// |
| 283 | /// Any substring match with a member of the set is returned is guaranteed |
| 284 | /// to never overlap with a substring match of another member of the set |
| 285 | /// at the same ending position. |
| 286 | /// |
| 287 | /// Given any two members of the returned set, neither is a substring of |
| 288 | /// the other. |
| 289 | pub fn unambiguous_suffixes(&self) -> Literals { |
| 290 | // This is a touch wasteful... |
| 291 | let mut lits = self.clone(); |
| 292 | lits.reverse(); |
| 293 | let mut unamb = lits.unambiguous_prefixes(); |
| 294 | unamb.reverse(); |
| 295 | unamb |
| 296 | } |
| 297 | |
| 298 | /// Unions the prefixes from the given expression to this set. |
| 299 | /// |
| 300 | /// If prefixes could not be added (for example, this set would exceed its |
| 301 | /// size limits or the set of prefixes from `expr` includes the empty |
| 302 | /// string), then false is returned. |
| 303 | /// |
| 304 | /// Note that prefix literals extracted from `expr` are said to be complete |
| 305 | /// if and only if the literal extends from the beginning of `expr` to the |
| 306 | /// end of `expr`. |
| 307 | pub fn union_prefixes(&mut self, expr: &Hir) -> bool { |
| 308 | let mut lits = self.to_empty(); |
| 309 | prefixes(expr, &mut lits); |
| 310 | !lits.is_empty() && !lits.contains_empty() && self.union(lits) |
| 311 | } |
| 312 | |
| 313 | /// Unions the suffixes from the given expression to this set. |
| 314 | /// |
| 315 | /// If suffixes could not be added (for example, this set would exceed its |
| 316 | /// size limits or the set of suffixes from `expr` includes the empty |
| 317 | /// string), then false is returned. |
| 318 | /// |
| 319 | /// Note that prefix literals extracted from `expr` are said to be complete |
| 320 | /// if and only if the literal extends from the end of `expr` to the |
| 321 | /// beginning of `expr`. |
| 322 | pub fn union_suffixes(&mut self, expr: &Hir) -> bool { |
| 323 | let mut lits = self.to_empty(); |
| 324 | suffixes(expr, &mut lits); |
| 325 | lits.reverse(); |
| 326 | !lits.is_empty() && !lits.contains_empty() && self.union(lits) |
| 327 | } |
| 328 | |
| 329 | /// Unions this set with another set. |
| 330 | /// |
| 331 | /// If the union would cause the set to exceed its limits, then the union |
| 332 | /// is skipped and it returns false. Otherwise, if the union succeeds, it |
| 333 | /// returns true. |
| 334 | pub fn union(&mut self, lits: Literals) -> bool { |
| 335 | if self.num_bytes() + lits.num_bytes() > self.limit_size { |
| 336 | return false; |
| 337 | } |
| 338 | if lits.is_empty() { |
| 339 | self.lits.push(Literal::empty()); |
| 340 | } else { |
| 341 | self.lits.extend(lits.lits); |
| 342 | } |
| 343 | true |
| 344 | } |
| 345 | |
| 346 | /// Extends this set with another set. |
| 347 | /// |
| 348 | /// The set of literals is extended via a cross product. |
| 349 | /// |
| 350 | /// If a cross product would cause this set to exceed its limits, then the |
| 351 | /// cross product is skipped and it returns false. Otherwise, if the cross |
| 352 | /// product succeeds, it returns true. |
| 353 | pub fn cross_product(&mut self, lits: &Literals) -> bool { |
| 354 | if lits.is_empty() { |
| 355 | return true; |
| 356 | } |
| 357 | // Check that we make sure we stay in our limits. |
| 358 | let mut size_after; |
| 359 | if self.is_empty() || !self.any_complete() { |
| 360 | size_after = self.num_bytes(); |
| 361 | for lits_lit in lits.literals() { |
| 362 | size_after += lits_lit.len(); |
| 363 | } |
| 364 | } else { |
| 365 | size_after = self.lits.iter().fold(0, |accum, lit| { |
| 366 | accum + if lit.is_cut() { lit.len() } else { 0 } |
| 367 | }); |
| 368 | for lits_lit in lits.literals() { |
| 369 | for self_lit in self.literals() { |
| 370 | if !self_lit.is_cut() { |
| 371 | size_after += self_lit.len() + lits_lit.len(); |
| 372 | } |
| 373 | } |
| 374 | } |
| 375 | } |
| 376 | if size_after > self.limit_size { |
| 377 | return false; |
| 378 | } |
| 379 | |
| 380 | let mut base = self.remove_complete(); |
| 381 | if base.is_empty() { |
| 382 | base = vec![Literal::empty()]; |
| 383 | } |
| 384 | for lits_lit in lits.literals() { |
| 385 | for mut self_lit in base.clone() { |
| 386 | self_lit.extend(&**lits_lit); |
| 387 | self_lit.cut = lits_lit.cut; |
| 388 | self.lits.push(self_lit); |
| 389 | } |
| 390 | } |
| 391 | true |
| 392 | } |
| 393 | |
| 394 | /// Extends each literal in this set with the bytes given. |
| 395 | /// |
| 396 | /// If the set is empty, then the given literal is added to the set. |
| 397 | /// |
| 398 | /// If adding any number of bytes to all members of this set causes a limit |
| 399 | /// to be exceeded, then no bytes are added and false is returned. If a |
| 400 | /// prefix of `bytes` can be fit into this set, then it is used and all |
| 401 | /// resulting literals are cut. |
| 402 | pub fn cross_add(&mut self, bytes: &[u8]) -> bool { |
| 403 | // N.B. This could be implemented by simply calling cross_product with |
| 404 | // a literal set containing just `bytes`, but we can be smarter about |
| 405 | // taking shorter prefixes of `bytes` if they'll fit. |
| 406 | if bytes.is_empty() { |
| 407 | return true; |
| 408 | } |
| 409 | if self.lits.is_empty() { |
| 410 | let i = cmp::min(self.limit_size, bytes.len()); |
| 411 | self.lits.push(Literal::new(bytes[..i].to_owned())); |
| 412 | self.lits[0].cut = i < bytes.len(); |
| 413 | return !self.lits[0].is_cut(); |
| 414 | } |
| 415 | let size = self.num_bytes(); |
| 416 | if size + self.lits.len() >= self.limit_size { |
| 417 | return false; |
| 418 | } |
| 419 | let mut i = 1; |
| 420 | while size + (i * self.lits.len()) <= self.limit_size |
| 421 | && i < bytes.len() |
| 422 | { |
| 423 | i += 1; |
| 424 | } |
| 425 | for lit in &mut self.lits { |
| 426 | if !lit.is_cut() { |
| 427 | lit.extend(&bytes[..i]); |
| 428 | if i < bytes.len() { |
| 429 | lit.cut(); |
| 430 | } |
| 431 | } |
| 432 | } |
| 433 | true |
| 434 | } |
| 435 | |
| 436 | /// Adds the given literal to this set. |
| 437 | /// |
| 438 | /// Returns false if adding this literal would cause the class to be too |
| 439 | /// big. |
| 440 | pub fn add(&mut self, lit: Literal) -> bool { |
| 441 | if self.num_bytes() + lit.len() > self.limit_size { |
| 442 | return false; |
| 443 | } |
| 444 | self.lits.push(lit); |
| 445 | true |
| 446 | } |
| 447 | |
| 448 | /// Extends each literal in this set with the character class given. |
| 449 | /// |
| 450 | /// Returns false if the character class was too big to add. |
| 451 | pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { |
| 452 | self._add_char_class(cls, false) |
| 453 | } |
| 454 | |
| 455 | /// Extends each literal in this set with the character class given, |
| 456 | /// writing the bytes of each character in reverse. |
| 457 | /// |
| 458 | /// Returns false if the character class was too big to add. |
| 459 | fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { |
| 460 | self._add_char_class(cls, true) |
| 461 | } |
| 462 | |
| 463 | fn _add_char_class( |
| 464 | &mut self, |
| 465 | cls: &hir::ClassUnicode, |
| 466 | reverse: bool, |
| 467 | ) -> bool { |
| 468 | use std::char; |
| 469 | |
| 470 | if self.class_exceeds_limits(cls_char_count(cls)) { |
| 471 | return false; |
| 472 | } |
| 473 | let mut base = self.remove_complete(); |
| 474 | if base.is_empty() { |
| 475 | base = vec![Literal::empty()]; |
| 476 | } |
| 477 | for r in cls.iter() { |
| 478 | let (s, e) = (r.start as u32, r.end as u32 + 1); |
| 479 | for c in (s..e).filter_map(char::from_u32) { |
| 480 | for mut lit in base.clone() { |
| 481 | let mut bytes = c.to_string().into_bytes(); |
| 482 | if reverse { |
| 483 | bytes.reverse(); |
| 484 | } |
| 485 | lit.extend(&bytes); |
| 486 | self.lits.push(lit); |
| 487 | } |
| 488 | } |
| 489 | } |
| 490 | true |
| 491 | } |
| 492 | |
| 493 | /// Extends each literal in this set with the byte class given. |
| 494 | /// |
| 495 | /// Returns false if the byte class was too big to add. |
| 496 | pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { |
| 497 | if self.class_exceeds_limits(cls_byte_count(cls)) { |
| 498 | return false; |
| 499 | } |
| 500 | let mut base = self.remove_complete(); |
| 501 | if base.is_empty() { |
| 502 | base = vec![Literal::empty()]; |
| 503 | } |
| 504 | for r in cls.iter() { |
| 505 | let (s, e) = (r.start as u32, r.end as u32 + 1); |
| 506 | for b in (s..e).map(|b| b as u8) { |
| 507 | for mut lit in base.clone() { |
| 508 | lit.push(b); |
| 509 | self.lits.push(lit); |
| 510 | } |
| 511 | } |
| 512 | } |
| 513 | true |
| 514 | } |
| 515 | |
| 516 | /// Cuts every member of this set. When a member is cut, it can never |
| 517 | /// be extended. |
| 518 | pub fn cut(&mut self) { |
| 519 | for lit in &mut self.lits { |
| 520 | lit.cut(); |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | /// Reverses all members in place. |
| 525 | pub fn reverse(&mut self) { |
| 526 | for lit in &mut self.lits { |
| 527 | lit.reverse(); |
| 528 | } |
| 529 | } |
| 530 | |
| 531 | /// Clears this set of all members. |
| 532 | pub fn clear(&mut self) { |
| 533 | self.lits.clear(); |
| 534 | } |
| 535 | |
| 536 | /// Pops all complete literals out of this set. |
| 537 | fn remove_complete(&mut self) -> Vec<Literal> { |
| 538 | let mut base = vec![]; |
| 539 | for lit in mem::replace(&mut self.lits, vec![]) { |
| 540 | if lit.is_cut() { |
| 541 | self.lits.push(lit); |
| 542 | } else { |
| 543 | base.push(lit); |
| 544 | } |
| 545 | } |
| 546 | base |
| 547 | } |
| 548 | |
| 549 | /// Returns the total number of bytes in this set. |
| 550 | fn num_bytes(&self) -> usize { |
| 551 | self.lits.iter().fold(0, |accum, lit| accum + lit.len()) |
| 552 | } |
| 553 | |
| 554 | /// Returns true if a character class with the given size would cause this |
| 555 | /// set to exceed its limits. |
| 556 | /// |
| 557 | /// The size given should correspond to the number of items in the class. |
| 558 | fn class_exceeds_limits(&self, size: usize) -> bool { |
| 559 | if size > self.limit_class { |
| 560 | return true; |
| 561 | } |
| 562 | // This is an approximation since codepoints in a char class can encode |
| 563 | // to 1-4 bytes. |
| 564 | let new_byte_count = if self.lits.is_empty() { |
| 565 | size |
| 566 | } else { |
| 567 | self.lits.iter().fold(0, |accum, lit| { |
| 568 | accum |
| 569 | + if lit.is_cut() { |
| 570 | // If the literal is cut, then we'll never add |
| 571 | // anything to it, so don't count it. |
| 572 | 0 |
| 573 | } else { |
| 574 | (lit.len() + 1) * size |
| 575 | } |
| 576 | }) |
| 577 | }; |
| 578 | new_byte_count > self.limit_size |
| 579 | } |
| 580 | } |
| 581 | |
| 582 | fn prefixes(expr: &Hir, lits: &mut Literals) { |
| 583 | match *expr.kind() { |
| 584 | HirKind::Literal(hir::Literal::Unicode(c)) => { |
| 585 | let mut buf = [0; 4]; |
| 586 | lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); |
| 587 | } |
| 588 | HirKind::Literal(hir::Literal::Byte(b)) => { |
| 589 | lits.cross_add(&[b]); |
| 590 | } |
| 591 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
| 592 | if !lits.add_char_class(cls) { |
| 593 | lits.cut(); |
| 594 | } |
| 595 | } |
| 596 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
| 597 | if !lits.add_byte_class(cls) { |
| 598 | lits.cut(); |
| 599 | } |
| 600 | } |
| 601 | HirKind::Group(hir::Group { ref hir, .. }) => { |
| 602 | prefixes(&**hir, lits); |
| 603 | } |
| 604 | HirKind::Repetition(ref x) => match x.kind { |
| 605 | hir::RepetitionKind::ZeroOrOne => { |
| 606 | repeat_zero_or_one_literals(&x.hir, lits, prefixes); |
| 607 | } |
| 608 | hir::RepetitionKind::ZeroOrMore => { |
| 609 | repeat_zero_or_more_literals(&x.hir, lits, prefixes); |
| 610 | } |
| 611 | hir::RepetitionKind::OneOrMore => { |
| 612 | repeat_one_or_more_literals(&x.hir, lits, prefixes); |
| 613 | } |
| 614 | hir::RepetitionKind::Range(ref rng) => { |
| 615 | let (min, max) = match *rng { |
| 616 | hir::RepetitionRange::Exactly(m) => (m, Some(m)), |
| 617 | hir::RepetitionRange::AtLeast(m) => (m, None), |
| 618 | hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), |
| 619 | }; |
| 620 | repeat_range_literals( |
| 621 | &x.hir, min, max, x.greedy, lits, prefixes, |
| 622 | ) |
| 623 | } |
| 624 | }, |
| 625 | HirKind::Concat(ref es) if es.is_empty() => {} |
| 626 | HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), |
| 627 | HirKind::Concat(ref es) => { |
| 628 | for e in es { |
| 629 | if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { |
| 630 | if !lits.is_empty() { |
| 631 | lits.cut(); |
| 632 | break; |
| 633 | } |
| 634 | lits.add(Literal::empty()); |
| 635 | continue; |
| 636 | } |
| 637 | let mut lits2 = lits.to_empty(); |
| 638 | prefixes(e, &mut lits2); |
| 639 | if !lits.cross_product(&lits2) || !lits2.any_complete() { |
| 640 | // If this expression couldn't yield any literal that |
| 641 | // could be extended, then we need to quit. Since we're |
| 642 | // short-circuiting, we also need to freeze every member. |
| 643 | lits.cut(); |
| 644 | break; |
| 645 | } |
| 646 | } |
| 647 | } |
| 648 | HirKind::Alternation(ref es) => { |
| 649 | alternate_literals(es, lits, prefixes); |
| 650 | } |
| 651 | _ => lits.cut(), |
| 652 | } |
| 653 | } |
| 654 | |
| 655 | fn suffixes(expr: &Hir, lits: &mut Literals) { |
| 656 | match *expr.kind() { |
| 657 | HirKind::Literal(hir::Literal::Unicode(c)) => { |
| 658 | let mut buf = [0u8; 4]; |
| 659 | let i = c.encode_utf8(&mut buf).len(); |
| 660 | let buf = &mut buf[..i]; |
| 661 | buf.reverse(); |
| 662 | lits.cross_add(buf); |
| 663 | } |
| 664 | HirKind::Literal(hir::Literal::Byte(b)) => { |
| 665 | lits.cross_add(&[b]); |
| 666 | } |
| 667 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
| 668 | if !lits.add_char_class_reverse(cls) { |
| 669 | lits.cut(); |
| 670 | } |
| 671 | } |
| 672 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
| 673 | if !lits.add_byte_class(cls) { |
| 674 | lits.cut(); |
| 675 | } |
| 676 | } |
| 677 | HirKind::Group(hir::Group { ref hir, .. }) => { |
| 678 | suffixes(&**hir, lits); |
| 679 | } |
| 680 | HirKind::Repetition(ref x) => match x.kind { |
| 681 | hir::RepetitionKind::ZeroOrOne => { |
| 682 | repeat_zero_or_one_literals(&x.hir, lits, suffixes); |
| 683 | } |
| 684 | hir::RepetitionKind::ZeroOrMore => { |
| 685 | repeat_zero_or_more_literals(&x.hir, lits, suffixes); |
| 686 | } |
| 687 | hir::RepetitionKind::OneOrMore => { |
| 688 | repeat_one_or_more_literals(&x.hir, lits, suffixes); |
| 689 | } |
| 690 | hir::RepetitionKind::Range(ref rng) => { |
| 691 | let (min, max) = match *rng { |
| 692 | hir::RepetitionRange::Exactly(m) => (m, Some(m)), |
| 693 | hir::RepetitionRange::AtLeast(m) => (m, None), |
| 694 | hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), |
| 695 | }; |
| 696 | repeat_range_literals( |
| 697 | &x.hir, min, max, x.greedy, lits, suffixes, |
| 698 | ) |
| 699 | } |
| 700 | }, |
| 701 | HirKind::Concat(ref es) if es.is_empty() => {} |
| 702 | HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), |
| 703 | HirKind::Concat(ref es) => { |
| 704 | for e in es.iter().rev() { |
| 705 | if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { |
| 706 | if !lits.is_empty() { |
| 707 | lits.cut(); |
| 708 | break; |
| 709 | } |
| 710 | lits.add(Literal::empty()); |
| 711 | continue; |
| 712 | } |
| 713 | let mut lits2 = lits.to_empty(); |
| 714 | suffixes(e, &mut lits2); |
| 715 | if !lits.cross_product(&lits2) || !lits2.any_complete() { |
| 716 | // If this expression couldn't yield any literal that |
| 717 | // could be extended, then we need to quit. Since we're |
| 718 | // short-circuiting, we also need to freeze every member. |
| 719 | lits.cut(); |
| 720 | break; |
| 721 | } |
| 722 | } |
| 723 | } |
| 724 | HirKind::Alternation(ref es) => { |
| 725 | alternate_literals(es, lits, suffixes); |
| 726 | } |
| 727 | _ => lits.cut(), |
| 728 | } |
| 729 | } |
| 730 | |
| 731 | fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( |
| 732 | e: &Hir, |
| 733 | lits: &mut Literals, |
| 734 | mut f: F, |
| 735 | ) { |
| 736 | f( |
| 737 | &Hir::repetition(rep:hir::Repetition { |
| 738 | kind: hir::RepetitionKind::ZeroOrMore, |
| 739 | // FIXME: Our literal extraction doesn't care about greediness. |
| 740 | // Which is partially why we're treating 'e?' as 'e*'. Namely, |
| 741 | // 'ab??' yields [Complete(ab), Complete(a)], but it should yield |
| 742 | // [Complete(a), Complete(ab)] because of the non-greediness. |
| 743 | greedy: true, |
| 744 | hir: Box::new(e.clone()), |
| 745 | }), |
| 746 | lits, |
| 747 | ); |
| 748 | } |
| 749 | |
| 750 | fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( |
| 751 | e: &Hir, |
| 752 | lits: &mut Literals, |
| 753 | mut f: F, |
| 754 | ) { |
| 755 | let (mut lits2: Literals, mut lits3: Literals) = (lits.clone(), lits.to_empty()); |
| 756 | lits3.set_limit_size(lits.limit_size() / 2); |
| 757 | f(e, &mut lits3); |
| 758 | |
| 759 | if lits3.is_empty() || !lits2.cross_product(&lits3) { |
| 760 | lits.cut(); |
| 761 | return; |
| 762 | } |
| 763 | lits2.cut(); |
| 764 | lits2.add(lit:Literal::empty()); |
| 765 | if !lits.union(lits:lits2) { |
| 766 | lits.cut(); |
| 767 | } |
| 768 | } |
| 769 | |
| 770 | fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( |
| 771 | e: &Hir, |
| 772 | lits: &mut Literals, |
| 773 | mut f: F, |
| 774 | ) { |
| 775 | f(e, lits); |
| 776 | lits.cut(); |
| 777 | } |
| 778 | |
| 779 | fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( |
| 780 | e: &Hir, |
| 781 | min: u32, |
| 782 | max: Option<u32>, |
| 783 | greedy: bool, |
| 784 | lits: &mut Literals, |
| 785 | mut f: F, |
| 786 | ) { |
| 787 | if min == 0 { |
| 788 | // This is a bit conservative. If `max` is set, then we could |
| 789 | // treat this as a finite set of alternations. For now, we |
| 790 | // just treat it as `e*`. |
| 791 | f( |
| 792 | &Hir::repetition(hir::Repetition { |
| 793 | kind: hir::RepetitionKind::ZeroOrMore, |
| 794 | greedy, |
| 795 | hir: Box::new(e.clone()), |
| 796 | }), |
| 797 | lits, |
| 798 | ); |
| 799 | } else { |
| 800 | if min > 0 { |
| 801 | let n = cmp::min(lits.limit_size, min as usize); |
| 802 | let es = iter::repeat(e.clone()).take(n).collect(); |
| 803 | f(&Hir::concat(es), lits); |
| 804 | if n < min as usize || lits.contains_empty() { |
| 805 | lits.cut(); |
| 806 | } |
| 807 | } |
| 808 | if max.map_or(true, |max| min < max) { |
| 809 | lits.cut(); |
| 810 | } |
| 811 | } |
| 812 | } |
| 813 | |
| 814 | fn alternate_literals<F: FnMut(&Hir, &mut Literals)>( |
| 815 | es: &[Hir], |
| 816 | lits: &mut Literals, |
| 817 | mut f: F, |
| 818 | ) { |
| 819 | let mut lits2: Literals = lits.to_empty(); |
| 820 | for e: &Hir in es { |
| 821 | let mut lits3: Literals = lits.to_empty(); |
| 822 | lits3.set_limit_size(lits.limit_size() / 5); |
| 823 | f(e, &mut lits3); |
| 824 | if lits3.is_empty() || !lits2.union(lits:lits3) { |
| 825 | // If we couldn't find suffixes for *any* of the |
| 826 | // alternates, then the entire alternation has to be thrown |
| 827 | // away and any existing members must be frozen. Similarly, |
| 828 | // if the union couldn't complete, stop and freeze. |
| 829 | lits.cut(); |
| 830 | return; |
| 831 | } |
| 832 | } |
| 833 | if !lits.cross_product(&lits2) { |
| 834 | lits.cut(); |
| 835 | } |
| 836 | } |
| 837 | |
| 838 | impl fmt::Debug for Literals { |
| 839 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 840 | f&mut DebugStruct<'_, '_>.debug_struct("Literals" ) |
| 841 | .field("lits" , &self.lits) |
| 842 | .field("limit_size" , &self.limit_size) |
| 843 | .field(name:"limit_class" , &self.limit_class) |
| 844 | .finish() |
| 845 | } |
| 846 | } |
| 847 | |
| 848 | impl Literal { |
| 849 | /// Returns a new complete literal with the bytes given. |
| 850 | pub fn new(bytes: Vec<u8>) -> Literal { |
| 851 | Literal { v: bytes, cut: false } |
| 852 | } |
| 853 | |
| 854 | /// Returns a new complete empty literal. |
| 855 | pub fn empty() -> Literal { |
| 856 | Literal { v: vec![], cut: false } |
| 857 | } |
| 858 | |
| 859 | /// Returns true if this literal was "cut." |
| 860 | pub fn is_cut(&self) -> bool { |
| 861 | self.cut |
| 862 | } |
| 863 | |
| 864 | /// Cuts this literal. |
| 865 | pub fn cut(&mut self) { |
| 866 | self.cut = true; |
| 867 | } |
| 868 | } |
| 869 | |
| 870 | impl PartialEq for Literal { |
| 871 | fn eq(&self, other: &Literal) -> bool { |
| 872 | self.v == other.v |
| 873 | } |
| 874 | } |
| 875 | |
| 876 | impl PartialOrd for Literal { |
| 877 | fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> { |
| 878 | self.v.partial_cmp(&other.v) |
| 879 | } |
| 880 | } |
| 881 | |
| 882 | impl fmt::Debug for Literal { |
| 883 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 884 | if self.is_cut() { |
| 885 | write!(f, "Cut( {})" , escape_unicode(&self.v)) |
| 886 | } else { |
| 887 | write!(f, "Complete( {})" , escape_unicode(&self.v)) |
| 888 | } |
| 889 | } |
| 890 | } |
| 891 | |
| 892 | impl AsRef<[u8]> for Literal { |
| 893 | fn as_ref(&self) -> &[u8] { |
| 894 | &self.v |
| 895 | } |
| 896 | } |
| 897 | |
| 898 | impl ops::Deref for Literal { |
| 899 | type Target = Vec<u8>; |
| 900 | fn deref(&self) -> &Vec<u8> { |
| 901 | &self.v |
| 902 | } |
| 903 | } |
| 904 | |
| 905 | impl ops::DerefMut for Literal { |
| 906 | fn deref_mut(&mut self) -> &mut Vec<u8> { |
| 907 | &mut self.v |
| 908 | } |
| 909 | } |
| 910 | |
| 911 | fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { |
| 912 | let mut i: usize = 0; |
| 913 | while haystack.len() >= needle.len() { |
| 914 | if needle == &haystack[..needle.len()] { |
| 915 | return Some(i); |
| 916 | } |
| 917 | i += 1; |
| 918 | haystack = &haystack[1..]; |
| 919 | } |
| 920 | None |
| 921 | } |
| 922 | |
| 923 | fn escape_unicode(bytes: &[u8]) -> String { |
| 924 | let show: String = match ::std::str::from_utf8(bytes) { |
| 925 | Ok(v: &str) => v.to_string(), |
| 926 | Err(_) => escape_bytes(bytes), |
| 927 | }; |
| 928 | let mut space_escaped: String = String::new(); |
| 929 | for c: char in show.chars() { |
| 930 | if c.is_whitespace() { |
| 931 | let escaped: String = if c as u32 <= 0x7F { |
| 932 | escape_byte(c as u8) |
| 933 | } else if c as u32 <= 0xFFFF { |
| 934 | format!(r"\u{{ {:04x}}}" , c as u32) |
| 935 | } else { |
| 936 | format!(r"\U {{{:08x}}}" , c as u32) |
| 937 | }; |
| 938 | space_escaped.push_str(&escaped); |
| 939 | } else { |
| 940 | space_escaped.push(ch:c); |
| 941 | } |
| 942 | } |
| 943 | space_escaped |
| 944 | } |
| 945 | |
| 946 | fn escape_bytes(bytes: &[u8]) -> String { |
| 947 | let mut s: String = String::new(); |
| 948 | for &b: u8 in bytes { |
| 949 | s.push_str(&escape_byte(b)); |
| 950 | } |
| 951 | s |
| 952 | } |
| 953 | |
| 954 | fn escape_byte(byte: u8) -> String { |
| 955 | use std::ascii::escape_default; |
| 956 | |
| 957 | let escaped: Vec<u8> = escape_default(byte).collect(); |
| 958 | String::from_utf8_lossy(&escaped).into_owned() |
| 959 | } |
| 960 | |
| 961 | fn cls_char_count(cls: &hir::ClassUnicode) -> usize { |
| 962 | cls.iter().map(|&r: ClassUnicodeRange| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() |
| 963 | as usize |
| 964 | } |
| 965 | |
| 966 | fn cls_byte_count(cls: &hir::ClassBytes) -> usize { |
| 967 | cls.iter().map(|&r: ClassBytesRange| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() |
| 968 | as usize |
| 969 | } |
| 970 | |
| 971 | #[cfg (test)] |
| 972 | mod tests { |
| 973 | use std::fmt; |
| 974 | |
| 975 | use super::{escape_bytes, Literal, Literals}; |
| 976 | use crate::hir::Hir; |
| 977 | use crate::ParserBuilder; |
| 978 | |
| 979 | // To make test failures easier to read. |
| 980 | #[derive (Debug, Eq, PartialEq)] |
| 981 | struct Bytes(Vec<ULiteral>); |
| 982 | #[derive (Debug, Eq, PartialEq)] |
| 983 | struct Unicode(Vec<ULiteral>); |
| 984 | |
| 985 | fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> { |
| 986 | let mut ulits = vec![]; |
| 987 | for blit in blits { |
| 988 | ulits |
| 989 | .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); |
| 990 | } |
| 991 | ulits |
| 992 | } |
| 993 | |
| 994 | fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals { |
| 995 | Literals { |
| 996 | lits: it.into_iter().collect(), |
| 997 | limit_size: 0, |
| 998 | limit_class: 0, |
| 999 | } |
| 1000 | } |
| 1001 | |
| 1002 | // Needs to be pub for 1.3? |
| 1003 | #[derive (Clone, Eq, PartialEq)] |
| 1004 | pub struct ULiteral { |
| 1005 | v: String, |
| 1006 | cut: bool, |
| 1007 | } |
| 1008 | |
| 1009 | impl ULiteral { |
| 1010 | fn is_cut(&self) -> bool { |
| 1011 | self.cut |
| 1012 | } |
| 1013 | } |
| 1014 | |
| 1015 | impl fmt::Debug for ULiteral { |
| 1016 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 1017 | if self.is_cut() { |
| 1018 | write!(f, "Cut({})" , self.v) |
| 1019 | } else { |
| 1020 | write!(f, "Complete({})" , self.v) |
| 1021 | } |
| 1022 | } |
| 1023 | } |
| 1024 | |
| 1025 | impl PartialEq<Literal> for ULiteral { |
| 1026 | fn eq(&self, other: &Literal) -> bool { |
| 1027 | self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() |
| 1028 | } |
| 1029 | } |
| 1030 | |
| 1031 | impl PartialEq<ULiteral> for Literal { |
| 1032 | fn eq(&self, other: &ULiteral) -> bool { |
| 1033 | &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() |
| 1034 | } |
| 1035 | } |
| 1036 | |
| 1037 | #[allow (non_snake_case)] |
| 1038 | fn C(s: &'static str) -> ULiteral { |
| 1039 | ULiteral { v: s.to_owned(), cut: true } |
| 1040 | } |
| 1041 | #[allow (non_snake_case)] |
| 1042 | fn M(s: &'static str) -> ULiteral { |
| 1043 | ULiteral { v: s.to_owned(), cut: false } |
| 1044 | } |
| 1045 | |
| 1046 | fn prefixes(lits: &mut Literals, expr: &Hir) { |
| 1047 | lits.union_prefixes(expr); |
| 1048 | } |
| 1049 | |
| 1050 | fn suffixes(lits: &mut Literals, expr: &Hir) { |
| 1051 | lits.union_suffixes(expr); |
| 1052 | } |
| 1053 | |
| 1054 | macro_rules! assert_lit_eq { |
| 1055 | ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ |
| 1056 | let expected: Vec<ULiteral> = vec![$($expected_lit),*]; |
| 1057 | let lits = $got_lits; |
| 1058 | assert_eq!( |
| 1059 | $which(expected.clone()), |
| 1060 | $which(escape_lits(lits.literals()))); |
| 1061 | assert_eq!( |
| 1062 | !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), |
| 1063 | lits.all_complete()); |
| 1064 | assert_eq!( |
| 1065 | expected.iter().any(|l| !l.is_cut()), |
| 1066 | lits.any_complete()); |
| 1067 | }}; |
| 1068 | } |
| 1069 | |
| 1070 | macro_rules! test_lit { |
| 1071 | ($name:ident, $which:ident, $re:expr) => { |
| 1072 | test_lit!($name, $which, $re,); |
| 1073 | }; |
| 1074 | ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { |
| 1075 | #[test] |
| 1076 | fn $name() { |
| 1077 | let expr = ParserBuilder::new() |
| 1078 | .build() |
| 1079 | .parse($re) |
| 1080 | .unwrap(); |
| 1081 | let lits = Literals::$which(&expr); |
| 1082 | assert_lit_eq!(Unicode, lits, $($lit),*); |
| 1083 | |
| 1084 | let expr = ParserBuilder::new() |
| 1085 | .allow_invalid_utf8(true) |
| 1086 | .unicode(false) |
| 1087 | .build() |
| 1088 | .parse($re) |
| 1089 | .unwrap(); |
| 1090 | let lits = Literals::$which(&expr); |
| 1091 | assert_lit_eq!(Bytes, lits, $($lit),*); |
| 1092 | } |
| 1093 | }; |
| 1094 | } |
| 1095 | |
| 1096 | // ************************************************************************ |
| 1097 | // Tests for prefix literal extraction. |
| 1098 | // ************************************************************************ |
| 1099 | |
| 1100 | // Elementary tests. |
| 1101 | test_lit!(pfx_one_lit1, prefixes, "a" , M("a" )); |
| 1102 | test_lit!(pfx_one_lit2, prefixes, "abc" , M("abc" )); |
| 1103 | test_lit!(pfx_one_lit3, prefixes, "(?u)☃" , M(" \\xe2 \\x98 \\x83" )); |
| 1104 | #[cfg (feature = "unicode-case" )] |
| 1105 | test_lit!(pfx_one_lit4, prefixes, "(?ui)☃" , M(" \\xe2 \\x98 \\x83" )); |
| 1106 | test_lit!(pfx_class1, prefixes, "[1-4]" , M("1" ), M("2" ), M("3" ), M("4" )); |
| 1107 | test_lit!( |
| 1108 | pfx_class2, |
| 1109 | prefixes, |
| 1110 | "(?u)[☃Ⅰ]" , |
| 1111 | M(" \\xe2 \\x85 \\xa0" ), |
| 1112 | M(" \\xe2 \\x98 \\x83" ) |
| 1113 | ); |
| 1114 | #[cfg (feature = "unicode-case" )] |
| 1115 | test_lit!( |
| 1116 | pfx_class3, |
| 1117 | prefixes, |
| 1118 | "(?ui)[☃Ⅰ]" , |
| 1119 | M(" \\xe2 \\x85 \\xa0" ), |
| 1120 | M(" \\xe2 \\x85 \\xb0" ), |
| 1121 | M(" \\xe2 \\x98 \\x83" ) |
| 1122 | ); |
| 1123 | test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a" , M("A" ), M("a" )); |
| 1124 | test_lit!( |
| 1125 | pfx_one_lit_casei2, |
| 1126 | prefixes, |
| 1127 | "(?i-u)abc" , |
| 1128 | M("ABC" ), |
| 1129 | M("aBC" ), |
| 1130 | M("AbC" ), |
| 1131 | M("abC" ), |
| 1132 | M("ABc" ), |
| 1133 | M("aBc" ), |
| 1134 | M("Abc" ), |
| 1135 | M("abc" ) |
| 1136 | ); |
| 1137 | test_lit!(pfx_group1, prefixes, "(a)" , M("a" )); |
| 1138 | test_lit!(pfx_rep_zero_or_one1, prefixes, "a?" ); |
| 1139 | test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?" ); |
| 1140 | test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?" , C("ab" ), M("a" )); |
| 1141 | // FIXME: This should return [M("a"), M("ab")] because of the non-greedy |
| 1142 | // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get |
| 1143 | // a cut literal. |
| 1144 | test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??" , C("ab" ), M("a" )); |
| 1145 | test_lit!(pfx_rep_zero_or_more1, prefixes, "a*" ); |
| 1146 | test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*" ); |
| 1147 | test_lit!(pfx_rep_one_or_more1, prefixes, "a+" , C("a" )); |
| 1148 | test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+" , C("abc" )); |
| 1149 | test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+" , C("a" )); |
| 1150 | test_lit!(pfx_rep_range1, prefixes, "a{0}" ); |
| 1151 | test_lit!(pfx_rep_range2, prefixes, "a{0,}" ); |
| 1152 | test_lit!(pfx_rep_range3, prefixes, "a{0,1}" ); |
| 1153 | test_lit!(pfx_rep_range4, prefixes, "a{1}" , M("a" )); |
| 1154 | test_lit!(pfx_rep_range5, prefixes, "a{2}" , M("aa" )); |
| 1155 | test_lit!(pfx_rep_range6, prefixes, "a{1,2}" , C("a" )); |
| 1156 | test_lit!(pfx_rep_range7, prefixes, "a{2,3}" , C("aa" )); |
| 1157 | |
| 1158 | // Test regexes with concatenations. |
| 1159 | test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)" , M("ab" )); |
| 1160 | test_lit!(pfx_cat2, prefixes, "[ab]z" , M("az" ), M("bz" )); |
| 1161 | test_lit!( |
| 1162 | pfx_cat3, |
| 1163 | prefixes, |
| 1164 | "(?i-u)[ab]z" , |
| 1165 | M("AZ" ), |
| 1166 | M("BZ" ), |
| 1167 | M("aZ" ), |
| 1168 | M("bZ" ), |
| 1169 | M("Az" ), |
| 1170 | M("Bz" ), |
| 1171 | M("az" ), |
| 1172 | M("bz" ) |
| 1173 | ); |
| 1174 | test_lit!( |
| 1175 | pfx_cat4, |
| 1176 | prefixes, |
| 1177 | "[ab][yz]" , |
| 1178 | M("ay" ), |
| 1179 | M("by" ), |
| 1180 | M("az" ), |
| 1181 | M("bz" ) |
| 1182 | ); |
| 1183 | test_lit!(pfx_cat5, prefixes, "a*b" , C("a" ), M("b" )); |
| 1184 | test_lit!(pfx_cat6, prefixes, "a*b*c" , C("a" ), C("b" ), M("c" )); |
| 1185 | test_lit!(pfx_cat7, prefixes, "a*b*c+" , C("a" ), C("b" ), C("c" )); |
| 1186 | test_lit!(pfx_cat8, prefixes, "a*b+c" , C("a" ), C("b" )); |
| 1187 | test_lit!(pfx_cat9, prefixes, "a*b+c*" , C("a" ), C("b" )); |
| 1188 | test_lit!(pfx_cat10, prefixes, "ab*" , C("ab" ), M("a" )); |
| 1189 | test_lit!(pfx_cat11, prefixes, "ab*c" , C("ab" ), M("ac" )); |
| 1190 | test_lit!(pfx_cat12, prefixes, "ab+" , C("ab" )); |
| 1191 | test_lit!(pfx_cat13, prefixes, "ab+c" , C("ab" )); |
| 1192 | test_lit!(pfx_cat14, prefixes, "a^" , C("a" )); |
| 1193 | test_lit!(pfx_cat15, prefixes, "$a" ); |
| 1194 | test_lit!(pfx_cat16, prefixes, r"ab*c" , C("ab" ), M("ac" )); |
| 1195 | test_lit!(pfx_cat17, prefixes, r"ab+c" , C("ab" )); |
| 1196 | test_lit!(pfx_cat18, prefixes, r"z*azb" , C("z" ), M("azb" )); |
| 1197 | test_lit!(pfx_cat19, prefixes, "a.z" , C("a" )); |
| 1198 | |
| 1199 | // Test regexes with alternations. |
| 1200 | test_lit!(pfx_alt1, prefixes, "a|b" , M("a" ), M("b" )); |
| 1201 | test_lit!(pfx_alt2, prefixes, "[1-3]|b" , M("1" ), M("2" ), M("3" ), M("b" )); |
| 1202 | test_lit!(pfx_alt3, prefixes, "y(?:a|b)z" , M("yaz" ), M("ybz" )); |
| 1203 | test_lit!(pfx_alt4, prefixes, "a|b*" ); |
| 1204 | test_lit!(pfx_alt5, prefixes, "a|b+" , M("a" ), C("b" )); |
| 1205 | test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)" ); |
| 1206 | test_lit!( |
| 1207 | pfx_alt7, |
| 1208 | prefixes, |
| 1209 | "(a|b)*c|(a|ab)*c" , |
| 1210 | C("a" ), |
| 1211 | C("b" ), |
| 1212 | M("c" ), |
| 1213 | C("a" ), |
| 1214 | C("ab" ), |
| 1215 | M("c" ) |
| 1216 | ); |
| 1217 | test_lit!(pfx_alt8, prefixes, "a*b|c" , C("a" ), M("b" ), M("c" )); |
| 1218 | |
| 1219 | // Test regexes with empty assertions. |
| 1220 | test_lit!(pfx_empty1, prefixes, "^a" , M("a" )); |
| 1221 | test_lit!(pfx_empty2, prefixes, "a${2}" , C("a" )); |
| 1222 | test_lit!(pfx_empty3, prefixes, "^abc" , M("abc" )); |
| 1223 | test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)" , M("abc" ), M("z" )); |
| 1224 | |
| 1225 | // Make sure some curious regexes have no prefixes. |
| 1226 | test_lit!(pfx_nothing1, prefixes, "." ); |
| 1227 | test_lit!(pfx_nothing2, prefixes, "(?s)." ); |
| 1228 | test_lit!(pfx_nothing3, prefixes, "^" ); |
| 1229 | test_lit!(pfx_nothing4, prefixes, "$" ); |
| 1230 | test_lit!(pfx_nothing6, prefixes, "(?m)$" ); |
| 1231 | test_lit!(pfx_nothing7, prefixes, r"\b" ); |
| 1232 | test_lit!(pfx_nothing8, prefixes, r"\B" ); |
| 1233 | |
| 1234 | // Test a few regexes that defeat any prefix literal detection. |
| 1235 | test_lit!(pfx_defeated1, prefixes, ".a" ); |
| 1236 | test_lit!(pfx_defeated2, prefixes, "(?s).a" ); |
| 1237 | test_lit!(pfx_defeated3, prefixes, "a*b*c*" ); |
| 1238 | test_lit!(pfx_defeated4, prefixes, "a|." ); |
| 1239 | test_lit!(pfx_defeated5, prefixes, ".|a" ); |
| 1240 | test_lit!(pfx_defeated6, prefixes, "a|^" ); |
| 1241 | test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))" ); |
| 1242 | test_lit!(pfx_defeated8, prefixes, "$a" ); |
| 1243 | test_lit!(pfx_defeated9, prefixes, "(?m)$a" ); |
| 1244 | test_lit!(pfx_defeated10, prefixes, r"\ba" ); |
| 1245 | test_lit!(pfx_defeated11, prefixes, r"\Ba" ); |
| 1246 | test_lit!(pfx_defeated12, prefixes, "^*a" ); |
| 1247 | test_lit!(pfx_defeated13, prefixes, "^+a" ); |
| 1248 | |
| 1249 | test_lit!( |
| 1250 | pfx_crazy1, |
| 1251 | prefixes, |
| 1252 | r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" , |
| 1253 | C("Mo \\'" ), |
| 1254 | C("Mu \\'" ), |
| 1255 | C("Moam" ), |
| 1256 | C("Muam" ) |
| 1257 | ); |
| 1258 | |
| 1259 | // ************************************************************************ |
| 1260 | // Tests for quiting prefix literal search. |
| 1261 | // ************************************************************************ |
| 1262 | |
| 1263 | macro_rules! test_exhausted { |
| 1264 | ($name:ident, $which:ident, $re:expr) => { |
| 1265 | test_exhausted!($name, $which, $re,); |
| 1266 | }; |
| 1267 | ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { |
| 1268 | #[test] |
| 1269 | fn $name() { |
| 1270 | let expr = ParserBuilder::new() |
| 1271 | .build() |
| 1272 | .parse($re) |
| 1273 | .unwrap(); |
| 1274 | let mut lits = Literals::empty(); |
| 1275 | lits.set_limit_size(20).set_limit_class(10); |
| 1276 | $which(&mut lits, &expr); |
| 1277 | assert_lit_eq!(Unicode, lits, $($lit),*); |
| 1278 | |
| 1279 | let expr = ParserBuilder::new() |
| 1280 | .allow_invalid_utf8(true) |
| 1281 | .unicode(false) |
| 1282 | .build() |
| 1283 | .parse($re) |
| 1284 | .unwrap(); |
| 1285 | let mut lits = Literals::empty(); |
| 1286 | lits.set_limit_size(20).set_limit_class(10); |
| 1287 | $which(&mut lits, &expr); |
| 1288 | assert_lit_eq!(Bytes, lits, $($lit),*); |
| 1289 | } |
| 1290 | }; |
| 1291 | } |
| 1292 | |
| 1293 | // These test use a much lower limit than the default so that we can |
| 1294 | // write test cases of reasonable size. |
| 1295 | test_exhausted!(pfx_exhausted1, prefixes, "[a-z]" ); |
| 1296 | test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A" ); |
| 1297 | test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z" , C("A" )); |
| 1298 | test_exhausted!( |
| 1299 | pfx_exhausted4, |
| 1300 | prefixes, |
| 1301 | "(?i-u)foobar" , |
| 1302 | C("FO" ), |
| 1303 | C("fO" ), |
| 1304 | C("Fo" ), |
| 1305 | C("fo" ) |
| 1306 | ); |
| 1307 | test_exhausted!( |
| 1308 | pfx_exhausted5, |
| 1309 | prefixes, |
| 1310 | "(?:ab){100}" , |
| 1311 | C("abababababababababab" ) |
| 1312 | ); |
| 1313 | test_exhausted!( |
| 1314 | pfx_exhausted6, |
| 1315 | prefixes, |
| 1316 | "(?:(?:ab){100})*cd" , |
| 1317 | C("ababababab" ), |
| 1318 | M("cd" ) |
| 1319 | ); |
| 1320 | test_exhausted!( |
| 1321 | pfx_exhausted7, |
| 1322 | prefixes, |
| 1323 | "z(?:(?:ab){100})*cd" , |
| 1324 | C("zababababab" ), |
| 1325 | M("zcd" ) |
| 1326 | ); |
| 1327 | test_exhausted!( |
| 1328 | pfx_exhausted8, |
| 1329 | prefixes, |
| 1330 | "aaaaaaaaaaaaaaaaaaaaz" , |
| 1331 | C("aaaaaaaaaaaaaaaaaaaa" ) |
| 1332 | ); |
| 1333 | |
| 1334 | // ************************************************************************ |
| 1335 | // Tests for suffix literal extraction. |
| 1336 | // ************************************************************************ |
| 1337 | |
| 1338 | // Elementary tests. |
| 1339 | test_lit!(sfx_one_lit1, suffixes, "a" , M("a" )); |
| 1340 | test_lit!(sfx_one_lit2, suffixes, "abc" , M("abc" )); |
| 1341 | test_lit!(sfx_one_lit3, suffixes, "(?u)☃" , M(" \\xe2 \\x98 \\x83" )); |
| 1342 | #[cfg (feature = "unicode-case" )] |
| 1343 | test_lit!(sfx_one_lit4, suffixes, "(?ui)☃" , M(" \\xe2 \\x98 \\x83" )); |
| 1344 | test_lit!(sfx_class1, suffixes, "[1-4]" , M("1" ), M("2" ), M("3" ), M("4" )); |
| 1345 | test_lit!( |
| 1346 | sfx_class2, |
| 1347 | suffixes, |
| 1348 | "(?u)[☃Ⅰ]" , |
| 1349 | M(" \\xe2 \\x85 \\xa0" ), |
| 1350 | M(" \\xe2 \\x98 \\x83" ) |
| 1351 | ); |
| 1352 | #[cfg (feature = "unicode-case" )] |
| 1353 | test_lit!( |
| 1354 | sfx_class3, |
| 1355 | suffixes, |
| 1356 | "(?ui)[☃Ⅰ]" , |
| 1357 | M(" \\xe2 \\x85 \\xa0" ), |
| 1358 | M(" \\xe2 \\x85 \\xb0" ), |
| 1359 | M(" \\xe2 \\x98 \\x83" ) |
| 1360 | ); |
| 1361 | test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a" , M("A" ), M("a" )); |
| 1362 | test_lit!( |
| 1363 | sfx_one_lit_casei2, |
| 1364 | suffixes, |
| 1365 | "(?i-u)abc" , |
| 1366 | M("ABC" ), |
| 1367 | M("ABc" ), |
| 1368 | M("AbC" ), |
| 1369 | M("Abc" ), |
| 1370 | M("aBC" ), |
| 1371 | M("aBc" ), |
| 1372 | M("abC" ), |
| 1373 | M("abc" ) |
| 1374 | ); |
| 1375 | test_lit!(sfx_group1, suffixes, "(a)" , M("a" )); |
| 1376 | test_lit!(sfx_rep_zero_or_one1, suffixes, "a?" ); |
| 1377 | test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?" ); |
| 1378 | test_lit!(sfx_rep_zero_or_more1, suffixes, "a*" ); |
| 1379 | test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*" ); |
| 1380 | test_lit!(sfx_rep_one_or_more1, suffixes, "a+" , C("a" )); |
| 1381 | test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+" , C("abc" )); |
| 1382 | test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+" , C("a" )); |
| 1383 | test_lit!(sfx_rep_range1, suffixes, "a{0}" ); |
| 1384 | test_lit!(sfx_rep_range2, suffixes, "a{0,}" ); |
| 1385 | test_lit!(sfx_rep_range3, suffixes, "a{0,1}" ); |
| 1386 | test_lit!(sfx_rep_range4, suffixes, "a{1}" , M("a" )); |
| 1387 | test_lit!(sfx_rep_range5, suffixes, "a{2}" , M("aa" )); |
| 1388 | test_lit!(sfx_rep_range6, suffixes, "a{1,2}" , C("a" )); |
| 1389 | test_lit!(sfx_rep_range7, suffixes, "a{2,3}" , C("aa" )); |
| 1390 | |
| 1391 | // Test regexes with concatenations. |
| 1392 | test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)" , M("ab" )); |
| 1393 | test_lit!(sfx_cat2, suffixes, "[ab]z" , M("az" ), M("bz" )); |
| 1394 | test_lit!( |
| 1395 | sfx_cat3, |
| 1396 | suffixes, |
| 1397 | "(?i-u)[ab]z" , |
| 1398 | M("AZ" ), |
| 1399 | M("Az" ), |
| 1400 | M("BZ" ), |
| 1401 | M("Bz" ), |
| 1402 | M("aZ" ), |
| 1403 | M("az" ), |
| 1404 | M("bZ" ), |
| 1405 | M("bz" ) |
| 1406 | ); |
| 1407 | test_lit!( |
| 1408 | sfx_cat4, |
| 1409 | suffixes, |
| 1410 | "[ab][yz]" , |
| 1411 | M("ay" ), |
| 1412 | M("az" ), |
| 1413 | M("by" ), |
| 1414 | M("bz" ) |
| 1415 | ); |
| 1416 | test_lit!(sfx_cat5, suffixes, "a*b" , C("ab" ), M("b" )); |
| 1417 | test_lit!(sfx_cat6, suffixes, "a*b*c" , C("bc" ), C("ac" ), M("c" )); |
| 1418 | test_lit!(sfx_cat7, suffixes, "a*b*c+" , C("c" )); |
| 1419 | test_lit!(sfx_cat8, suffixes, "a*b+c" , C("bc" )); |
| 1420 | test_lit!(sfx_cat9, suffixes, "a*b+c*" , C("c" ), C("b" )); |
| 1421 | test_lit!(sfx_cat10, suffixes, "ab*" , C("b" ), M("a" )); |
| 1422 | test_lit!(sfx_cat11, suffixes, "ab*c" , C("bc" ), M("ac" )); |
| 1423 | test_lit!(sfx_cat12, suffixes, "ab+" , C("b" )); |
| 1424 | test_lit!(sfx_cat13, suffixes, "ab+c" , C("bc" )); |
| 1425 | test_lit!(sfx_cat14, suffixes, "a^" ); |
| 1426 | test_lit!(sfx_cat15, suffixes, "$a" , C("a" )); |
| 1427 | test_lit!(sfx_cat16, suffixes, r"ab*c" , C("bc" ), M("ac" )); |
| 1428 | test_lit!(sfx_cat17, suffixes, r"ab+c" , C("bc" )); |
| 1429 | test_lit!(sfx_cat18, suffixes, r"z*azb" , C("zazb" ), M("azb" )); |
| 1430 | test_lit!(sfx_cat19, suffixes, "a.z" , C("z" )); |
| 1431 | |
| 1432 | // Test regexes with alternations. |
| 1433 | test_lit!(sfx_alt1, suffixes, "a|b" , M("a" ), M("b" )); |
| 1434 | test_lit!(sfx_alt2, suffixes, "[1-3]|b" , M("1" ), M("2" ), M("3" ), M("b" )); |
| 1435 | test_lit!(sfx_alt3, suffixes, "y(?:a|b)z" , M("yaz" ), M("ybz" )); |
| 1436 | test_lit!(sfx_alt4, suffixes, "a|b*" ); |
| 1437 | test_lit!(sfx_alt5, suffixes, "a|b+" , M("a" ), C("b" )); |
| 1438 | test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)" ); |
| 1439 | test_lit!( |
| 1440 | sfx_alt7, |
| 1441 | suffixes, |
| 1442 | "(a|b)*c|(a|ab)*c" , |
| 1443 | C("ac" ), |
| 1444 | C("bc" ), |
| 1445 | M("c" ), |
| 1446 | C("ac" ), |
| 1447 | C("abc" ), |
| 1448 | M("c" ) |
| 1449 | ); |
| 1450 | test_lit!(sfx_alt8, suffixes, "a*b|c" , C("ab" ), M("b" ), M("c" )); |
| 1451 | |
| 1452 | // Test regexes with empty assertions. |
| 1453 | test_lit!(sfx_empty1, suffixes, "a$" , M("a" )); |
| 1454 | test_lit!(sfx_empty2, suffixes, "${2}a" , C("a" )); |
| 1455 | |
| 1456 | // Make sure some curious regexes have no suffixes. |
| 1457 | test_lit!(sfx_nothing1, suffixes, "." ); |
| 1458 | test_lit!(sfx_nothing2, suffixes, "(?s)." ); |
| 1459 | test_lit!(sfx_nothing3, suffixes, "^" ); |
| 1460 | test_lit!(sfx_nothing4, suffixes, "$" ); |
| 1461 | test_lit!(sfx_nothing6, suffixes, "(?m)$" ); |
| 1462 | test_lit!(sfx_nothing7, suffixes, r"\b" ); |
| 1463 | test_lit!(sfx_nothing8, suffixes, r"\B" ); |
| 1464 | |
| 1465 | // Test a few regexes that defeat any suffix literal detection. |
| 1466 | test_lit!(sfx_defeated1, suffixes, "a." ); |
| 1467 | test_lit!(sfx_defeated2, suffixes, "(?s)a." ); |
| 1468 | test_lit!(sfx_defeated3, suffixes, "a*b*c*" ); |
| 1469 | test_lit!(sfx_defeated4, suffixes, "a|." ); |
| 1470 | test_lit!(sfx_defeated5, suffixes, ".|a" ); |
| 1471 | test_lit!(sfx_defeated6, suffixes, "a|^" ); |
| 1472 | test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))." ); |
| 1473 | test_lit!(sfx_defeated8, suffixes, "a^" ); |
| 1474 | test_lit!(sfx_defeated9, suffixes, "(?m)a$" ); |
| 1475 | test_lit!(sfx_defeated10, suffixes, r"a\b" ); |
| 1476 | test_lit!(sfx_defeated11, suffixes, r"a\B" ); |
| 1477 | test_lit!(sfx_defeated12, suffixes, "a^*" ); |
| 1478 | test_lit!(sfx_defeated13, suffixes, "a^+" ); |
| 1479 | |
| 1480 | // These test use a much lower limit than the default so that we can |
| 1481 | // write test cases of reasonable size. |
| 1482 | test_exhausted!(sfx_exhausted1, suffixes, "[a-z]" ); |
| 1483 | test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*" ); |
| 1484 | test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z" , C("Z" )); |
| 1485 | test_exhausted!( |
| 1486 | sfx_exhausted4, |
| 1487 | suffixes, |
| 1488 | "(?i-u)foobar" , |
| 1489 | C("AR" ), |
| 1490 | C("Ar" ), |
| 1491 | C("aR" ), |
| 1492 | C("ar" ) |
| 1493 | ); |
| 1494 | test_exhausted!( |
| 1495 | sfx_exhausted5, |
| 1496 | suffixes, |
| 1497 | "(?:ab){100}" , |
| 1498 | C("abababababababababab" ) |
| 1499 | ); |
| 1500 | test_exhausted!( |
| 1501 | sfx_exhausted6, |
| 1502 | suffixes, |
| 1503 | "cd(?:(?:ab){100})*" , |
| 1504 | C("ababababab" ), |
| 1505 | M("cd" ) |
| 1506 | ); |
| 1507 | test_exhausted!( |
| 1508 | sfx_exhausted7, |
| 1509 | suffixes, |
| 1510 | "cd(?:(?:ab){100})*z" , |
| 1511 | C("abababababz" ), |
| 1512 | M("cdz" ) |
| 1513 | ); |
| 1514 | test_exhausted!( |
| 1515 | sfx_exhausted8, |
| 1516 | suffixes, |
| 1517 | "zaaaaaaaaaaaaaaaaaaaa" , |
| 1518 | C("aaaaaaaaaaaaaaaaaaaa" ) |
| 1519 | ); |
| 1520 | |
| 1521 | // ************************************************************************ |
| 1522 | // Tests for generating unambiguous literal sets. |
| 1523 | // ************************************************************************ |
| 1524 | |
| 1525 | macro_rules! test_unamb { |
| 1526 | ($name:ident, $given:expr, $expected:expr) => { |
| 1527 | #[test] |
| 1528 | fn $name() { |
| 1529 | let given: Vec<Literal> = $given |
| 1530 | .into_iter() |
| 1531 | .map(|ul| { |
| 1532 | let cut = ul.is_cut(); |
| 1533 | Literal { v: ul.v.into_bytes(), cut: cut } |
| 1534 | }) |
| 1535 | .collect(); |
| 1536 | let lits = create_lits(given); |
| 1537 | let got = lits.unambiguous_prefixes(); |
| 1538 | assert_eq!($expected, escape_lits(got.literals())); |
| 1539 | } |
| 1540 | }; |
| 1541 | } |
| 1542 | |
| 1543 | test_unamb!(unambiguous1, vec![M("z" ), M("azb" )], vec![C("a" ), C("z" )]); |
| 1544 | test_unamb!( |
| 1545 | unambiguous2, |
| 1546 | vec![M("zaaaaaa" ), M("aa" )], |
| 1547 | vec![C("aa" ), C("z" )] |
| 1548 | ); |
| 1549 | test_unamb!( |
| 1550 | unambiguous3, |
| 1551 | vec![M("Sherlock" ), M("Watson" )], |
| 1552 | vec![M("Sherlock" ), M("Watson" )] |
| 1553 | ); |
| 1554 | test_unamb!(unambiguous4, vec![M("abc" ), M("bc" )], vec![C("a" ), C("bc" )]); |
| 1555 | test_unamb!(unambiguous5, vec![M("bc" ), M("abc" )], vec![C("a" ), C("bc" )]); |
| 1556 | test_unamb!(unambiguous6, vec![M("a" ), M("aa" )], vec![C("a" )]); |
| 1557 | test_unamb!(unambiguous7, vec![M("aa" ), M("a" )], vec![C("a" )]); |
| 1558 | test_unamb!(unambiguous8, vec![M("ab" ), M("a" )], vec![C("a" )]); |
| 1559 | test_unamb!( |
| 1560 | unambiguous9, |
| 1561 | vec![M("ac" ), M("bc" ), M("c" ), M("ac" ), M("abc" ), M("c" )], |
| 1562 | vec![C("a" ), C("b" ), C("c" )] |
| 1563 | ); |
| 1564 | test_unamb!( |
| 1565 | unambiguous10, |
| 1566 | vec![M("Mo'" ), M("Mu'" ), M("Mo" ), M("Mu" )], |
| 1567 | vec![C("Mo" ), C("Mu" )] |
| 1568 | ); |
| 1569 | test_unamb!( |
| 1570 | unambiguous11, |
| 1571 | vec![M("zazb" ), M("azb" )], |
| 1572 | vec![C("a" ), C("z" )] |
| 1573 | ); |
| 1574 | test_unamb!(unambiguous12, vec![M("foo" ), C("foo" )], vec![C("foo" )]); |
| 1575 | test_unamb!( |
| 1576 | unambiguous13, |
| 1577 | vec![M("ABCX" ), M("CDAX" ), M("BCX" )], |
| 1578 | vec![C("A" ), C("BCX" ), C("CD" )] |
| 1579 | ); |
| 1580 | test_unamb!( |
| 1581 | unambiguous14, |
| 1582 | vec![M("IMGX" ), M("MVIX" ), M("MGX" ), M("DSX" )], |
| 1583 | vec![M("DSX" ), C("I" ), C("MGX" ), C("MV" )] |
| 1584 | ); |
| 1585 | test_unamb!( |
| 1586 | unambiguous15, |
| 1587 | vec![M("IMG_" ), M("MG_" ), M("CIMG" )], |
| 1588 | vec![C("C" ), C("I" ), C("MG_" )] |
| 1589 | ); |
| 1590 | |
| 1591 | // ************************************************************************ |
| 1592 | // Tests for suffix trimming. |
| 1593 | // ************************************************************************ |
| 1594 | macro_rules! test_trim { |
| 1595 | ($name:ident, $trim:expr, $given:expr, $expected:expr) => { |
| 1596 | #[test] |
| 1597 | fn $name() { |
| 1598 | let given: Vec<Literal> = $given |
| 1599 | .into_iter() |
| 1600 | .map(|ul| { |
| 1601 | let cut = ul.is_cut(); |
| 1602 | Literal { v: ul.v.into_bytes(), cut: cut } |
| 1603 | }) |
| 1604 | .collect(); |
| 1605 | let lits = create_lits(given); |
| 1606 | let got = lits.trim_suffix($trim).unwrap(); |
| 1607 | assert_eq!($expected, escape_lits(got.literals())); |
| 1608 | } |
| 1609 | }; |
| 1610 | } |
| 1611 | |
| 1612 | test_trim!(trim1, 1, vec![M("ab" ), M("yz" )], vec![C("a" ), C("y" )]); |
| 1613 | test_trim!(trim2, 1, vec![M("abc" ), M("abd" )], vec![C("ab" )]); |
| 1614 | test_trim!(trim3, 2, vec![M("abc" ), M("abd" )], vec![C("a" )]); |
| 1615 | test_trim!(trim4, 2, vec![M("abc" ), M("ghij" )], vec![C("a" ), C("gh" )]); |
| 1616 | |
| 1617 | // ************************************************************************ |
| 1618 | // Tests for longest common prefix. |
| 1619 | // ************************************************************************ |
| 1620 | |
| 1621 | macro_rules! test_lcp { |
| 1622 | ($name:ident, $given:expr, $expected:expr) => { |
| 1623 | #[test] |
| 1624 | fn $name() { |
| 1625 | let given: Vec<Literal> = $given |
| 1626 | .into_iter() |
| 1627 | .map(|s: &str| Literal { |
| 1628 | v: s.to_owned().into_bytes(), |
| 1629 | cut: false, |
| 1630 | }) |
| 1631 | .collect(); |
| 1632 | let lits = create_lits(given); |
| 1633 | let got = lits.longest_common_prefix(); |
| 1634 | assert_eq!($expected, escape_bytes(got)); |
| 1635 | } |
| 1636 | }; |
| 1637 | } |
| 1638 | |
| 1639 | test_lcp!(lcp1, vec!["a" ], "a" ); |
| 1640 | test_lcp!(lcp2, vec![], "" ); |
| 1641 | test_lcp!(lcp3, vec!["a" , "b" ], "" ); |
| 1642 | test_lcp!(lcp4, vec!["ab" , "ab" ], "ab" ); |
| 1643 | test_lcp!(lcp5, vec!["ab" , "a" ], "a" ); |
| 1644 | test_lcp!(lcp6, vec!["a" , "ab" ], "a" ); |
| 1645 | test_lcp!(lcp7, vec!["ab" , "b" ], "" ); |
| 1646 | test_lcp!(lcp8, vec!["b" , "ab" ], "" ); |
| 1647 | test_lcp!(lcp9, vec!["foobar" , "foobaz" ], "fooba" ); |
| 1648 | test_lcp!(lcp10, vec!["foobar" , "foobaz" , "a" ], "" ); |
| 1649 | test_lcp!(lcp11, vec!["a" , "foobar" , "foobaz" ], "" ); |
| 1650 | test_lcp!(lcp12, vec!["foo" , "flub" , "flab" , "floo" ], "f" ); |
| 1651 | |
| 1652 | // ************************************************************************ |
| 1653 | // Tests for longest common suffix. |
| 1654 | // ************************************************************************ |
| 1655 | |
| 1656 | macro_rules! test_lcs { |
| 1657 | ($name:ident, $given:expr, $expected:expr) => { |
| 1658 | #[test] |
| 1659 | fn $name() { |
| 1660 | let given: Vec<Literal> = $given |
| 1661 | .into_iter() |
| 1662 | .map(|s: &str| Literal { |
| 1663 | v: s.to_owned().into_bytes(), |
| 1664 | cut: false, |
| 1665 | }) |
| 1666 | .collect(); |
| 1667 | let lits = create_lits(given); |
| 1668 | let got = lits.longest_common_suffix(); |
| 1669 | assert_eq!($expected, escape_bytes(got)); |
| 1670 | } |
| 1671 | }; |
| 1672 | } |
| 1673 | |
| 1674 | test_lcs!(lcs1, vec!["a" ], "a" ); |
| 1675 | test_lcs!(lcs2, vec![], "" ); |
| 1676 | test_lcs!(lcs3, vec!["a" , "b" ], "" ); |
| 1677 | test_lcs!(lcs4, vec!["ab" , "ab" ], "ab" ); |
| 1678 | test_lcs!(lcs5, vec!["ab" , "a" ], "" ); |
| 1679 | test_lcs!(lcs6, vec!["a" , "ab" ], "" ); |
| 1680 | test_lcs!(lcs7, vec!["ab" , "b" ], "b" ); |
| 1681 | test_lcs!(lcs8, vec!["b" , "ab" ], "b" ); |
| 1682 | test_lcs!(lcs9, vec!["barfoo" , "bazfoo" ], "foo" ); |
| 1683 | test_lcs!(lcs10, vec!["barfoo" , "bazfoo" , "a" ], "" ); |
| 1684 | test_lcs!(lcs11, vec!["a" , "barfoo" , "bazfoo" ], "" ); |
| 1685 | test_lcs!(lcs12, vec!["flub" , "bub" , "boob" , "dub" ], "b" ); |
| 1686 | } |
| 1687 | |