1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp.h"
6
7#include <memory>
8
9#include "platform/splay-tree-inl.h"
10#include "platform/unicode.h"
11
12#include "unicode/uniset.h"
13
14#include "vm/dart_entry.h"
15#include "vm/regexp_assembler.h"
16#include "vm/regexp_assembler_bytecode.h"
17#include "vm/regexp_ast.h"
18#include "vm/symbols.h"
19#include "vm/thread.h"
20#include "vm/unibrow-inl.h"
21
22#if !defined(DART_PRECOMPILED_RUNTIME)
23#include "vm/regexp_assembler_ir.h"
24#endif // !defined(DART_PRECOMPILED_RUNTIME)
25
26#define Z (zone())
27
28namespace dart {
29
30// Default to generating optimized regexp code.
31static constexpr bool kRegexpOptimization = true;
32
33// More makes code generation slower, less makes V8 benchmark score lower.
34static constexpr intptr_t kMaxLookaheadForBoyerMoore = 8;
35
36ContainedInLattice AddRange(ContainedInLattice containment,
37 const int32_t* ranges,
38 intptr_t ranges_length,
39 Interval new_range) {
40 ASSERT((ranges_length & 1) == 1);
41 ASSERT(ranges[ranges_length - 1] == Utf::kMaxCodePoint + 1);
42 if (containment == kLatticeUnknown) return containment;
43 bool inside = false;
44 int32_t last = 0;
45 for (intptr_t i = 0; i < ranges_length;
46 inside = !inside, last = ranges[i], i++) {
47 // Consider the range from last to ranges[i].
48 // We haven't got to the new range yet.
49 if (ranges[i] <= new_range.from()) continue;
50 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
51 // inclusive, but the values in ranges are not.
52 if (last <= new_range.from() && new_range.to() < ranges[i]) {
53 return Combine(a: containment, b: inside ? kLatticeIn : kLatticeOut);
54 }
55 return kLatticeUnknown;
56 }
57 return containment;
58}
59
60// -------------------------------------------------------------------
61// Implementation of the Irregexp regular expression engine.
62//
63// The Irregexp regular expression engine is intended to be a complete
64// implementation of ECMAScript regular expressions. It generates
65// IR code that is subsequently compiled to native code.
66
67// The Irregexp regexp engine is structured in three steps.
68// 1) The parser generates an abstract syntax tree. See regexp_ast.cc.
69// 2) From the AST a node network is created. The nodes are all
70// subclasses of RegExpNode. The nodes represent states when
71// executing a regular expression. Several optimizations are
72// performed on the node network.
73// 3) From the nodes we generate IR instructions that can actually
74// execute the regular expression (perform the search). The
75// code generation step is described in more detail below.
76
77// Code generation.
78//
79// The nodes are divided into four main categories.
80// * Choice nodes
81// These represent places where the regular expression can
82// match in more than one way. For example on entry to an
83// alternation (foo|bar) or a repetition (*, +, ? or {}).
84// * Action nodes
85// These represent places where some action should be
86// performed. Examples include recording the current position
87// in the input string to a register (in order to implement
88// captures) or other actions on register for example in order
89// to implement the counters needed for {} repetitions.
90// * Matching nodes
91// These attempt to match some element part of the input string.
92// Examples of elements include character classes, plain strings
93// or back references.
94// * End nodes
95// These are used to implement the actions required on finding
96// a successful match or failing to find a match.
97//
98// The code generated maintains some state as it runs. This consists of the
99// following elements:
100//
101// * The capture registers. Used for string captures.
102// * Other registers. Used for counters etc.
103// * The current position.
104// * The stack of backtracking information. Used when a matching node
105// fails to find a match and needs to try an alternative.
106//
107// Conceptual regular expression execution model:
108//
109// There is a simple conceptual model of regular expression execution
110// which will be presented first. The actual code generated is a more
111// efficient simulation of the simple conceptual model:
112//
113// * Choice nodes are implemented as follows:
114// For each choice except the last {
115// push current position
116// push backtrack code location
117// <generate code to test for choice>
118// backtrack code location:
119// pop current position
120// }
121// <generate code to test for last choice>
122//
123// * Actions nodes are generated as follows
124// <push affected registers on backtrack stack>
125// <generate code to perform action>
126// push backtrack code location
127// <generate code to test for following nodes>
128// backtrack code location:
129// <pop affected registers to restore their state>
130// <pop backtrack location from stack and go to it>
131//
132// * Matching nodes are generated as follows:
133// if input string matches at current position
134// update current position
135// <generate code to test for following nodes>
136// else
137// <pop backtrack location from stack and go to it>
138//
139// Thus it can be seen that the current position is saved and restored
140// by the choice nodes, whereas the registers are saved and restored by
141// by the action nodes that manipulate them.
142//
143// The other interesting aspect of this model is that nodes are generated
144// at the point where they are needed by a recursive call to Emit(). If
145// the node has already been code generated then the Emit() call will
146// generate a jump to the previously generated code instead. In order to
147// limit recursion it is possible for the Emit() function to put the node
148// on a work list for later generation and instead generate a jump. The
149// destination of the jump is resolved later when the code is generated.
150//
151// Actual regular expression code generation.
152//
153// Code generation is actually more complicated than the above. In order
154// to improve the efficiency of the generated code some optimizations are
155// performed
156//
157// * Choice nodes have 1-character lookahead.
158// A choice node looks at the following character and eliminates some of
159// the choices immediately based on that character. This is not yet
160// implemented.
161// * Simple greedy loops store reduced backtracking information.
162// A quantifier like /.*foo/m will greedily match the whole input. It will
163// then need to backtrack to a point where it can match "foo". The naive
164// implementation of this would push each character position onto the
165// backtracking stack, then pop them off one by one. This would use space
166// proportional to the length of the input string. However since the "."
167// can only match in one way and always has a constant length (in this case
168// of 1) it suffices to store the current position on the top of the stack
169// once. Matching now becomes merely incrementing the current position and
170// backtracking becomes decrementing the current position and checking the
171// result against the stored current position. This is faster and saves
172// space.
173// * The current state is virtualized.
174// This is used to defer expensive operations until it is clear that they
175// are needed and to generate code for a node more than once, allowing
176// specialized an efficient versions of the code to be created. This is
177// explained in the section below.
178//
179// Execution state virtualization.
180//
181// Instead of emitting code, nodes that manipulate the state can record their
182// manipulation in an object called the Trace. The Trace object can record a
183// current position offset, an optional backtrack code location on the top of
184// the virtualized backtrack stack and some register changes. When a node is
185// to be emitted it can flush the Trace or update it. Flushing the Trace
186// will emit code to bring the actual state into line with the virtual state.
187// Avoiding flushing the state can postpone some work (e.g. updates of capture
188// registers). Postponing work can save time when executing the regular
189// expression since it may be found that the work never has to be done as a
190// failure to match can occur. In addition it is much faster to jump to a
191// known backtrack code location than it is to pop an unknown backtrack
192// location from the stack and jump there.
193//
194// The virtual state found in the Trace affects code generation. For example
195// the virtual state contains the difference between the actual current
196// position and the virtual current position, and matching code needs to use
197// this offset to attempt a match in the correct location of the input
198// string. Therefore code generated for a non-trivial trace is specialized
199// to that trace. The code generator therefore has the ability to generate
200// code for each node several times. In order to limit the size of the
201// generated code there is an arbitrary limit on how many specialized sets of
202// code may be generated for a given node. If the limit is reached, the
203// trace is flushed and a generic version of the code for a node is emitted.
204// This is subsequently used for that node. The code emitted for non-generic
205// trace is not recorded in the node and so it cannot currently be reused in
206// the event that code generation is requested for an identical trace.
207
208void RegExpTree::AppendToText(RegExpText* text) {
209 UNREACHABLE();
210}
211
212void RegExpAtom::AppendToText(RegExpText* text) {
213 text->AddElement(elm: TextElement::Atom(atom: this));
214}
215
216void RegExpCharacterClass::AppendToText(RegExpText* text) {
217 text->AddElement(elm: TextElement::CharClass(char_class: this));
218}
219
220void RegExpText::AppendToText(RegExpText* text) {
221 for (intptr_t i = 0; i < elements()->length(); i++)
222 text->AddElement(elm: (*elements())[i]);
223}
224
225TextElement TextElement::Atom(RegExpAtom* atom) {
226 return TextElement(ATOM, atom);
227}
228
229TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
230 return TextElement(CHAR_CLASS, char_class);
231}
232
233intptr_t TextElement::length() const {
234 switch (text_type()) {
235 case ATOM:
236 return atom()->length();
237
238 case CHAR_CLASS:
239 return 1;
240 }
241 UNREACHABLE();
242 return 0;
243}
244
245class FrequencyCollator : public ValueObject {
246 public:
247 FrequencyCollator() : total_samples_(0) {
248 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
249 frequencies_[i] = CharacterFrequency(i);
250 }
251 }
252
253 void CountCharacter(intptr_t character) {
254 intptr_t index = (character & RegExpMacroAssembler::kTableMask);
255 frequencies_[index].Increment();
256 total_samples_++;
257 }
258
259 // Does not measure in percent, but rather per-128 (the table size from the
260 // regexp macro assembler).
261 intptr_t Frequency(intptr_t in_character) {
262 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
263 if (total_samples_ < 1) return 1; // Division by zero.
264 intptr_t freq_in_per128 =
265 (frequencies_[in_character].counter() * 128) / total_samples_;
266 return freq_in_per128;
267 }
268
269 private:
270 class CharacterFrequency {
271 public:
272 CharacterFrequency() : counter_(0), character_(-1) {}
273 explicit CharacterFrequency(intptr_t character)
274 : counter_(0), character_(character) {}
275
276 void Increment() { counter_++; }
277 intptr_t counter() { return counter_; }
278 intptr_t character() { return character_; }
279
280 private:
281 intptr_t counter_;
282 intptr_t character_;
283
284 DISALLOW_ALLOCATION();
285 };
286
287 private:
288 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
289 intptr_t total_samples_;
290};
291
292class RegExpCompiler : public ValueObject {
293 public:
294 RegExpCompiler(intptr_t capture_count, bool is_one_byte);
295
296 intptr_t AllocateRegister() { return next_register_++; }
297
298 // Lookarounds to match lone surrogates for unicode character class matches
299 // are never nested. We can therefore reuse registers.
300 intptr_t UnicodeLookaroundStackRegister() {
301 if (unicode_lookaround_stack_register_ == kNoRegister) {
302 unicode_lookaround_stack_register_ = AllocateRegister();
303 }
304 return unicode_lookaround_stack_register_;
305 }
306
307 intptr_t UnicodeLookaroundPositionRegister() {
308 if (unicode_lookaround_position_register_ == kNoRegister) {
309 unicode_lookaround_position_register_ = AllocateRegister();
310 }
311 return unicode_lookaround_position_register_;
312 }
313
314#if !defined(DART_PRECOMPILED_RUNTIME)
315 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,
316 RegExpNode* start,
317 intptr_t capture_count,
318 const String& pattern);
319#endif
320
321 RegExpEngine::CompilationResult Assemble(
322 BytecodeRegExpMacroAssembler* assembler,
323 RegExpNode* start,
324 intptr_t capture_count,
325 const String& pattern);
326
327 inline void AddWork(RegExpNode* node) { work_list_->Add(value: node); }
328
329 static constexpr intptr_t kImplementationOffset = 0;
330 static constexpr intptr_t kNumberOfRegistersOffset = 0;
331 static constexpr intptr_t kCodeOffset = 1;
332
333 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
334 EndNode* accept() { return accept_; }
335
336 static constexpr intptr_t kMaxRecursion = 100;
337 inline intptr_t recursion_depth() { return recursion_depth_; }
338 inline void IncrementRecursionDepth() { recursion_depth_++; }
339 inline void DecrementRecursionDepth() { recursion_depth_--; }
340
341 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
342
343 inline bool one_byte() const { return is_one_byte_; }
344 bool read_backward() { return read_backward_; }
345 void set_read_backward(bool value) { read_backward_ = value; }
346 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
347
348 intptr_t current_expansion_factor() { return current_expansion_factor_; }
349 void set_current_expansion_factor(intptr_t value) {
350 current_expansion_factor_ = value;
351 }
352
353 Zone* zone() const { return zone_; }
354
355 static constexpr intptr_t kNoRegister = -1;
356
357 private:
358 EndNode* accept_;
359 intptr_t next_register_;
360 intptr_t unicode_lookaround_stack_register_;
361 intptr_t unicode_lookaround_position_register_;
362 ZoneGrowableArray<RegExpNode*>* work_list_;
363 intptr_t recursion_depth_;
364 RegExpMacroAssembler* macro_assembler_;
365 bool is_one_byte_;
366 bool reg_exp_too_big_;
367 bool read_backward_;
368 intptr_t current_expansion_factor_;
369 FrequencyCollator frequency_collator_;
370 Zone* zone_;
371};
372
373class RecursionCheck : public ValueObject {
374 public:
375 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
376 compiler->IncrementRecursionDepth();
377 }
378 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
379
380 private:
381 RegExpCompiler* compiler_;
382};
383
384static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
385 return RegExpEngine::CompilationResult("RegExp too big");
386}
387
388// Attempts to compile the regexp using an Irregexp code generator. Returns
389// a fixed array or a null handle depending on whether it succeeded.
390RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte)
391 : next_register_(2 * (capture_count + 1)),
392 unicode_lookaround_stack_register_(kNoRegister),
393 unicode_lookaround_position_register_(kNoRegister),
394 work_list_(nullptr),
395 recursion_depth_(0),
396 is_one_byte_(is_one_byte),
397 reg_exp_too_big_(false),
398 read_backward_(false),
399 current_expansion_factor_(1),
400 zone_(Thread::Current()->zone()) {
401 accept_ = new (Z) EndNode(EndNode::ACCEPT, Z);
402}
403
404#if !defined(DART_PRECOMPILED_RUNTIME)
405RegExpEngine::CompilationResult RegExpCompiler::Assemble(
406 IRRegExpMacroAssembler* macro_assembler,
407 RegExpNode* start,
408 intptr_t capture_count,
409 const String& pattern) {
410 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
411 macro_assembler_ = macro_assembler;
412
413 ZoneGrowableArray<RegExpNode*> work_list(0);
414 work_list_ = &work_list;
415 BlockLabel fail;
416 macro_assembler_->PushBacktrack(label: &fail);
417 Trace new_trace;
418 start->Emit(compiler: this, trace: &new_trace);
419 macro_assembler_->BindBlock(label: &fail);
420 macro_assembler_->Fail();
421 while (!work_list.is_empty()) {
422 work_list.RemoveLast()->Emit(compiler: this, trace: &new_trace);
423 }
424 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
425
426 macro_assembler->GenerateBacktrackBlock();
427 macro_assembler->FinalizeRegistersArray();
428
429 return RegExpEngine::CompilationResult(
430 macro_assembler->backtrack_goto(), macro_assembler->graph_entry(),
431 macro_assembler->num_blocks(), macro_assembler->num_stack_locals(),
432 next_register_);
433}
434#endif
435
436RegExpEngine::CompilationResult RegExpCompiler::Assemble(
437 BytecodeRegExpMacroAssembler* macro_assembler,
438 RegExpNode* start,
439 intptr_t capture_count,
440 const String& pattern) {
441 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
442 macro_assembler_ = macro_assembler;
443
444 ZoneGrowableArray<RegExpNode*> work_list(0);
445 work_list_ = &work_list;
446 BlockLabel fail;
447 macro_assembler_->PushBacktrack(label: &fail);
448 Trace new_trace;
449 start->Emit(compiler: this, trace: &new_trace);
450 macro_assembler_->BindBlock(label: &fail);
451 macro_assembler_->Fail();
452 while (!work_list.is_empty()) {
453 work_list.RemoveLast()->Emit(compiler: this, trace: &new_trace);
454 }
455 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
456
457 TypedData& bytecode = TypedData::ZoneHandle(ptr: macro_assembler->GetBytecode());
458 return RegExpEngine::CompilationResult(&bytecode, next_register_);
459}
460
461bool Trace::DeferredAction::Mentions(intptr_t that) {
462 if (action_type() == ActionNode::CLEAR_CAPTURES) {
463 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
464 return range.Contains(value: that);
465 } else {
466 return reg() == that;
467 }
468}
469
470bool Trace::mentions_reg(intptr_t reg) {
471 for (DeferredAction* action = actions_; action != nullptr;
472 action = action->next()) {
473 if (action->Mentions(that: reg)) return true;
474 }
475 return false;
476}
477
478bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
479 ASSERT(*cp_offset == 0);
480 for (DeferredAction* action = actions_; action != nullptr;
481 action = action->next()) {
482 if (action->Mentions(that: reg)) {
483 if (action->action_type() == ActionNode::STORE_POSITION) {
484 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
485 return true;
486 } else {
487 return false;
488 }
489 }
490 }
491 return false;
492}
493
494// This is called as we come into a loop choice node and some other tricky
495// nodes. It normalizes the state of the code generator to ensure we can
496// generate generic code.
497intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers, Zone* zone) {
498 intptr_t max_register = RegExpCompiler::kNoRegister;
499 for (DeferredAction* action = actions_; action != nullptr;
500 action = action->next()) {
501 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
502 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
503 for (intptr_t i = range.from(); i <= range.to(); i++)
504 affected_registers->Set(value: i, zone);
505 if (range.to() > max_register) max_register = range.to();
506 } else {
507 affected_registers->Set(value: action->reg(), zone);
508 if (action->reg() > max_register) max_register = action->reg();
509 }
510 }
511 return max_register;
512}
513
514void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
515 intptr_t max_register,
516 const OutSet& registers_to_pop,
517 const OutSet& registers_to_clear) {
518 for (intptr_t reg = max_register; reg >= 0; reg--) {
519 if (registers_to_pop.Get(value: reg)) {
520 assembler->PopRegister(register_index: reg);
521 } else if (registers_to_clear.Get(value: reg)) {
522 intptr_t clear_to = reg;
523 while (reg > 0 && registers_to_clear.Get(value: reg - 1)) {
524 reg--;
525 }
526 assembler->ClearRegisters(reg_from: reg, reg_to: clear_to);
527 }
528 }
529}
530
531void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
532 intptr_t max_register,
533 const OutSet& affected_registers,
534 OutSet* registers_to_pop,
535 OutSet* registers_to_clear,
536 Zone* zone) {
537 for (intptr_t reg = 0; reg <= max_register; reg++) {
538 if (!affected_registers.Get(value: reg)) {
539 continue;
540 }
541
542 // The chronologically first deferred action in the trace
543 // is used to infer the action needed to restore a register
544 // to its previous state (or not, if it's safe to ignore it).
545 enum DeferredActionUndoType { ACTION_IGNORE, ACTION_RESTORE, ACTION_CLEAR };
546 DeferredActionUndoType undo_action = ACTION_IGNORE;
547
548 intptr_t value = 0;
549 bool absolute = false;
550 bool clear = false;
551 const intptr_t kNoStore = kMinInt32;
552 intptr_t store_position = kNoStore;
553 // This is a little tricky because we are scanning the actions in reverse
554 // historical order (newest first).
555 for (DeferredAction* action = actions_; action != nullptr;
556 action = action->next()) {
557 if (action->Mentions(that: reg)) {
558 switch (action->action_type()) {
559 case ActionNode::SET_REGISTER: {
560 Trace::DeferredSetRegister* psr =
561 static_cast<Trace::DeferredSetRegister*>(action);
562 if (!absolute) {
563 value += psr->value();
564 absolute = true;
565 }
566 // SET_REGISTER is currently only used for newly introduced loop
567 // counters. They can have a significant previous value if they
568 // occur in a loop. TODO(lrn): Propagate this information, so we
569 // can set undo_action to ACTION_IGNORE if we know there is no
570 // value to restore.
571 undo_action = ACTION_RESTORE;
572 ASSERT(store_position == kNoStore);
573 ASSERT(!clear);
574 break;
575 }
576 case ActionNode::INCREMENT_REGISTER:
577 if (!absolute) {
578 value++;
579 }
580 ASSERT(store_position == kNoStore);
581 ASSERT(!clear);
582 undo_action = ACTION_RESTORE;
583 break;
584 case ActionNode::STORE_POSITION: {
585 Trace::DeferredCapture* pc =
586 static_cast<Trace::DeferredCapture*>(action);
587 if (!clear && store_position == kNoStore) {
588 store_position = pc->cp_offset();
589 }
590
591 // For captures we know that stores and clears alternate.
592 // Other register, are never cleared, and if the occur
593 // inside a loop, they might be assigned more than once.
594 if (reg <= 1) {
595 // Registers zero and one, aka "capture zero", is
596 // always set correctly if we succeed. There is no
597 // need to undo a setting on backtrack, because we
598 // will set it again or fail.
599 undo_action = ACTION_IGNORE;
600 } else {
601 undo_action = pc->is_capture() ? ACTION_CLEAR : ACTION_RESTORE;
602 }
603 ASSERT(!absolute);
604 ASSERT(value == 0);
605 break;
606 }
607 case ActionNode::CLEAR_CAPTURES: {
608 // Since we're scanning in reverse order, if we've already
609 // set the position we have to ignore historically earlier
610 // clearing operations.
611 if (store_position == kNoStore) {
612 clear = true;
613 }
614 undo_action = ACTION_RESTORE;
615 ASSERT(!absolute);
616 ASSERT(value == 0);
617 break;
618 }
619 default:
620 UNREACHABLE();
621 break;
622 }
623 }
624 }
625 // Prepare for the undo-action (e.g., push if it's going to be popped).
626 if (undo_action == ACTION_RESTORE) {
627 assembler->PushRegister(register_index: reg);
628 registers_to_pop->Set(value: reg, zone);
629 } else if (undo_action == ACTION_CLEAR) {
630 registers_to_clear->Set(value: reg, zone);
631 }
632 // Perform the chronologically last action (or accumulated increment)
633 // for the register.
634 if (store_position != kNoStore) {
635 assembler->WriteCurrentPositionToRegister(reg, cp_offset: store_position);
636 } else if (clear) {
637 assembler->ClearRegisters(reg_from: reg, reg_to: reg);
638 } else if (absolute) {
639 assembler->SetRegister(register_index: reg, to: value);
640 } else if (value != 0) {
641 assembler->AdvanceRegister(reg, by: value);
642 }
643 }
644}
645
646// This is called as we come into a loop choice node and some other tricky
647// nodes. It normalizes the state of the code generator to ensure we can
648// generate generic code.
649void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
650 RegExpMacroAssembler* assembler = compiler->macro_assembler();
651
652 ASSERT(!is_trivial());
653
654 if (actions_ == nullptr && backtrack() == nullptr) {
655 // Here we just have some deferred cp advances to fix and we are back to
656 // a normal situation. We may also have to forget some information gained
657 // through a quick check that was already performed.
658 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(by: cp_offset_);
659 // Create a new trivial state and generate the node with that.
660 Trace new_state;
661 successor->Emit(compiler, trace: &new_state);
662 return;
663 }
664
665 // Generate deferred actions here along with code to undo them again.
666 OutSet affected_registers;
667
668 if (backtrack() != nullptr) {
669 // Here we have a concrete backtrack location. These are set up by choice
670 // nodes and so they indicate that we have a deferred save of the current
671 // position which we may need to emit here.
672 assembler->PushCurrentPosition();
673 }
674 Zone* zone = successor->zone();
675 intptr_t max_register = FindAffectedRegisters(affected_registers: &affected_registers, zone);
676 OutSet registers_to_pop;
677 OutSet registers_to_clear;
678 PerformDeferredActions(assembler, max_register, affected_registers,
679 registers_to_pop: &registers_to_pop, registers_to_clear: &registers_to_clear, zone);
680 if (cp_offset_ != 0) {
681 assembler->AdvanceCurrentPosition(by: cp_offset_);
682 }
683
684 // Create a new trivial state and generate the node with that.
685 BlockLabel undo;
686 assembler->PushBacktrack(label: &undo);
687 Trace new_state;
688 successor->Emit(compiler, trace: &new_state);
689
690 // On backtrack we need to restore state.
691 assembler->BindBlock(label: &undo);
692 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
693 registers_to_clear);
694 if (backtrack() == nullptr) {
695 assembler->Backtrack();
696 } else {
697 assembler->PopCurrentPosition();
698 assembler->GoTo(to: backtrack());
699 }
700}
701
702void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
703 RegExpMacroAssembler* assembler = compiler->macro_assembler();
704
705 // Omit flushing the trace. We discard the entire stack frame anyway.
706
707 if (!label()->is_bound()) {
708 // We are completely independent of the trace, since we ignore it,
709 // so this code can be used as the generic version.
710 assembler->BindBlock(label: label());
711 }
712
713 // Throw away everything on the backtrack stack since the start
714 // of the negative submatch and restore the character position.
715 assembler->ReadCurrentPositionFromRegister(reg: current_position_register_);
716 assembler->ReadStackPointerFromRegister(reg: stack_pointer_register_);
717 if (clear_capture_count_ > 0) {
718 // Clear any captures that might have been performed during the success
719 // of the body of the negative look-ahead.
720 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
721 assembler->ClearRegisters(reg_from: clear_capture_start_, reg_to: clear_capture_end);
722 }
723 // Now that we have unwound the stack we find at the top of the stack the
724 // backtrack that the BeginSubmatch node got.
725 assembler->Backtrack();
726}
727
728void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
729 if (!trace->is_trivial()) {
730 trace->Flush(compiler, successor: this);
731 return;
732 }
733 RegExpMacroAssembler* assembler = compiler->macro_assembler();
734 if (!label()->is_bound()) {
735 assembler->BindBlock(label: label());
736 }
737 switch (action_) {
738 case ACCEPT:
739 assembler->Succeed();
740 return;
741 case BACKTRACK:
742 assembler->GoTo(to: trace->backtrack());
743 return;
744 case NEGATIVE_SUBMATCH_SUCCESS:
745 // This case is handled in a different virtual method.
746 UNREACHABLE();
747 }
748 UNIMPLEMENTED();
749}
750
751void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
752 if (guards_ == nullptr) guards_ = new (zone) ZoneGrowableArray<Guard*>(1);
753 guards_->Add(value: guard);
754}
755
756ActionNode* ActionNode::SetRegister(intptr_t reg,
757 intptr_t val,
758 RegExpNode* on_success) {
759 ActionNode* result =
760 new (on_success->zone()) ActionNode(SET_REGISTER, on_success);
761 result->data_.u_store_register.reg = reg;
762 result->data_.u_store_register.value = val;
763 return result;
764}
765
766ActionNode* ActionNode::IncrementRegister(intptr_t reg,
767 RegExpNode* on_success) {
768 ActionNode* result =
769 new (on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
770 result->data_.u_increment_register.reg = reg;
771 return result;
772}
773
774ActionNode* ActionNode::StorePosition(intptr_t reg,
775 bool is_capture,
776 RegExpNode* on_success) {
777 ActionNode* result =
778 new (on_success->zone()) ActionNode(STORE_POSITION, on_success);
779 result->data_.u_position_register.reg = reg;
780 result->data_.u_position_register.is_capture = is_capture;
781 return result;
782}
783
784ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
785 ActionNode* result =
786 new (on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
787 result->data_.u_clear_captures.range_from = range.from();
788 result->data_.u_clear_captures.range_to = range.to();
789 return result;
790}
791
792ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg,
793 intptr_t position_reg,
794 RegExpNode* on_success) {
795 ActionNode* result =
796 new (on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
797 result->data_.u_submatch.stack_pointer_register = stack_reg;
798 result->data_.u_submatch.current_position_register = position_reg;
799 return result;
800}
801
802ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg,
803 intptr_t position_reg,
804 intptr_t clear_register_count,
805 intptr_t clear_register_from,
806 RegExpNode* on_success) {
807 ActionNode* result = new (on_success->zone())
808 ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
809 result->data_.u_submatch.stack_pointer_register = stack_reg;
810 result->data_.u_submatch.current_position_register = position_reg;
811 result->data_.u_submatch.clear_register_count = clear_register_count;
812 result->data_.u_submatch.clear_register_from = clear_register_from;
813 return result;
814}
815
816ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register,
817 intptr_t repetition_register,
818 intptr_t repetition_limit,
819 RegExpNode* on_success) {
820 ActionNode* result =
821 new (on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
822 result->data_.u_empty_match_check.start_register = start_register;
823 result->data_.u_empty_match_check.repetition_register = repetition_register;
824 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
825 return result;
826}
827
828#define DEFINE_ACCEPT(Type) \
829 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
830FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
831#undef DEFINE_ACCEPT
832
833void LoopChoiceNode::Accept(NodeVisitor* visitor) {
834 visitor->VisitLoopChoice(that: this);
835}
836
837// -------------------------------------------------------------------
838// Emit code.
839
840void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
841 Guard* guard,
842 Trace* trace) {
843 switch (guard->op()) {
844 case Guard::LT:
845 ASSERT(!trace->mentions_reg(guard->reg()));
846 macro_assembler->IfRegisterGE(reg: guard->reg(), comparand: guard->value(),
847 if_ge: trace->backtrack());
848 break;
849 case Guard::GEQ:
850 ASSERT(!trace->mentions_reg(guard->reg()));
851 macro_assembler->IfRegisterLT(reg: guard->reg(), comparand: guard->value(),
852 if_lt: trace->backtrack());
853 break;
854 }
855}
856
857// Returns the number of characters in the equivalence class, omitting those
858// that cannot occur in the source string because it is ASCII.
859static intptr_t GetCaseIndependentLetters(uint16_t character,
860 bool one_byte_subject,
861 int32_t* letters) {
862 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
863 intptr_t length = jsregexp_uncanonicalize.get(c: character, n: '\0', result: letters);
864 // Unibrow returns 0 or 1 for characters where case independence is
865 // trivial.
866 if (length == 0) {
867 letters[0] = character;
868 length = 1;
869 }
870 if (!one_byte_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
871 return length;
872 }
873
874 // The standard requires that non-ASCII characters cannot have ASCII
875 // character codes in their equivalence class.
876 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
877 // is it? For example, \u00C5 is equivalent to \u212B.
878 return 0;
879}
880
881static inline bool EmitSimpleCharacter(Zone* zone,
882 RegExpCompiler* compiler,
883 uint16_t c,
884 BlockLabel* on_failure,
885 intptr_t cp_offset,
886 bool check,
887 bool preloaded) {
888 RegExpMacroAssembler* assembler = compiler->macro_assembler();
889 bool bound_checked = false;
890 if (!preloaded) {
891 assembler->LoadCurrentCharacter(cp_offset, on_end_of_input: on_failure, check_bounds: check);
892 bound_checked = true;
893 }
894 assembler->CheckNotCharacter(c, on_not_equal: on_failure);
895 return bound_checked;
896}
897
898// Only emits non-letters (things that don't have case). Only used for case
899// independent matches.
900static inline bool EmitAtomNonLetter(Zone* zone,
901 RegExpCompiler* compiler,
902 uint16_t c,
903 BlockLabel* on_failure,
904 intptr_t cp_offset,
905 bool check,
906 bool preloaded) {
907 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
908 bool one_byte = compiler->one_byte();
909 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
910 intptr_t length = GetCaseIndependentLetters(character: c, one_byte_subject: one_byte, letters: chars);
911 if (length < 1) {
912 // This can't match. Must be an one-byte subject and a non-one-byte
913 // character. We do not need to do anything since the one-byte pass
914 // already handled this.
915 return false; // Bounds not checked.
916 }
917 bool checked = false;
918 // We handle the length > 1 case in a later pass.
919 if (length == 1) {
920 if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {
921 // Can't match - see above.
922 return false; // Bounds not checked.
923 }
924 if (!preloaded) {
925 macro_assembler->LoadCurrentCharacter(cp_offset, on_end_of_input: on_failure, check_bounds: check);
926 checked = check;
927 }
928 macro_assembler->CheckNotCharacter(c, on_not_equal: on_failure);
929 }
930 return checked;
931}
932
933static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
934 bool one_byte,
935 uint16_t c1,
936 uint16_t c2,
937 BlockLabel* on_failure) {
938 uint16_t char_mask;
939 if (one_byte) {
940 char_mask = Symbols::kMaxOneCharCodeSymbol;
941 } else {
942 char_mask = Utf16::kMaxCodeUnit;
943 }
944 uint16_t exor = c1 ^ c2;
945 // Check whether exor has only one bit set.
946 if (((exor - 1) & exor) == 0) {
947 // If c1 and c2 differ only by one bit.
948 // Ecma262UnCanonicalize always gives the highest number last.
949 ASSERT(c2 > c1);
950 uint16_t mask = char_mask ^ exor;
951 macro_assembler->CheckNotCharacterAfterAnd(c: c1, and_with: mask, on_not_equal: on_failure);
952 return true;
953 }
954 ASSERT(c2 > c1);
955 uint16_t diff = c2 - c1;
956 if (((diff - 1) & diff) == 0 && c1 >= diff) {
957 // If the characters differ by 2^n but don't differ by one bit then
958 // subtract the difference from the found character, then do the or
959 // trick. We avoid the theoretical case where negative numbers are
960 // involved in order to simplify code generation.
961 uint16_t mask = char_mask ^ diff;
962 macro_assembler->CheckNotCharacterAfterMinusAnd(c: c1 - diff, minus: diff, and_with: mask,
963 on_not_equal: on_failure);
964 return true;
965 }
966 return false;
967}
968
969typedef bool EmitCharacterFunction(Zone* zone,
970 RegExpCompiler* compiler,
971 uint16_t c,
972 BlockLabel* on_failure,
973 intptr_t cp_offset,
974 bool check,
975 bool preloaded);
976
977// Only emits letters (things that have case). Only used for case independent
978// matches.
979static inline bool EmitAtomLetter(Zone* zone,
980 RegExpCompiler* compiler,
981 uint16_t c,
982 BlockLabel* on_failure,
983 intptr_t cp_offset,
984 bool check,
985 bool preloaded) {
986 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
987 bool one_byte = compiler->one_byte();
988 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
989 intptr_t length = GetCaseIndependentLetters(character: c, one_byte_subject: one_byte, letters: chars);
990 if (length <= 1) return false;
991 // We may not need to check against the end of the input string
992 // if this character lies before a character that matched.
993 if (!preloaded) {
994 macro_assembler->LoadCurrentCharacter(cp_offset, on_end_of_input: on_failure, check_bounds: check);
995 }
996 BlockLabel ok;
997 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
998 switch (length) {
999 case 2: {
1000 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, c1: chars[0],
1001 c2: chars[1], on_failure)) {
1002 } else {
1003 macro_assembler->CheckCharacter(c: chars[0], on_equal: &ok);
1004 macro_assembler->CheckNotCharacter(c: chars[1], on_not_equal: on_failure);
1005 macro_assembler->BindBlock(label: &ok);
1006 }
1007 break;
1008 }
1009 case 4:
1010 macro_assembler->CheckCharacter(c: chars[3], on_equal: &ok);
1011 FALL_THROUGH;
1012 case 3:
1013 macro_assembler->CheckCharacter(c: chars[0], on_equal: &ok);
1014 macro_assembler->CheckCharacter(c: chars[1], on_equal: &ok);
1015 macro_assembler->CheckNotCharacter(c: chars[2], on_not_equal: on_failure);
1016 macro_assembler->BindBlock(label: &ok);
1017 break;
1018 default:
1019 UNREACHABLE();
1020 break;
1021 }
1022 return true;
1023}
1024
1025static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1026 uint16_t border,
1027 BlockLabel* fall_through,
1028 BlockLabel* above_or_equal,
1029 BlockLabel* below) {
1030 if (below != fall_through) {
1031 masm->CheckCharacterLT(limit: border, on_less: below);
1032 if (above_or_equal != fall_through) masm->GoTo(to: above_or_equal);
1033 } else {
1034 masm->CheckCharacterGT(limit: border - 1, on_greater: above_or_equal);
1035 }
1036}
1037
1038static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1039 uint16_t first,
1040 uint16_t last,
1041 BlockLabel* fall_through,
1042 BlockLabel* in_range,
1043 BlockLabel* out_of_range) {
1044 if (in_range == fall_through) {
1045 if (first == last) {
1046 masm->CheckNotCharacter(c: first, on_not_equal: out_of_range);
1047 } else {
1048 masm->CheckCharacterNotInRange(from: first, to: last, on_not_in_range: out_of_range);
1049 }
1050 } else {
1051 if (first == last) {
1052 masm->CheckCharacter(c: first, on_equal: in_range);
1053 } else {
1054 masm->CheckCharacterInRange(from: first, to: last, on_in_range: in_range);
1055 }
1056 if (out_of_range != fall_through) masm->GoTo(to: out_of_range);
1057 }
1058}
1059
1060// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1061// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1062static void EmitUseLookupTable(RegExpMacroAssembler* masm,
1063 ZoneGrowableArray<uint16_t>* ranges,
1064 intptr_t start_index,
1065 intptr_t end_index,
1066 uint16_t min_char,
1067 BlockLabel* fall_through,
1068 BlockLabel* even_label,
1069 BlockLabel* odd_label) {
1070 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1071 const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1072
1073 intptr_t base = (min_char & ~kMask);
1074
1075 // Assert that everything is on one kTableSize page.
1076 for (intptr_t i = start_index; i <= end_index; i++) {
1077 ASSERT((ranges->At(i) & ~kMask) == base);
1078 }
1079 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
1080
1081 char templ[kSize];
1082 BlockLabel* on_bit_set;
1083 BlockLabel* on_bit_clear;
1084 intptr_t bit;
1085 if (even_label == fall_through) {
1086 on_bit_set = odd_label;
1087 on_bit_clear = even_label;
1088 bit = 1;
1089 } else {
1090 on_bit_set = even_label;
1091 on_bit_clear = odd_label;
1092 bit = 0;
1093 }
1094 for (intptr_t i = 0; i < (ranges->At(index: start_index) & kMask) && i < kSize;
1095 i++) {
1096 templ[i] = bit;
1097 }
1098 intptr_t j = 0;
1099 bit ^= 1;
1100 for (intptr_t i = start_index; i < end_index; i++) {
1101 for (j = (ranges->At(index: i) & kMask); j < (ranges->At(index: i + 1) & kMask); j++) {
1102 templ[j] = bit;
1103 }
1104 bit ^= 1;
1105 }
1106 for (intptr_t i = j; i < kSize; i++) {
1107 templ[i] = bit;
1108 }
1109 // TODO(erikcorry): Cache these.
1110 const TypedData& ba = TypedData::ZoneHandle(
1111 zone: masm->zone(), ptr: TypedData::New(class_id: kTypedDataUint8ArrayCid, len: kSize, space: Heap::kOld));
1112 for (intptr_t i = 0; i < kSize; i++) {
1113 ba.SetUint8(byte_offset: i, value: templ[i]);
1114 }
1115 masm->CheckBitInTable(table: ba, on_bit_set);
1116 if (on_bit_clear != fall_through) masm->GoTo(to: on_bit_clear);
1117}
1118
1119static void CutOutRange(RegExpMacroAssembler* masm,
1120 ZoneGrowableArray<uint16_t>* ranges,
1121 intptr_t start_index,
1122 intptr_t end_index,
1123 intptr_t cut_index,
1124 BlockLabel* even_label,
1125 BlockLabel* odd_label) {
1126 bool odd = (((cut_index - start_index) & 1) == 1);
1127 BlockLabel* in_range_label = odd ? odd_label : even_label;
1128 BlockLabel dummy;
1129 EmitDoubleBoundaryTest(masm, first: ranges->At(index: cut_index),
1130 last: ranges->At(index: cut_index + 1) - 1, fall_through: &dummy, in_range: in_range_label,
1131 out_of_range: &dummy);
1132 ASSERT(!dummy.is_linked());
1133 // Cut out the single range by rewriting the array. This creates a new
1134 // range that is a merger of the two ranges on either side of the one we
1135 // are cutting out. The oddity of the labels is preserved.
1136 for (intptr_t j = cut_index; j > start_index; j--) {
1137 (*ranges)[j] = ranges->At(index: j - 1);
1138 }
1139 for (intptr_t j = cut_index + 1; j < end_index; j++) {
1140 (*ranges)[j] = ranges->At(index: j + 1);
1141 }
1142}
1143
1144// Unicode case. Split the search space into kSize spaces that are handled
1145// with recursion.
1146static void SplitSearchSpace(ZoneGrowableArray<uint16_t>* ranges,
1147 intptr_t start_index,
1148 intptr_t end_index,
1149 intptr_t* new_start_index,
1150 intptr_t* new_end_index,
1151 uint16_t* border) {
1152 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1153 const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1154
1155 uint16_t first = ranges->At(index: start_index);
1156 uint16_t last = ranges->At(index: end_index) - 1;
1157
1158 *new_start_index = start_index;
1159 *border = (ranges->At(index: start_index) & ~kMask) + kSize;
1160 while (*new_start_index < end_index) {
1161 if (ranges->At(index: *new_start_index) > *border) break;
1162 (*new_start_index)++;
1163 }
1164 // new_start_index is the index of the first edge that is beyond the
1165 // current kSize space.
1166
1167 // For very large search spaces we do a binary chop search of the non-Latin1
1168 // space instead of just going to the end of the current kSize space. The
1169 // heuristics are complicated a little by the fact that any 128-character
1170 // encoding space can be quickly tested with a table lookup, so we don't
1171 // wish to do binary chop search at a smaller granularity than that. A
1172 // 128-character space can take up a lot of space in the ranges array if,
1173 // for example, we only want to match every second character (eg. the lower
1174 // case characters on some Unicode pages).
1175 intptr_t binary_chop_index = (end_index + start_index) / 2;
1176 // The first test ensures that we get to the code that handles the Latin1
1177 // range with a single not-taken branch, speeding up this important
1178 // character range (even non-Latin1 charset-based text has spaces and
1179 // punctuation).
1180 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.
1181 end_index - start_index > (*new_start_index - start_index) * 2 &&
1182 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1183 ranges->At(index: binary_chop_index) >= first + 2 * kSize) {
1184 intptr_t scan_forward_for_section_border = binary_chop_index;
1185 intptr_t new_border = (ranges->At(index: binary_chop_index) | kMask) + 1;
1186
1187 while (scan_forward_for_section_border < end_index) {
1188 if (ranges->At(index: scan_forward_for_section_border) > new_border) {
1189 *new_start_index = scan_forward_for_section_border;
1190 *border = new_border;
1191 break;
1192 }
1193 scan_forward_for_section_border++;
1194 }
1195 }
1196
1197 ASSERT(*new_start_index > start_index);
1198 *new_end_index = *new_start_index - 1;
1199 if (ranges->At(index: *new_end_index) == *border) {
1200 (*new_end_index)--;
1201 }
1202 if (*border >= ranges->At(index: end_index)) {
1203 *border = ranges->At(index: end_index);
1204 *new_start_index = end_index; // Won't be used.
1205 *new_end_index = end_index - 1;
1206 }
1207}
1208
1209// Gets a series of segment boundaries representing a character class. If the
1210// character is in the range between an even and an odd boundary (counting from
1211// start_index) then go to even_label, otherwise go to odd_label. We already
1212// know that the character is in the range of min_char to max_char inclusive.
1213// Either label can be null indicating backtracking. Either label can also be
1214// equal to the fall_through label.
1215static void GenerateBranches(RegExpMacroAssembler* masm,
1216 ZoneGrowableArray<uint16_t>* ranges,
1217 intptr_t start_index,
1218 intptr_t end_index,
1219 uint16_t min_char,
1220 uint16_t max_char,
1221 BlockLabel* fall_through,
1222 BlockLabel* even_label,
1223 BlockLabel* odd_label) {
1224 uint16_t first = ranges->At(index: start_index);
1225 uint16_t last = ranges->At(index: end_index) - 1;
1226
1227 ASSERT(min_char < first);
1228
1229 // Just need to test if the character is before or on-or-after
1230 // a particular character.
1231 if (start_index == end_index) {
1232 EmitBoundaryTest(masm, border: first, fall_through, above_or_equal: even_label, below: odd_label);
1233 return;
1234 }
1235
1236 // Another almost trivial case: There is one interval in the middle that is
1237 // different from the end intervals.
1238 if (start_index + 1 == end_index) {
1239 EmitDoubleBoundaryTest(masm, first, last, fall_through, in_range: even_label,
1240 out_of_range: odd_label);
1241 return;
1242 }
1243
1244 // It's not worth using table lookup if there are very few intervals in the
1245 // character class.
1246 if (end_index - start_index <= 6) {
1247 // It is faster to test for individual characters, so we look for those
1248 // first, then try arbitrary ranges in the second round.
1249 static intptr_t kNoCutIndex = -1;
1250 intptr_t cut = kNoCutIndex;
1251 for (intptr_t i = start_index; i < end_index; i++) {
1252 if (ranges->At(index: i) == ranges->At(index: i + 1) - 1) {
1253 cut = i;
1254 break;
1255 }
1256 }
1257 if (cut == kNoCutIndex) cut = start_index;
1258 CutOutRange(masm, ranges, start_index, end_index, cut_index: cut, even_label,
1259 odd_label);
1260 ASSERT(end_index - start_index >= 2);
1261 GenerateBranches(masm, ranges, start_index: start_index + 1, end_index: end_index - 1, min_char,
1262 max_char, fall_through, even_label, odd_label);
1263 return;
1264 }
1265
1266 // If there are a lot of intervals in the regexp, then we will use tables to
1267 // determine whether the character is inside or outside the character class.
1268 const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;
1269
1270 if ((max_char >> kBits) == (min_char >> kBits)) {
1271 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1272 fall_through, even_label, odd_label);
1273 return;
1274 }
1275
1276 if ((min_char >> kBits) != (first >> kBits)) {
1277 masm->CheckCharacterLT(limit: first, on_less: odd_label);
1278 GenerateBranches(masm, ranges, start_index: start_index + 1, end_index, min_char: first, max_char,
1279 fall_through, even_label: odd_label, odd_label: even_label);
1280 return;
1281 }
1282
1283 intptr_t new_start_index = 0;
1284 intptr_t new_end_index = 0;
1285 uint16_t border = 0;
1286
1287 SplitSearchSpace(ranges, start_index, end_index, new_start_index: &new_start_index,
1288 new_end_index: &new_end_index, border: &border);
1289
1290 BlockLabel handle_rest;
1291 BlockLabel* above = &handle_rest;
1292 if (border == last + 1) {
1293 // We didn't find any section that started after the limit, so everything
1294 // above the border is one of the terminal labels.
1295 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1296 ASSERT(new_end_index == end_index - 1);
1297 }
1298
1299 ASSERT(start_index <= new_end_index);
1300 ASSERT(new_start_index <= end_index);
1301 ASSERT(start_index < new_start_index);
1302 ASSERT(new_end_index < end_index);
1303 ASSERT(new_end_index + 1 == new_start_index ||
1304 (new_end_index + 2 == new_start_index &&
1305 border == ranges->At(new_end_index + 1)));
1306 ASSERT(min_char < border - 1);
1307 ASSERT(border < max_char);
1308 ASSERT(ranges->At(new_end_index) < border);
1309 ASSERT(border < ranges->At(new_start_index) ||
1310 (border == ranges->At(new_start_index) &&
1311 new_start_index == end_index && new_end_index == end_index - 1 &&
1312 border == last + 1));
1313 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
1314
1315 masm->CheckCharacterGT(limit: border - 1, on_greater: above);
1316 BlockLabel dummy;
1317 GenerateBranches(masm, ranges, start_index, end_index: new_end_index, min_char,
1318 max_char: border - 1, fall_through: &dummy, even_label, odd_label);
1319
1320 if (handle_rest.is_linked()) {
1321 masm->BindBlock(label: &handle_rest);
1322 bool flip = (new_start_index & 1) != (start_index & 1);
1323 GenerateBranches(masm, ranges, start_index: new_start_index, end_index, min_char: border, max_char,
1324 fall_through: &dummy, even_label: flip ? odd_label : even_label,
1325 odd_label: flip ? even_label : odd_label);
1326 }
1327}
1328
1329static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1330 RegExpCharacterClass* cc,
1331 bool one_byte,
1332 BlockLabel* on_failure,
1333 intptr_t cp_offset,
1334 bool check_offset,
1335 bool preloaded,
1336 Zone* zone) {
1337 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1338 if (!CharacterRange::IsCanonical(ranges)) {
1339 CharacterRange::Canonicalize(ranges);
1340 }
1341
1342 uint16_t max_char;
1343 if (one_byte) {
1344 max_char = Symbols::kMaxOneCharCodeSymbol;
1345 } else {
1346 max_char = Utf16::kMaxCodeUnit;
1347 }
1348
1349 intptr_t range_count = ranges->length();
1350
1351 intptr_t last_valid_range = range_count - 1;
1352 while (last_valid_range >= 0) {
1353 const CharacterRange& range = ranges->At(index: last_valid_range);
1354 if (range.from() <= max_char) {
1355 break;
1356 }
1357 last_valid_range--;
1358 }
1359
1360 if (last_valid_range < 0) {
1361 if (!cc->is_negated()) {
1362 macro_assembler->GoTo(to: on_failure);
1363 }
1364 if (check_offset) {
1365 macro_assembler->CheckPosition(cp_offset, on_outside_input: on_failure);
1366 }
1367 return;
1368 }
1369
1370 if (last_valid_range == 0 && ranges->At(index: 0).IsEverything(max: max_char)) {
1371 if (cc->is_negated()) {
1372 macro_assembler->GoTo(to: on_failure);
1373 } else {
1374 // This is a common case hit by non-anchored expressions.
1375 if (check_offset) {
1376 macro_assembler->CheckPosition(cp_offset, on_outside_input: on_failure);
1377 }
1378 }
1379 return;
1380 }
1381
1382 if (!preloaded) {
1383 macro_assembler->LoadCurrentCharacter(cp_offset, on_end_of_input: on_failure, check_bounds: check_offset);
1384 }
1385
1386 if (cc->is_standard() && macro_assembler->CheckSpecialCharacterClass(
1387 type: cc->standard_type(), on_no_match: on_failure)) {
1388 return;
1389 }
1390
1391 // A new list with ascending entries. Each entry is a code unit
1392 // where there is a boundary between code units that are part of
1393 // the class and code units that are not. Normally we insert an
1394 // entry at zero which goes to the failure label, but if there
1395 // was already one there we fall through for success on that entry.
1396 // Subsequent entries have alternating meaning (success/failure).
1397 ZoneGrowableArray<uint16_t>* range_boundaries =
1398 new (zone) ZoneGrowableArray<uint16_t>(last_valid_range);
1399
1400 bool zeroth_entry_is_failure = !cc->is_negated();
1401
1402 for (intptr_t i = 0; i <= last_valid_range; i++) {
1403 const CharacterRange& range = ranges->At(index: i);
1404 if (range.from() == 0) {
1405 ASSERT(i == 0);
1406 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1407 } else {
1408 range_boundaries->Add(value: range.from());
1409 }
1410 if (range.to() + 1 <= max_char) {
1411 range_boundaries->Add(value: range.to() + 1);
1412 }
1413 }
1414 intptr_t end_index = range_boundaries->length() - 1;
1415
1416 BlockLabel fall_through;
1417 GenerateBranches(masm: macro_assembler, ranges: range_boundaries,
1418 start_index: 0, // start_index.
1419 end_index,
1420 min_char: 0, // min_char.
1421 max_char, fall_through: &fall_through,
1422 even_label: zeroth_entry_is_failure ? &fall_through : on_failure,
1423 odd_label: zeroth_entry_is_failure ? on_failure : &fall_through);
1424 macro_assembler->BindBlock(label: &fall_through);
1425}
1426
1427RegExpNode::~RegExpNode() {}
1428
1429RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1430 Trace* trace) {
1431 // If we are generating a greedy loop then don't stop and don't reuse code.
1432 if (trace->stop_node() != nullptr) {
1433 return CONTINUE;
1434 }
1435
1436 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1437 if (trace->is_trivial()) {
1438 if (label_.is_bound()) {
1439 // We are being asked to generate a generic version, but that's already
1440 // been done so just go to it.
1441 macro_assembler->GoTo(to: &label_);
1442 return DONE;
1443 }
1444 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1445 // To avoid too deep recursion we push the node to the work queue and just
1446 // generate a goto here.
1447 compiler->AddWork(node: this);
1448 macro_assembler->GoTo(to: &label_);
1449 return DONE;
1450 }
1451 // Generate generic version of the node and bind the label for later use.
1452 macro_assembler->BindBlock(label: &label_);
1453 return CONTINUE;
1454 }
1455
1456 // We are being asked to make a non-generic version. Keep track of how many
1457 // non-generic versions we generate so as not to overdo it.
1458 trace_count_++;
1459 if (kRegexpOptimization && trace_count_ < kMaxCopiesCodeGenerated &&
1460 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1461 return CONTINUE;
1462 }
1463
1464 // If we get here code has been generated for this node too many times or
1465 // recursion is too deep. Time to switch to a generic version. The code for
1466 // generic versions above can handle deep recursion properly.
1467 trace->Flush(compiler, successor: this);
1468 return DONE;
1469}
1470
1471intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
1472 intptr_t budget,
1473 bool not_at_start) {
1474 if (budget <= 0) return 0;
1475 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1476 return on_success()->EatsAtLeast(still_to_find, budget: budget - 1, not_at_start);
1477}
1478
1479void ActionNode::FillInBMInfo(intptr_t offset,
1480 intptr_t budget,
1481 BoyerMooreLookahead* bm,
1482 bool not_at_start) {
1483 if (action_type_ == BEGIN_SUBMATCH) {
1484 bm->SetRest(offset);
1485 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1486 on_success()->FillInBMInfo(offset, budget: budget - 1, bm, not_at_start);
1487 }
1488 SaveBMInfo(bm, not_at_start, offset);
1489}
1490
1491intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
1492 intptr_t budget,
1493 bool not_at_start) {
1494 if (budget <= 0) return 0;
1495 // If we know we are not at the start and we are asked "how many characters
1496 // will you match if you succeed?" then we can answer anything since false
1497 // implies false. So lets just return the max answer (still_to_find) since
1498 // that won't prevent us from preloading a lot of characters for the other
1499 // branches in the node graph.
1500 if (assertion_type() == AT_START && not_at_start) return still_to_find;
1501 return on_success()->EatsAtLeast(still_to_find, budget: budget - 1, not_at_start);
1502}
1503
1504void AssertionNode::FillInBMInfo(intptr_t offset,
1505 intptr_t budget,
1506 BoyerMooreLookahead* bm,
1507 bool not_at_start) {
1508 // Match the behaviour of EatsAtLeast on this node.
1509 if (assertion_type() == AT_START && not_at_start) return;
1510 on_success()->FillInBMInfo(offset, budget: budget - 1, bm, not_at_start);
1511 SaveBMInfo(bm, not_at_start, offset);
1512}
1513
1514intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
1515 intptr_t budget,
1516 bool not_at_start) {
1517 if (read_backward()) return 0;
1518 if (budget <= 0) return 0;
1519 return on_success()->EatsAtLeast(still_to_find, budget: budget - 1, not_at_start);
1520}
1521
1522intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
1523 intptr_t budget,
1524 bool not_at_start) {
1525 if (read_backward()) return 0;
1526 intptr_t answer = Length();
1527 if (answer >= still_to_find) return answer;
1528 if (budget <= 0) return answer;
1529 // We are not at start after this node so we set the last argument to 'true'.
1530 return answer +
1531 on_success()->EatsAtLeast(still_to_find: still_to_find - answer, budget: budget - 1, not_at_start: true);
1532}
1533
1534intptr_t NegativeLookaroundChoiceNode::EatsAtLeast(intptr_t still_to_find,
1535 intptr_t budget,
1536 bool not_at_start) {
1537 if (budget <= 0) return 0;
1538 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1539 // afterwards.
1540 RegExpNode* node = (*alternatives_)[1].node();
1541 return node->EatsAtLeast(still_to_find, budget: budget - 1, not_at_start);
1542}
1543
1544void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1545 QuickCheckDetails* details,
1546 RegExpCompiler* compiler,
1547 intptr_t filled_in,
1548 bool not_at_start) {
1549 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1550 // afterwards.
1551 RegExpNode* node = (*alternatives_)[1].node();
1552 return node->GetQuickCheckDetails(details, compiler, characters_filled_in: filled_in, not_at_start);
1553}
1554
1555intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
1556 intptr_t budget,
1557 RegExpNode* ignore_this_node,
1558 bool not_at_start) {
1559 if (budget <= 0) return 0;
1560 intptr_t min = 100;
1561 intptr_t choice_count = alternatives_->length();
1562 budget = (budget - 1) / choice_count;
1563 for (intptr_t i = 0; i < choice_count; i++) {
1564 RegExpNode* node = (*alternatives_)[i].node();
1565 if (node == ignore_this_node) continue;
1566 intptr_t node_eats_at_least =
1567 node->EatsAtLeast(still_to_find, budget, not_at_start);
1568 if (node_eats_at_least < min) min = node_eats_at_least;
1569 if (min == 0) return 0;
1570 }
1571 return min;
1572}
1573
1574intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1575 intptr_t budget,
1576 bool not_at_start) {
1577 return EatsAtLeastHelper(still_to_find, budget: budget - 1, ignore_this_node: loop_node_, not_at_start);
1578}
1579
1580intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
1581 intptr_t budget,
1582 bool not_at_start) {
1583 return EatsAtLeastHelper(still_to_find, budget, ignore_this_node: nullptr, not_at_start);
1584}
1585
1586// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1587static inline uint32_t SmearBitsRight(uint32_t v) {
1588 v |= v >> 1;
1589 v |= v >> 2;
1590 v |= v >> 4;
1591 v |= v >> 8;
1592 v |= v >> 16;
1593 return v;
1594}
1595
1596bool QuickCheckDetails::Rationalize(bool asc) {
1597 bool found_useful_op = false;
1598 uint32_t char_mask;
1599 if (asc) {
1600 char_mask = Symbols::kMaxOneCharCodeSymbol;
1601 } else {
1602 char_mask = Utf16::kMaxCodeUnit;
1603 }
1604 mask_ = 0;
1605 value_ = 0;
1606 intptr_t char_shift = 0;
1607 for (intptr_t i = 0; i < characters_; i++) {
1608 Position* pos = &positions_[i];
1609 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1610 found_useful_op = true;
1611 }
1612 mask_ |= (pos->mask & char_mask) << char_shift;
1613 value_ |= (pos->value & char_mask) << char_shift;
1614 char_shift += asc ? 8 : 16;
1615 }
1616 return found_useful_op;
1617}
1618
1619bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1620 Trace* bounds_check_trace,
1621 Trace* trace,
1622 bool preload_has_checked_bounds,
1623 BlockLabel* on_possible_success,
1624 QuickCheckDetails* details,
1625 bool fall_through_on_failure) {
1626 if (details->characters() == 0) return false;
1627 GetQuickCheckDetails(details, compiler, characters_filled_in: 0,
1628 not_at_start: trace->at_start() == Trace::FALSE_VALUE);
1629 if (details->cannot_match()) return false;
1630 if (!details->Rationalize(asc: compiler->one_byte())) return false;
1631 ASSERT(details->characters() == 1 ||
1632 compiler->macro_assembler()->CanReadUnaligned());
1633 uint32_t mask = details->mask();
1634 uint32_t value = details->value();
1635
1636 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1637
1638 if (trace->characters_preloaded() != details->characters()) {
1639 ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());
1640 // We are attempting to preload the minimum number of characters
1641 // any choice would eat, so if the bounds check fails, then none of the
1642 // choices can succeed, so we can just immediately backtrack, rather
1643 // than go to the next choice.
1644 assembler->LoadCurrentCharacter(
1645 cp_offset: trace->cp_offset(), on_end_of_input: bounds_check_trace->backtrack(),
1646 check_bounds: !preload_has_checked_bounds, characters: details->characters());
1647 }
1648
1649 bool need_mask = true;
1650
1651 if (details->characters() == 1) {
1652 // If number of characters preloaded is 1 then we used a byte or 16 bit
1653 // load so the value is already masked down.
1654 uint32_t char_mask;
1655 if (compiler->one_byte()) {
1656 char_mask = Symbols::kMaxOneCharCodeSymbol;
1657 } else {
1658 char_mask = Utf16::kMaxCodeUnit;
1659 }
1660 if ((mask & char_mask) == char_mask) need_mask = false;
1661 mask &= char_mask;
1662 } else {
1663 // For 2-character preloads in one-byte mode or 1-character preloads in
1664 // two-byte mode we also use a 16 bit load with zero extend.
1665 if (details->characters() == 2 && compiler->one_byte()) {
1666 if ((mask & 0xffff) == 0xffff) need_mask = false;
1667 } else if (details->characters() == 1 && !compiler->one_byte()) {
1668 if ((mask & 0xffff) == 0xffff) need_mask = false;
1669 } else {
1670 if (mask == 0xffffffff) need_mask = false;
1671 }
1672 }
1673
1674 if (fall_through_on_failure) {
1675 if (need_mask) {
1676 assembler->CheckCharacterAfterAnd(c: value, and_with: mask, on_equal: on_possible_success);
1677 } else {
1678 assembler->CheckCharacter(c: value, on_equal: on_possible_success);
1679 }
1680 } else {
1681 if (need_mask) {
1682 assembler->CheckNotCharacterAfterAnd(c: value, and_with: mask, on_not_equal: trace->backtrack());
1683 } else {
1684 assembler->CheckNotCharacter(c: value, on_not_equal: trace->backtrack());
1685 }
1686 }
1687 return true;
1688}
1689
1690// Here is the meat of GetQuickCheckDetails (see also the comment on the
1691// super-class in the .h file).
1692//
1693// We iterate along the text object, building up for each character a
1694// mask and value that can be used to test for a quick failure to match.
1695// The masks and values for the positions will be combined into a single
1696// machine word for the current character width in order to be used in
1697// generating a quick check.
1698void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1699 RegExpCompiler* compiler,
1700 intptr_t characters_filled_in,
1701 bool not_at_start) {
1702#if defined(__GNUC__) && defined(__BYTE_ORDER__)
1703 // TODO(zerny): Make the combination code byte-order independent.
1704 ASSERT(details->characters() == 1 ||
1705 (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__));
1706#endif
1707 // Do not collect any quick check details if the text node reads backward,
1708 // since it reads in the opposite direction than we use for quick checks.
1709 if (read_backward()) return;
1710 ASSERT(characters_filled_in < details->characters());
1711 intptr_t characters = details->characters();
1712 int32_t char_mask;
1713 if (compiler->one_byte()) {
1714 char_mask = Symbols::kMaxOneCharCodeSymbol;
1715 } else {
1716 char_mask = Utf16::kMaxCodeUnit;
1717 }
1718 for (intptr_t k = 0; k < elms_->length(); k++) {
1719 TextElement elm = elms_->At(index: k);
1720 if (elm.text_type() == TextElement::ATOM) {
1721 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1722 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
1723 QuickCheckDetails::Position* pos =
1724 details->positions(index: characters_filled_in);
1725 uint16_t c = quarks->At(index: i);
1726 if (c > char_mask) {
1727 // If we expect a non-Latin1 character from an one-byte string,
1728 // there is no way we can match. Not even case independent
1729 // matching can turn an Latin1 character into non-Latin1 or
1730 // vice versa.
1731 // TODO(dcarney): issue 3550. Verify that this works as expected.
1732 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
1733 details->set_cannot_match();
1734 pos->determines_perfectly = false;
1735 return;
1736 }
1737 if (elm.atom()->ignore_case()) {
1738 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1739 intptr_t length =
1740 GetCaseIndependentLetters(character: c, one_byte_subject: compiler->one_byte(), letters: chars);
1741 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1742 if (length == 1) {
1743 // This letter has no case equivalents, so it's nice and simple
1744 // and the mask-compare will determine definitely whether we have
1745 // a match at this character position.
1746 pos->mask = char_mask;
1747 pos->value = c;
1748 pos->determines_perfectly = true;
1749 } else {
1750 uint32_t common_bits = char_mask;
1751 uint32_t bits = chars[0];
1752 for (intptr_t j = 1; j < length; j++) {
1753 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1754 common_bits ^= differing_bits;
1755 bits &= common_bits;
1756 }
1757 // If length is 2 and common bits has only one zero in it then
1758 // our mask and compare instruction will determine definitely
1759 // whether we have a match at this character position. Otherwise
1760 // it can only be an approximate check.
1761 uint32_t one_zero = (common_bits | ~char_mask);
1762 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1763 pos->determines_perfectly = true;
1764 }
1765 pos->mask = common_bits;
1766 pos->value = bits;
1767 }
1768 } else {
1769 // Don't ignore case. Nice simple case where the mask-compare will
1770 // determine definitely whether we have a match at this character
1771 // position.
1772 pos->mask = char_mask;
1773 pos->value = c;
1774 pos->determines_perfectly = true;
1775 }
1776 characters_filled_in++;
1777 ASSERT(characters_filled_in <= details->characters());
1778 if (characters_filled_in == details->characters()) {
1779 return;
1780 }
1781 }
1782 } else {
1783 QuickCheckDetails::Position* pos =
1784 details->positions(index: characters_filled_in);
1785 RegExpCharacterClass* tree = elm.char_class();
1786 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
1787 ASSERT(!ranges->is_empty());
1788 if (!CharacterRange::IsCanonical(ranges)) {
1789 CharacterRange::Canonicalize(ranges);
1790 }
1791 if (tree->is_negated()) {
1792 // A quick check uses multi-character mask and compare. There is no
1793 // useful way to incorporate a negative char class into this scheme
1794 // so we just conservatively create a mask and value that will always
1795 // succeed.
1796 pos->mask = 0;
1797 pos->value = 0;
1798 } else {
1799 intptr_t first_range = 0;
1800 while (ranges->At(index: first_range).from() > char_mask) {
1801 first_range++;
1802 if (first_range == ranges->length()) {
1803 details->set_cannot_match();
1804 pos->determines_perfectly = false;
1805 return;
1806 }
1807 }
1808 CharacterRange range = ranges->At(index: first_range);
1809 uint16_t from = range.from();
1810 uint16_t to = range.to();
1811 if (to > char_mask) {
1812 to = char_mask;
1813 }
1814 uint32_t differing_bits = (from ^ to);
1815 // A mask and compare is only perfect if the differing bits form a
1816 // number like 00011111 with one single block of trailing 1s.
1817 if ((differing_bits & (differing_bits + 1)) == 0 &&
1818 from + differing_bits == to) {
1819 pos->determines_perfectly = true;
1820 }
1821 uint32_t common_bits = ~SmearBitsRight(v: differing_bits);
1822 uint32_t bits = (from & common_bits);
1823 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
1824 CharacterRange range = ranges->At(index: i);
1825 uint16_t from = range.from();
1826 uint16_t to = range.to();
1827 if (from > char_mask) continue;
1828 if (to > char_mask) to = char_mask;
1829 // Here we are combining more ranges into the mask and compare
1830 // value. With each new range the mask becomes more sparse and
1831 // so the chances of a false positive rise. A character class
1832 // with multiple ranges is assumed never to be equivalent to a
1833 // mask and compare operation.
1834 pos->determines_perfectly = false;
1835 uint32_t new_common_bits = (from ^ to);
1836 new_common_bits = ~SmearBitsRight(v: new_common_bits);
1837 common_bits &= new_common_bits;
1838 bits &= new_common_bits;
1839 uint32_t differing_bits = (from & common_bits) ^ bits;
1840 common_bits ^= differing_bits;
1841 bits &= common_bits;
1842 }
1843 pos->mask = common_bits;
1844 pos->value = bits;
1845 }
1846 characters_filled_in++;
1847 ASSERT(characters_filled_in <= details->characters());
1848 if (characters_filled_in == details->characters()) {
1849 return;
1850 }
1851 }
1852 }
1853 ASSERT(characters_filled_in != details->characters());
1854 if (!details->cannot_match()) {
1855 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1856 not_at_start: true);
1857 }
1858}
1859
1860void QuickCheckDetails::Clear() {
1861 for (int i = 0; i < characters_; i++) {
1862 positions_[i].mask = 0;
1863 positions_[i].value = 0;
1864 positions_[i].determines_perfectly = false;
1865 }
1866 characters_ = 0;
1867}
1868
1869void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
1870 if (by >= characters_ || by < 0) {
1871 // check that by < 0 => characters_ == 0
1872 ASSERT(by >= 0 || characters_ == 0);
1873 Clear();
1874 return;
1875 }
1876 for (intptr_t i = 0; i < characters_ - by; i++) {
1877 positions_[i] = positions_[by + i];
1878 }
1879 for (intptr_t i = characters_ - by; i < characters_; i++) {
1880 positions_[i].mask = 0;
1881 positions_[i].value = 0;
1882 positions_[i].determines_perfectly = false;
1883 }
1884 characters_ -= by;
1885 // We could change mask_ and value_ here but we would never advance unless
1886 // they had already been used in a check and they won't be used again because
1887 // it would gain us nothing. So there's no point.
1888}
1889
1890void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
1891 ASSERT(characters_ == other->characters_);
1892 if (other->cannot_match_) {
1893 return;
1894 }
1895 if (cannot_match_) {
1896 *this = *other;
1897 return;
1898 }
1899 for (intptr_t i = from_index; i < characters_; i++) {
1900 QuickCheckDetails::Position* pos = positions(index: i);
1901 QuickCheckDetails::Position* other_pos = other->positions(index: i);
1902 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1903 !other_pos->determines_perfectly) {
1904 // Our mask-compare operation will be approximate unless we have the
1905 // exact same operation on both sides of the alternation.
1906 pos->determines_perfectly = false;
1907 }
1908 pos->mask &= other_pos->mask;
1909 pos->value &= pos->mask;
1910 other_pos->value &= pos->mask;
1911 uint16_t differing_bits = (pos->value ^ other_pos->value);
1912 pos->mask &= ~differing_bits;
1913 pos->value &= pos->mask;
1914 }
1915}
1916
1917class VisitMarker : public ValueObject {
1918 public:
1919 explicit VisitMarker(NodeInfo* info) : info_(info) {
1920 ASSERT(!info->visited);
1921 info->visited = true;
1922 }
1923 ~VisitMarker() { info_->visited = false; }
1924
1925 private:
1926 NodeInfo* info_;
1927};
1928
1929RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth) {
1930 if (info()->replacement_calculated) return replacement();
1931 if (depth < 0) return this;
1932 ASSERT(!info()->visited);
1933 VisitMarker marker(info());
1934 return FilterSuccessor(depth: depth - 1);
1935}
1936
1937RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth) {
1938 RegExpNode* next = on_success_->FilterOneByte(depth: depth - 1);
1939 if (next == nullptr) return set_replacement(nullptr);
1940 on_success_ = next;
1941 return set_replacement(this);
1942}
1943
1944// We need to check for the following characters: 0x39c 0x3bc 0x178.
1945static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
1946 // TODO(dcarney): this could be a lot more efficient.
1947 return range.Contains(i: 0x39c) || range.Contains(i: 0x3bc) ||
1948 range.Contains(i: 0x178);
1949}
1950
1951static bool RangesContainLatin1Equivalents(
1952 ZoneGrowableArray<CharacterRange>* ranges) {
1953 for (intptr_t i = 0; i < ranges->length(); i++) {
1954 // TODO(dcarney): this could be a lot more efficient.
1955 if (RangeContainsLatin1Equivalents(range: ranges->At(index: i))) return true;
1956 }
1957 return false;
1958}
1959
1960static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
1961 ASSERT(c > Symbols::kMaxOneCharCodeSymbol);
1962 switch (c) {
1963 // This are equivalent characters in unicode.
1964 case 0x39c:
1965 case 0x3bc:
1966 return 0xb5;
1967 // This is an uppercase of a Latin-1 character
1968 // outside of Latin-1.
1969 case 0x178:
1970 return 0xff;
1971 }
1972 return 0;
1973}
1974
1975RegExpNode* TextNode::FilterOneByte(intptr_t depth) {
1976 if (info()->replacement_calculated) return replacement();
1977 if (depth < 0) return this;
1978 ASSERT(!info()->visited);
1979 VisitMarker marker(info());
1980 intptr_t element_count = elms_->length();
1981 for (intptr_t i = 0; i < element_count; i++) {
1982 TextElement elm = elms_->At(index: i);
1983 if (elm.text_type() == TextElement::ATOM) {
1984 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1985 for (intptr_t j = 0; j < quarks->length(); j++) {
1986 uint16_t c = quarks->At(index: j);
1987 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
1988 if (!elm.atom()->ignore_case()) return set_replacement(nullptr);
1989 // Here, we need to check for characters whose upper and lower cases
1990 // are outside the Latin-1 range.
1991 uint16_t converted = ConvertNonLatin1ToLatin1(c);
1992 // Character is outside Latin-1 completely
1993 if (converted == 0) return set_replacement(nullptr);
1994 // Convert quark to Latin-1 in place.
1995 (*quarks)[0] = converted;
1996 }
1997 } else {
1998 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
1999 RegExpCharacterClass* cc = elm.char_class();
2000 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2001 if (!CharacterRange::IsCanonical(ranges)) {
2002 CharacterRange::Canonicalize(ranges);
2003 }
2004 // Now they are in order so we only need to look at the first.
2005 intptr_t range_count = ranges->length();
2006 if (cc->is_negated()) {
2007 if (range_count != 0 && ranges->At(index: 0).from() == 0 &&
2008 ranges->At(index: 0).to() >= Symbols::kMaxOneCharCodeSymbol) {
2009 // This will be handled in a later filter.
2010 if (cc->flags().IgnoreCase() &&
2011 RangesContainLatin1Equivalents(ranges)) {
2012 continue;
2013 }
2014 return set_replacement(nullptr);
2015 }
2016 } else {
2017 if (range_count == 0 ||
2018 ranges->At(index: 0).from() > Symbols::kMaxOneCharCodeSymbol) {
2019 // This will be handled in a later filter.
2020 if (cc->flags().IgnoreCase() &&
2021 RangesContainLatin1Equivalents(ranges))
2022 continue;
2023 return set_replacement(nullptr);
2024 }
2025 }
2026 }
2027 }
2028 return FilterSuccessor(depth: depth - 1);
2029}
2030
2031RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth) {
2032 if (info()->replacement_calculated) return replacement();
2033 if (depth < 0) return this;
2034 if (info()->visited) return this;
2035 {
2036 VisitMarker marker(info());
2037
2038 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth: depth - 1);
2039 // If we can't continue after the loop then there is no sense in doing the
2040 // loop.
2041 if (continue_replacement == nullptr) return set_replacement(nullptr);
2042 }
2043
2044 return ChoiceNode::FilterOneByte(depth: depth - 1);
2045}
2046
2047RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) {
2048 if (info()->replacement_calculated) return replacement();
2049 if (depth < 0) return this;
2050 if (info()->visited) return this;
2051 VisitMarker marker(info());
2052 intptr_t choice_count = alternatives_->length();
2053
2054 for (intptr_t i = 0; i < choice_count; i++) {
2055 GuardedAlternative alternative = alternatives_->At(index: i);
2056 if (alternative.guards() != nullptr &&
2057 alternative.guards()->length() != 0) {
2058 set_replacement(this);
2059 return this;
2060 }
2061 }
2062
2063 intptr_t surviving = 0;
2064 RegExpNode* survivor = nullptr;
2065 for (intptr_t i = 0; i < choice_count; i++) {
2066 GuardedAlternative alternative = alternatives_->At(index: i);
2067 RegExpNode* replacement = alternative.node()->FilterOneByte(depth: depth - 1);
2068 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2069 if (replacement != nullptr) {
2070 (*alternatives_)[i].set_node(replacement);
2071 surviving++;
2072 survivor = replacement;
2073 }
2074 }
2075 if (surviving < 2) return set_replacement(survivor);
2076
2077 set_replacement(this);
2078 if (surviving == choice_count) {
2079 return this;
2080 }
2081 // Only some of the nodes survived the filtering. We need to rebuild the
2082 // alternatives list.
2083 ZoneGrowableArray<GuardedAlternative>* new_alternatives =
2084 new (Z) ZoneGrowableArray<GuardedAlternative>(surviving);
2085 for (intptr_t i = 0; i < choice_count; i++) {
2086 RegExpNode* replacement =
2087 (*alternatives_)[i].node()->FilterOneByte(depth: depth - 1);
2088 if (replacement != nullptr) {
2089 (*alternatives_)[i].set_node(replacement);
2090 new_alternatives->Add(value: (*alternatives_)[i]);
2091 }
2092 }
2093 alternatives_ = new_alternatives;
2094 return this;
2095}
2096
2097RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth) {
2098 if (info()->replacement_calculated) return replacement();
2099 if (depth < 0) return this;
2100 if (info()->visited) return this;
2101 VisitMarker marker(info());
2102 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2103 // afterwards.
2104 RegExpNode* node = (*alternatives_)[1].node();
2105 RegExpNode* replacement = node->FilterOneByte(depth: depth - 1);
2106 if (replacement == nullptr) return set_replacement(nullptr);
2107 (*alternatives_)[1].set_node(replacement);
2108
2109 RegExpNode* neg_node = (*alternatives_)[0].node();
2110 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth: depth - 1);
2111 // If the negative lookahead is always going to fail then
2112 // we don't need to check it.
2113 if (neg_replacement == nullptr) return set_replacement(replacement);
2114 (*alternatives_)[0].set_node(neg_replacement);
2115 return set_replacement(this);
2116}
2117
2118void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2119 RegExpCompiler* compiler,
2120 intptr_t characters_filled_in,
2121 bool not_at_start) {
2122 if (body_can_be_zero_length_ || info()->visited) return;
2123 VisitMarker marker(info());
2124 return ChoiceNode::GetQuickCheckDetails(details, compiler,
2125 characters_filled_in, not_at_start);
2126}
2127
2128void LoopChoiceNode::FillInBMInfo(intptr_t offset,
2129 intptr_t budget,
2130 BoyerMooreLookahead* bm,
2131 bool not_at_start) {
2132 if (body_can_be_zero_length_ || budget <= 0) {
2133 bm->SetRest(offset);
2134 SaveBMInfo(bm, not_at_start, offset);
2135 return;
2136 }
2137 ChoiceNode::FillInBMInfo(offset, budget: budget - 1, bm, not_at_start);
2138 SaveBMInfo(bm, not_at_start, offset);
2139}
2140
2141void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2142 RegExpCompiler* compiler,
2143 intptr_t characters_filled_in,
2144 bool not_at_start) {
2145 not_at_start = (not_at_start || not_at_start_);
2146 intptr_t choice_count = alternatives_->length();
2147 ASSERT(choice_count > 0);
2148 (*alternatives_)[0].node()->GetQuickCheckDetails(
2149 details, compiler, characters_filled_in, not_at_start);
2150 for (intptr_t i = 1; i < choice_count; i++) {
2151 QuickCheckDetails new_details(details->characters());
2152 RegExpNode* node = (*alternatives_)[i].node();
2153 node->GetQuickCheckDetails(details: &new_details, compiler, characters_filled_in,
2154 not_at_start);
2155 // Here we merge the quick match details of the two branches.
2156 details->Merge(other: &new_details, from_index: characters_filled_in);
2157 }
2158}
2159
2160// Check for [0-9A-Z_a-z].
2161static void EmitWordCheck(RegExpMacroAssembler* assembler,
2162 BlockLabel* word,
2163 BlockLabel* non_word,
2164 bool fall_through_on_word) {
2165 if (assembler->CheckSpecialCharacterClass(
2166 type: fall_through_on_word ? 'w' : 'W',
2167 on_no_match: fall_through_on_word ? non_word : word)) {
2168 // Optimized implementation available.
2169 return;
2170 }
2171 assembler->CheckCharacterGT(limit: 'z', on_greater: non_word);
2172 assembler->CheckCharacterLT(limit: '0', on_less: non_word);
2173 assembler->CheckCharacterGT(limit: 'a' - 1, on_greater: word);
2174 assembler->CheckCharacterLT(limit: '9' + 1, on_less: word);
2175 assembler->CheckCharacterLT(limit: 'A', on_less: non_word);
2176 assembler->CheckCharacterLT(limit: 'Z' + 1, on_less: word);
2177 if (fall_through_on_word) {
2178 assembler->CheckNotCharacter(c: '_', on_not_equal: non_word);
2179 } else {
2180 assembler->CheckCharacter(c: '_', on_equal: word);
2181 }
2182}
2183
2184// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2185// that matches newline or the start of input).
2186static void EmitHat(RegExpCompiler* compiler,
2187 RegExpNode* on_success,
2188 Trace* trace) {
2189 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2190 // We will be loading the previous character into the current character
2191 // register.
2192 Trace new_trace(*trace);
2193 new_trace.InvalidateCurrentCharacter();
2194
2195 BlockLabel ok;
2196 if (new_trace.cp_offset() == 0) {
2197 // The start of input counts as a newline in this context, so skip to
2198 // ok if we are at the start.
2199 assembler->CheckAtStart(on_at_start: &ok);
2200 }
2201 // We already checked that we are not at the start of input so it must be
2202 // OK to load the previous character.
2203 assembler->LoadCurrentCharacter(cp_offset: new_trace.cp_offset() - 1,
2204 on_end_of_input: new_trace.backtrack(), check_bounds: false);
2205 if (!assembler->CheckSpecialCharacterClass(type: 'n', on_no_match: new_trace.backtrack())) {
2206 // Newline means \n, \r, 0x2028 or 0x2029.
2207 if (!compiler->one_byte()) {
2208 assembler->CheckCharacterAfterAnd(c: 0x2028, and_with: 0xfffe, on_equal: &ok);
2209 }
2210 assembler->CheckCharacter(c: '\n', on_equal: &ok);
2211 assembler->CheckNotCharacter(c: '\r', on_not_equal: new_trace.backtrack());
2212 }
2213 assembler->BindBlock(label: &ok);
2214 on_success->Emit(compiler, trace: &new_trace);
2215}
2216
2217// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2218void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2219 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2220 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2221 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2222 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2223 if (lookahead == nullptr) {
2224 intptr_t eats_at_least =
2225 Utils::Minimum(x: kMaxLookaheadForBoyerMoore,
2226 y: EatsAtLeast(still_to_find: kMaxLookaheadForBoyerMoore, budget: kRecursionBudget,
2227 not_at_start));
2228 if (eats_at_least >= 1) {
2229 BoyerMooreLookahead* bm =
2230 new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
2231 FillInBMInfo(offset: 0, budget: kRecursionBudget, bm, not_at_start);
2232 if (bm->at(i: 0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2233 if (bm->at(i: 0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2234 }
2235 } else {
2236 if (lookahead->at(i: 0)->is_non_word())
2237 next_is_word_character = Trace::FALSE_VALUE;
2238 if (lookahead->at(i: 0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2239 }
2240 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2241 if (next_is_word_character == Trace::UNKNOWN) {
2242 BlockLabel before_non_word;
2243 BlockLabel before_word;
2244 if (trace->characters_preloaded() != 1) {
2245 assembler->LoadCurrentCharacter(cp_offset: trace->cp_offset(), on_end_of_input: &before_non_word);
2246 }
2247 // Fall through on non-word.
2248 EmitWordCheck(assembler, word: &before_word, non_word: &before_non_word, fall_through_on_word: false);
2249 // Next character is not a word character.
2250 assembler->BindBlock(label: &before_non_word);
2251 BlockLabel ok;
2252 // Backtrack on \B (non-boundary check) if previous is a word,
2253 // since we know next *is not* a word and this would be a boundary.
2254 BacktrackIfPrevious(compiler, trace, backtrack_if_previous: at_boundary ? kIsNonWord : kIsWord);
2255
2256 if (!assembler->IsClosed()) {
2257 assembler->GoTo(to: &ok);
2258 }
2259
2260 assembler->BindBlock(label: &before_word);
2261 BacktrackIfPrevious(compiler, trace, backtrack_if_previous: at_boundary ? kIsWord : kIsNonWord);
2262 assembler->BindBlock(label: &ok);
2263 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2264 BacktrackIfPrevious(compiler, trace, backtrack_if_previous: at_boundary ? kIsWord : kIsNonWord);
2265 } else {
2266 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2267 BacktrackIfPrevious(compiler, trace, backtrack_if_previous: at_boundary ? kIsNonWord : kIsWord);
2268 }
2269}
2270
2271void AssertionNode::BacktrackIfPrevious(
2272 RegExpCompiler* compiler,
2273 Trace* trace,
2274 AssertionNode::IfPrevious backtrack_if_previous) {
2275 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2276 Trace new_trace(*trace);
2277 new_trace.InvalidateCurrentCharacter();
2278
2279 BlockLabel fall_through, dummy;
2280
2281 BlockLabel* non_word = backtrack_if_previous == kIsNonWord
2282 ? new_trace.backtrack()
2283 : &fall_through;
2284 BlockLabel* word = backtrack_if_previous == kIsNonWord
2285 ? &fall_through
2286 : new_trace.backtrack();
2287
2288 if (new_trace.cp_offset() == 0) {
2289 // The start of input counts as a non-word character, so the question is
2290 // decided if we are at the start.
2291 assembler->CheckAtStart(on_at_start: non_word);
2292 }
2293 // We already checked that we are not at the start of input so it must be
2294 // OK to load the previous character.
2295 assembler->LoadCurrentCharacter(cp_offset: new_trace.cp_offset() - 1, on_end_of_input: &dummy, check_bounds: false);
2296 EmitWordCheck(assembler, word, non_word, fall_through_on_word: backtrack_if_previous == kIsNonWord);
2297
2298 assembler->BindBlock(label: &fall_through);
2299 on_success()->Emit(compiler, trace: &new_trace);
2300}
2301
2302void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2303 RegExpCompiler* compiler,
2304 intptr_t filled_in,
2305 bool not_at_start) {
2306 if (assertion_type_ == AT_START && not_at_start) {
2307 details->set_cannot_match();
2308 return;
2309 }
2310 return on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in: filled_in,
2311 not_at_start);
2312}
2313
2314void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2315 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2316 switch (assertion_type_) {
2317 case AT_END: {
2318 BlockLabel ok;
2319 assembler->CheckPosition(cp_offset: trace->cp_offset(), on_outside_input: &ok);
2320 assembler->GoTo(to: trace->backtrack());
2321 assembler->BindBlock(label: &ok);
2322 break;
2323 }
2324 case AT_START: {
2325 if (trace->at_start() == Trace::FALSE_VALUE) {
2326 assembler->GoTo(to: trace->backtrack());
2327 return;
2328 }
2329 if (trace->at_start() == Trace::UNKNOWN) {
2330 assembler->CheckNotAtStart(cp_offset: trace->cp_offset(), on_not_at_start: trace->backtrack());
2331 Trace at_start_trace = *trace;
2332 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2333 on_success()->Emit(compiler, trace: &at_start_trace);
2334 return;
2335 }
2336 } break;
2337 case AFTER_NEWLINE:
2338 EmitHat(compiler, on_success: on_success(), trace);
2339 return;
2340 case AT_BOUNDARY:
2341 case AT_NON_BOUNDARY: {
2342 EmitBoundaryCheck(compiler, trace);
2343 return;
2344 }
2345 }
2346 on_success()->Emit(compiler, trace);
2347}
2348
2349static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2350 if (quick_check == nullptr) return false;
2351 if (offset >= quick_check->characters()) return false;
2352 return quick_check->positions(index: offset)->determines_perfectly;
2353}
2354
2355static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2356 if (index > *checked_up_to) {
2357 *checked_up_to = index;
2358 }
2359}
2360
2361// We call this repeatedly to generate code for each pass over the text node.
2362// The passes are in increasing order of difficulty because we hope one
2363// of the first passes will fail in which case we are saved the work of the
2364// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2365// we will check the '%' in the first pass, the case independent 'a' in the
2366// second pass and the character class in the last pass.
2367//
2368// The passes are done from right to left, so for example to test for /bar/
2369// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2370// and then a 'b' with offset 0. This means we can avoid the end-of-input
2371// bounds check most of the time. In the example we only need to check for
2372// end-of-input when loading the putative 'r'.
2373//
2374// A slight complication involves the fact that the first character may already
2375// be fetched into a register by the previous node. In this case we want to
2376// do the test for that character first. We do this in separate passes. The
2377// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2378// pass has been performed then subsequent passes will have true in
2379// first_element_checked to indicate that character does not need to be
2380// checked again.
2381//
2382// In addition to all this we are passed a Trace, which can
2383// contain an AlternativeGeneration object. In this AlternativeGeneration
2384// object we can see details of any quick check that was already passed in
2385// order to get to the code we are now generating. The quick check can involve
2386// loading characters, which means we do not need to recheck the bounds
2387// up to the limit the quick check already checked. In addition the quick
2388// check can have involved a mask and compare operation which may simplify
2389// or obviate the need for further checks at some character positions.
2390void TextNode::TextEmitPass(RegExpCompiler* compiler,
2391 TextEmitPassType pass,
2392 bool preloaded,
2393 Trace* trace,
2394 bool first_element_checked,
2395 intptr_t* checked_up_to) {
2396 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2397 bool one_byte = compiler->one_byte();
2398 BlockLabel* backtrack = trace->backtrack();
2399 QuickCheckDetails* quick_check = trace->quick_check_performed();
2400 intptr_t element_count = elms_->length();
2401 intptr_t backward_offset = read_backward() ? -Length() : 0;
2402 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2403 TextElement elm = elms_->At(index: i);
2404 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2405 if (elm.text_type() == TextElement::ATOM) {
2406 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2407 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2408 if (SkipPass(pass, ignore_case: elm.atom()->ignore_case())) continue;
2409 if (first_element_checked && i == 0 && j == 0) continue;
2410 if (DeterminedAlready(quick_check, offset: elm.cp_offset() + j)) continue;
2411 EmitCharacterFunction* emit_function = nullptr;
2412 uint16_t quark = quarks->At(index: j);
2413 if (elm.atom()->ignore_case()) {
2414 // Everywhere else we assume that a non-Latin-1 character cannot match
2415 // a Latin-1 character. Avoid the cases where this is assumption is
2416 // invalid by using the Latin1 equivalent instead.
2417 quark = Latin1::TryConvertToLatin1(c: quark);
2418 }
2419 switch (pass) {
2420 case NON_LATIN1_MATCH:
2421 ASSERT(one_byte);
2422 if (quark > Symbols::kMaxOneCharCodeSymbol) {
2423 assembler->GoTo(to: backtrack);
2424 return;
2425 }
2426 break;
2427 case NON_LETTER_CHARACTER_MATCH:
2428 emit_function = &EmitAtomNonLetter;
2429 break;
2430 case SIMPLE_CHARACTER_MATCH:
2431 emit_function = &EmitSimpleCharacter;
2432 break;
2433 case CASE_CHARACTER_MATCH:
2434 emit_function = &EmitAtomLetter;
2435 break;
2436 default:
2437 break;
2438 }
2439 if (emit_function != nullptr) {
2440 const bool bounds_check =
2441 *checked_up_to < (cp_offset + j) || read_backward();
2442 bool bound_checked =
2443 emit_function(Z, compiler, quarks->At(index: j), backtrack,
2444 cp_offset + j, bounds_check, preloaded);
2445 if (bound_checked) UpdateBoundsCheck(index: cp_offset + j, checked_up_to);
2446 }
2447 }
2448 } else {
2449 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2450 if (pass == CHARACTER_CLASS_MATCH) {
2451 if (first_element_checked && i == 0) continue;
2452 if (DeterminedAlready(quick_check, offset: elm.cp_offset())) continue;
2453 RegExpCharacterClass* cc = elm.char_class();
2454 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2455 EmitCharClass(macro_assembler: assembler, cc, one_byte, on_failure: backtrack, cp_offset,
2456 check_offset: bounds_check, preloaded, Z);
2457 UpdateBoundsCheck(index: cp_offset, checked_up_to);
2458 }
2459 }
2460 }
2461}
2462
2463intptr_t TextNode::Length() {
2464 TextElement elm = elms_->Last();
2465 ASSERT(elm.cp_offset() >= 0);
2466 return elm.cp_offset() + elm.length();
2467}
2468
2469bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
2470 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
2471 if (ignore_case) {
2472 return pass == SIMPLE_CHARACTER_MATCH;
2473 } else {
2474 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2475 }
2476}
2477
2478TextNode* TextNode::CreateForCharacterRanges(
2479 ZoneGrowableArray<CharacterRange>* ranges,
2480 bool read_backward,
2481 RegExpNode* on_success,
2482 RegExpFlags flags) {
2483 ASSERT(ranges != nullptr);
2484 ZoneGrowableArray<TextElement>* elms = new ZoneGrowableArray<TextElement>(1);
2485 elms->Add(value: TextElement::CharClass(char_class: new RegExpCharacterClass(ranges, flags)));
2486 return new TextNode(elms, read_backward, on_success);
2487}
2488
2489TextNode* TextNode::CreateForSurrogatePair(CharacterRange lead,
2490 CharacterRange trail,
2491 bool read_backward,
2492 RegExpNode* on_success,
2493 RegExpFlags flags) {
2494 auto lead_ranges = CharacterRange::List(zone: on_success->zone(), range: lead);
2495 auto trail_ranges = CharacterRange::List(zone: on_success->zone(), range: trail);
2496 auto elms = new ZoneGrowableArray<TextElement>(2);
2497
2498 elms->Add(
2499 value: TextElement::CharClass(char_class: new RegExpCharacterClass(lead_ranges, flags)));
2500 elms->Add(
2501 value: TextElement::CharClass(char_class: new RegExpCharacterClass(trail_ranges, flags)));
2502
2503 return new TextNode(elms, read_backward, on_success);
2504}
2505
2506// This generates the code to match a text node. A text node can contain
2507// straight character sequences (possibly to be matched in a case-independent
2508// way) and character classes. For efficiency we do not do this in a single
2509// pass from left to right. Instead we pass over the text node several times,
2510// emitting code for some character positions every time. See the comment on
2511// TextEmitPass for details.
2512void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2513 LimitResult limit_result = LimitVersions(compiler, trace);
2514 if (limit_result == DONE) return;
2515 ASSERT(limit_result == CONTINUE);
2516
2517 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2518 compiler->SetRegExpTooBig();
2519 return;
2520 }
2521
2522 if (compiler->one_byte()) {
2523 intptr_t dummy = 0;
2524 TextEmitPass(compiler, pass: NON_LATIN1_MATCH, preloaded: false, trace, first_element_checked: false, checked_up_to: &dummy);
2525 }
2526
2527 bool first_elt_done = false;
2528 intptr_t bound_checked_to = trace->cp_offset() - 1;
2529 bound_checked_to += trace->bound_checked_up_to();
2530
2531 // If a character is preloaded into the current character register then
2532 // check that now.
2533 if (trace->characters_preloaded() == 1) {
2534 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2535 TextEmitPass(compiler, pass: static_cast<TextEmitPassType>(pass), preloaded: true, trace,
2536 first_element_checked: false, checked_up_to: &bound_checked_to);
2537 }
2538 first_elt_done = true;
2539 }
2540
2541 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2542 TextEmitPass(compiler, pass: static_cast<TextEmitPassType>(pass), preloaded: false, trace,
2543 first_element_checked: first_elt_done, checked_up_to: &bound_checked_to);
2544 }
2545
2546 Trace successor_trace(*trace);
2547 // If we advance backward, we may end up at the start.
2548 successor_trace.AdvanceCurrentPositionInTrace(
2549 by: read_backward() ? -Length() : Length(), compiler);
2550 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2551 : Trace::FALSE_VALUE);
2552 RecursionCheck rc(compiler);
2553 on_success()->Emit(compiler, trace: &successor_trace);
2554}
2555
2556void Trace::InvalidateCurrentCharacter() {
2557 characters_preloaded_ = 0;
2558}
2559
2560void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
2561 RegExpCompiler* compiler) {
2562 // We don't have an instruction for shifting the current character register
2563 // down or for using a shifted value for anything so lets just forget that
2564 // we preloaded any characters into it.
2565 characters_preloaded_ = 0;
2566 // Adjust the offsets of the quick check performed information. This
2567 // information is used to find out what we already determined about the
2568 // characters by means of mask and compare.
2569 quick_check_performed_.Advance(by, one_byte: compiler->one_byte());
2570 cp_offset_ += by;
2571 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2572 compiler->SetRegExpTooBig();
2573 cp_offset_ = 0;
2574 }
2575 bound_checked_up_to_ =
2576 Utils::Maximum(x: static_cast<intptr_t>(0), y: bound_checked_up_to_ - by);
2577}
2578
2579void TextNode::MakeCaseIndependent(bool is_one_byte) {
2580 intptr_t element_count = elms_->length();
2581 for (intptr_t i = 0; i < element_count; i++) {
2582 TextElement elm = elms_->At(index: i);
2583 if (elm.text_type() == TextElement::CHAR_CLASS) {
2584 RegExpCharacterClass* cc = elm.char_class();
2585 bool case_equivalents_already_added =
2586 cc->flags().NeedsUnicodeCaseEquivalents();
2587 if (cc->flags().IgnoreCase() && !case_equivalents_already_added) {
2588 // None of the standard character classes is different in the case
2589 // independent case and it slows us down if we don't know that.
2590 if (cc->is_standard()) continue;
2591 CharacterRange::AddCaseEquivalents(ranges: cc->ranges(), is_one_byte, Z);
2592 }
2593 }
2594 }
2595}
2596
2597intptr_t TextNode::GreedyLoopTextLength() {
2598 TextElement elm = elms_->At(index: elms_->length() - 1);
2599 return elm.cp_offset() + elm.length();
2600}
2601
2602RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2603 RegExpCompiler* compiler) {
2604 if (read_backward()) return nullptr;
2605 if (elms_->length() != 1) return nullptr;
2606 TextElement elm = elms_->At(index: 0);
2607 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2608 RegExpCharacterClass* node = elm.char_class();
2609 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
2610 if (!CharacterRange::IsCanonical(ranges)) {
2611 CharacterRange::Canonicalize(ranges);
2612 }
2613 if (node->is_negated()) {
2614 return ranges->length() == 0 ? on_success() : nullptr;
2615 }
2616 if (ranges->length() != 1) return nullptr;
2617 uint32_t max_char;
2618 if (compiler->one_byte()) {
2619 max_char = Symbols::kMaxOneCharCodeSymbol;
2620 } else {
2621 max_char = Utf16::kMaxCodeUnit;
2622 }
2623 return ranges->At(index: 0).IsEverything(max: max_char) ? on_success() : nullptr;
2624}
2625
2626// Finds the fixed match length of a sequence of nodes that goes from
2627// this alternative and back to this choice node. If there are variable
2628// length nodes or other complications in the way then return a sentinel
2629// value indicating that a greedy loop cannot be constructed.
2630intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(
2631 const GuardedAlternative* alternative) {
2632 intptr_t length = 0;
2633 RegExpNode* node = alternative->node();
2634 // Later we will generate code for all these text nodes using recursion
2635 // so we have to limit the max number.
2636 intptr_t recursion_depth = 0;
2637 while (node != this) {
2638 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2639 return kNodeIsTooComplexForGreedyLoops;
2640 }
2641 intptr_t node_length = node->GreedyLoopTextLength();
2642 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2643 return kNodeIsTooComplexForGreedyLoops;
2644 }
2645 length += node_length;
2646 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2647 node = seq_node->on_success();
2648 }
2649 return read_backward() ? -length : length;
2650}
2651
2652void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2653 ASSERT(loop_node_ == nullptr);
2654 AddAlternative(node: alt);
2655 loop_node_ = alt.node();
2656}
2657
2658void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2659 ASSERT(continue_node_ == nullptr);
2660 AddAlternative(node: alt);
2661 continue_node_ = alt.node();
2662}
2663
2664void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2665 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2666 if (trace->stop_node() == this) {
2667 // Back edge of greedy optimized loop node graph.
2668 intptr_t text_length =
2669 GreedyLoopTextLengthForAlternative(alternative: &alternatives_->At(index: 0));
2670 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2671 // Update the counter-based backtracking info on the stack. This is an
2672 // optimization for greedy loops (see below).
2673 ASSERT(trace->cp_offset() == text_length);
2674 macro_assembler->AdvanceCurrentPosition(by: text_length);
2675 macro_assembler->GoTo(to: trace->loop_label());
2676 return;
2677 }
2678 ASSERT(trace->stop_node() == nullptr);
2679 if (!trace->is_trivial()) {
2680 trace->Flush(compiler, successor: this);
2681 return;
2682 }
2683 ChoiceNode::Emit(compiler, trace);
2684}
2685
2686intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2687 intptr_t eats_at_least) {
2688 intptr_t preload_characters =
2689 Utils::Minimum(x: static_cast<intptr_t>(4), y: eats_at_least);
2690 if (compiler->one_byte()) {
2691#if !defined(DART_COMPRESSED_POINTERS) && !defined(TARGET_ARCH_RISCV32)
2692 if (preload_characters > 4) preload_characters = 4;
2693 // We can't preload 3 characters because there is no machine instruction
2694 // to do that. We can't just load 4 because we could be reading
2695 // beyond the end of the string, which could cause a memory fault.
2696 if (preload_characters == 3) preload_characters = 2;
2697#else
2698 // Ensure LoadCodeUnitsInstr can always produce a Smi. See
2699 // https://github.com/dart-lang/sdk/issues/29951
2700 if (preload_characters > 2) preload_characters = 2;
2701#endif
2702 } else {
2703#if !defined(DART_COMPRESSED_POINTERS) && !defined(TARGET_ARCH_RISCV32)
2704 if (preload_characters > 2) preload_characters = 2;
2705#else
2706 // Ensure LoadCodeUnitsInstr can always produce a Smi. See
2707 // https://github.com/dart-lang/sdk/issues/29951
2708 if (preload_characters > 1) preload_characters = 1;
2709#endif
2710 }
2711 if (!compiler->macro_assembler()->CanReadUnaligned()) {
2712 if (preload_characters > 1) preload_characters = 1;
2713 }
2714 return preload_characters;
2715}
2716
2717// This structure is used when generating the alternatives in a choice node. It
2718// records the way the alternative is being code generated.
2719struct AlternativeGeneration {
2720 AlternativeGeneration()
2721 : possible_success(),
2722 expects_preload(false),
2723 after(),
2724 quick_check_details() {}
2725 BlockLabel possible_success;
2726 bool expects_preload;
2727 BlockLabel after;
2728 QuickCheckDetails quick_check_details;
2729};
2730
2731// Creates a list of AlternativeGenerations. If the list has a reasonable
2732// size then it is on the stack, otherwise the excess is on the heap.
2733class AlternativeGenerationList {
2734 public:
2735 explicit AlternativeGenerationList(intptr_t count) : count_(count) {
2736 ASSERT(count >= 0);
2737 if (count > kAFew) {
2738 excess_alt_gens_.reset(p: new AlternativeGeneration[count - kAFew]);
2739 }
2740 }
2741
2742 AlternativeGeneration* at(intptr_t i) {
2743 ASSERT(0 <= i);
2744 ASSERT(i < count_);
2745 if (i < kAFew) {
2746 return &a_few_alt_gens_[i];
2747 }
2748 return &excess_alt_gens_[i - kAFew];
2749 }
2750
2751 private:
2752 static constexpr intptr_t kAFew = 10;
2753
2754 intptr_t count_;
2755 AlternativeGeneration a_few_alt_gens_[kAFew];
2756 std::unique_ptr<AlternativeGeneration[]> excess_alt_gens_;
2757
2758 DISALLOW_ALLOCATION();
2759 DISALLOW_COPY_AND_ASSIGN(AlternativeGenerationList);
2760};
2761
2762static constexpr int32_t kRangeEndMarker = Utf::kMaxCodePoint + 1;
2763
2764// The '2' variant is inclusive from and exclusive to.
2765// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
2766// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
2767// 0x180E has been removed from Unicode's Zs category and thus
2768// from ECMAScript's WhiteSpace category as of Unicode 6.3.
2769static constexpr int32_t kSpaceRanges[] = {
2770 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
2771 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
2772 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
2773static constexpr intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
2774static constexpr int32_t kWordRanges[] = {
2775 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
2776static constexpr intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
2777static constexpr int32_t kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
2778static constexpr intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
2779static constexpr int32_t kSurrogateRanges[] = {0xd800, 0xe000, kRangeEndMarker};
2780static constexpr intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
2781static constexpr int32_t kLineTerminatorRanges[] = {
2782 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
2783static constexpr intptr_t kLineTerminatorRangeCount =
2784 ARRAY_SIZE(kLineTerminatorRanges);
2785
2786void BoyerMoorePositionInfo::Set(intptr_t character) {
2787 SetInterval(Interval(character, character));
2788}
2789
2790void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2791 s_ = AddRange(containment: s_, ranges: kSpaceRanges, ranges_length: kSpaceRangeCount, new_range: interval);
2792 w_ = AddRange(containment: w_, ranges: kWordRanges, ranges_length: kWordRangeCount, new_range: interval);
2793 d_ = AddRange(containment: d_, ranges: kDigitRanges, ranges_length: kDigitRangeCount, new_range: interval);
2794 surrogate_ =
2795 AddRange(containment: surrogate_, ranges: kSurrogateRanges, ranges_length: kSurrogateRangeCount, new_range: interval);
2796 if (interval.to() - interval.from() >= kMapSize - 1) {
2797 if (map_count_ != kMapSize) {
2798 map_count_ = kMapSize;
2799 for (intptr_t i = 0; i < kMapSize; i++)
2800 (*map_)[i] = true;
2801 }
2802 return;
2803 }
2804 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
2805 intptr_t mod_character = (i & kMask);
2806 if (!map_->At(index: mod_character)) {
2807 map_count_++;
2808 (*map_)[mod_character] = true;
2809 }
2810 if (map_count_ == kMapSize) return;
2811 }
2812}
2813
2814void BoyerMoorePositionInfo::SetAll() {
2815 s_ = w_ = d_ = kLatticeUnknown;
2816 if (map_count_ != kMapSize) {
2817 map_count_ = kMapSize;
2818 for (intptr_t i = 0; i < kMapSize; i++)
2819 (*map_)[i] = true;
2820 }
2821}
2822
2823BoyerMooreLookahead::BoyerMooreLookahead(intptr_t length,
2824 RegExpCompiler* compiler,
2825 Zone* zone)
2826 : length_(length), compiler_(compiler) {
2827 if (compiler->one_byte()) {
2828 max_char_ = Symbols::kMaxOneCharCodeSymbol;
2829 } else {
2830 max_char_ = Utf16::kMaxCodeUnit;
2831 }
2832 bitmaps_ = new (zone) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);
2833 for (intptr_t i = 0; i < length; i++) {
2834 bitmaps_->Add(value: new (zone) BoyerMoorePositionInfo(zone));
2835 }
2836}
2837
2838// Find the longest range of lookahead that has the fewest number of different
2839// characters that can occur at a given position. Since we are optimizing two
2840// different parameters at once this is a tradeoff.
2841bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
2842 intptr_t biggest_points = 0;
2843 // If more than 32 characters out of 128 can occur it is unlikely that we can
2844 // be lucky enough to step forwards much of the time.
2845 const intptr_t kMaxMax = 32;
2846 for (intptr_t max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2847 max_number_of_chars *= 2) {
2848 biggest_points =
2849 FindBestInterval(max_number_of_chars, old_biggest_points: biggest_points, from, to);
2850 }
2851 if (biggest_points == 0) return false;
2852 return true;
2853}
2854
2855// Find the highest-points range between 0 and length_ where the character
2856// information is not too vague. 'Too vague' means that there are more than
2857// max_number_of_chars that can occur at this position. Calculates the number
2858// of points as the product of width-of-the-range and
2859// probability-of-finding-one-of-the-characters, where the probability is
2860// calculated using the frequency distribution of the sample subject string.
2861intptr_t BoyerMooreLookahead::FindBestInterval(intptr_t max_number_of_chars,
2862 intptr_t old_biggest_points,
2863 intptr_t* from,
2864 intptr_t* to) {
2865 intptr_t biggest_points = old_biggest_points;
2866 static constexpr intptr_t kSize = RegExpMacroAssembler::kTableSize;
2867 for (intptr_t i = 0; i < length_;) {
2868 while (i < length_ && Count(map_number: i) > max_number_of_chars)
2869 i++;
2870 if (i == length_) break;
2871 intptr_t remembered_from = i;
2872 bool union_map[kSize];
2873 for (intptr_t j = 0; j < kSize; j++)
2874 union_map[j] = false;
2875 while (i < length_ && Count(map_number: i) <= max_number_of_chars) {
2876 BoyerMoorePositionInfo* map = bitmaps_->At(index: i);
2877 for (intptr_t j = 0; j < kSize; j++)
2878 union_map[j] |= map->at(i: j);
2879 i++;
2880 }
2881 intptr_t frequency = 0;
2882 for (intptr_t j = 0; j < kSize; j++) {
2883 if (union_map[j]) {
2884 // Add 1 to the frequency to give a small per-character boost for
2885 // the cases where our sampling is not good enough and many
2886 // characters have a frequency of zero. This means the frequency
2887 // can theoretically be up to 2*kSize though we treat it mostly as
2888 // a fraction of kSize.
2889 frequency += compiler_->frequency_collator()->Frequency(in_character: j) + 1;
2890 }
2891 }
2892 // We use the probability of skipping times the distance we are skipping to
2893 // judge the effectiveness of this. Actually we have a cut-off: By
2894 // dividing by 2 we switch off the skipping if the probability of skipping
2895 // is less than 50%. This is because the multibyte mask-and-compare
2896 // skipping in quickcheck is more likely to do well on this case.
2897 bool in_quickcheck_range =
2898 ((i - remembered_from < 4) ||
2899 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2900 // Called 'probability' but it is only a rough estimate and can actually
2901 // be outside the 0-kSize range.
2902 intptr_t probability =
2903 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2904 intptr_t points = (i - remembered_from) * probability;
2905 if (points > biggest_points) {
2906 *from = remembered_from;
2907 *to = i - 1;
2908 biggest_points = points;
2909 }
2910 }
2911 return biggest_points;
2912}
2913
2914// Take all the characters that will not prevent a successful match if they
2915// occur in the subject string in the range between min_lookahead and
2916// max_lookahead (inclusive) measured from the current position. If the
2917// character at max_lookahead offset is not one of these characters, then we
2918// can safely skip forwards by the number of characters in the range.
2919intptr_t BoyerMooreLookahead::GetSkipTable(
2920 intptr_t min_lookahead,
2921 intptr_t max_lookahead,
2922 const TypedData& boolean_skip_table) {
2923 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2924
2925 const intptr_t kSkipArrayEntry = 0;
2926 const intptr_t kDontSkipArrayEntry = 1;
2927
2928 for (intptr_t i = 0; i < kSize; i++) {
2929 boolean_skip_table.SetUint8(byte_offset: i, value: kSkipArrayEntry);
2930 }
2931 intptr_t skip = max_lookahead + 1 - min_lookahead;
2932
2933 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2934 BoyerMoorePositionInfo* map = bitmaps_->At(index: i);
2935 for (intptr_t j = 0; j < kSize; j++) {
2936 if (map->at(i: j)) {
2937 boolean_skip_table.SetUint8(byte_offset: j, value: kDontSkipArrayEntry);
2938 }
2939 }
2940 }
2941
2942 return skip;
2943}
2944
2945// See comment above on the implementation of GetSkipTable.
2946void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2947 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2948
2949 intptr_t min_lookahead = 0;
2950 intptr_t max_lookahead = 0;
2951
2952 if (!FindWorthwhileInterval(from: &min_lookahead, to: &max_lookahead)) return;
2953
2954 bool found_single_character = false;
2955 intptr_t single_character = 0;
2956 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2957 BoyerMoorePositionInfo* map = bitmaps_->At(index: i);
2958 if (map->map_count() > 1 ||
2959 (found_single_character && map->map_count() != 0)) {
2960 found_single_character = false;
2961 break;
2962 }
2963 for (intptr_t j = 0; j < kSize; j++) {
2964 if (map->at(i: j)) {
2965 found_single_character = true;
2966 single_character = j;
2967 break;
2968 }
2969 }
2970 }
2971
2972 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
2973
2974 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2975 // The mask-compare can probably handle this better.
2976 return;
2977 }
2978
2979 if (found_single_character) {
2980 BlockLabel cont, again;
2981 masm->BindBlock(label: &again);
2982 masm->LoadCurrentCharacter(cp_offset: max_lookahead, on_end_of_input: &cont, check_bounds: true);
2983 if (max_char_ > kSize) {
2984 masm->CheckCharacterAfterAnd(c: single_character,
2985 and_with: RegExpMacroAssembler::kTableMask, on_equal: &cont);
2986 } else {
2987 masm->CheckCharacter(c: single_character, on_equal: &cont);
2988 }
2989 masm->AdvanceCurrentPosition(by: lookahead_width);
2990 masm->GoTo(to: &again);
2991 masm->BindBlock(label: &cont);
2992 return;
2993 }
2994
2995 const TypedData& boolean_skip_table = TypedData::ZoneHandle(
2996 zone: compiler_->zone(),
2997 ptr: TypedData::New(class_id: kTypedDataUint8ArrayCid, len: kSize, space: Heap::kOld));
2998 intptr_t skip_distance =
2999 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
3000 ASSERT(skip_distance != 0);
3001
3002 BlockLabel cont, again;
3003
3004 masm->BindBlock(label: &again);
3005 masm->CheckPreemption(/*is_backtrack=*/false);
3006 masm->LoadCurrentCharacter(cp_offset: max_lookahead, on_end_of_input: &cont, check_bounds: true);
3007 masm->CheckBitInTable(table: boolean_skip_table, on_bit_set: &cont);
3008 masm->AdvanceCurrentPosition(by: skip_distance);
3009 masm->GoTo(to: &again);
3010 masm->BindBlock(label: &cont);
3011
3012 return;
3013}
3014
3015/* Code generation for choice nodes.
3016 *
3017 * We generate quick checks that do a mask and compare to eliminate a
3018 * choice. If the quick check succeeds then it jumps to the continuation to
3019 * do slow checks and check subsequent nodes. If it fails (the common case)
3020 * it falls through to the next choice.
3021 *
3022 * Here is the desired flow graph. Nodes directly below each other imply
3023 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3024 * 3 doesn't have a quick check so we have to call the slow check.
3025 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3026 * regexp continuation is generated directly after the Sn node, up to the
3027 * next GoTo if we decide to reuse some already generated code. Some
3028 * nodes expect preload_characters to be preloaded into the current
3029 * character register. R nodes do this preloading. Vertices are marked
3030 * F for failures and S for success (possible success in the case of quick
3031 * nodes). L, V, < and > are used as arrow heads.
3032 *
3033 * ----------> R
3034 * |
3035 * V
3036 * Q1 -----> S1
3037 * | S /
3038 * F| /
3039 * | F/
3040 * | /
3041 * | R
3042 * | /
3043 * V L
3044 * Q2 -----> S2
3045 * | S /
3046 * F| /
3047 * | F/
3048 * | /
3049 * | R
3050 * | /
3051 * V L
3052 * S3
3053 * |
3054 * F|
3055 * |
3056 * R
3057 * |
3058 * backtrack V
3059 * <----------Q4
3060 * \ F |
3061 * \ |S
3062 * \ F V
3063 * \-----S4
3064 *
3065 * For greedy loops we push the current position, then generate the code that
3066 * eats the input specially in EmitGreedyLoop. The other choice (the
3067 * continuation) is generated by the normal code in EmitChoices, and steps back
3068 * in the input to the starting position when it fails to match. The loop code
3069 * looks like this (U is the unwind code that steps back in the greedy loop).
3070 *
3071 * _____
3072 * / \
3073 * V |
3074 * ----------> S1 |
3075 * /| |
3076 * / |S |
3077 * F/ \_____/
3078 * /
3079 * |<-----
3080 * | \
3081 * V |S
3082 * Q2 ---> U----->backtrack
3083 * | F /
3084 * S| /
3085 * V F /
3086 * S2--/
3087 */
3088
3089GreedyLoopState::GreedyLoopState(bool not_at_start) {
3090 counter_backtrack_trace_.set_backtrack(&label_);
3091 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3092}
3093
3094void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3095#ifdef DEBUG
3096 intptr_t choice_count = alternatives_->length();
3097 for (intptr_t i = 0; i < choice_count - 1; i++) {
3098 GuardedAlternative alternative = alternatives_->At(i);
3099 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3100 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3101 for (intptr_t j = 0; j < guard_count; j++) {
3102 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
3103 }
3104 }
3105#endif
3106}
3107
3108void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3109 Trace* current_trace,
3110 PreloadState* state) {
3111 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3112 // Save some time by looking at most one machine word ahead.
3113 state->eats_at_least_ =
3114 EatsAtLeast(still_to_find: compiler->one_byte() ? 4 : 2, budget: kRecursionBudget,
3115 not_at_start: current_trace->at_start() == Trace::FALSE_VALUE);
3116 }
3117 state->preload_characters_ =
3118 CalculatePreloadCharacters(compiler, eats_at_least: state->eats_at_least_);
3119
3120 state->preload_is_current_ =
3121 (current_trace->characters_preloaded() == state->preload_characters_);
3122 state->preload_has_checked_bounds_ = state->preload_is_current_;
3123}
3124
3125void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3126 intptr_t choice_count = alternatives_->length();
3127
3128 if (choice_count == 1 && alternatives_->At(index: 0).guards() == nullptr) {
3129 alternatives_->At(index: 0).node()->Emit(compiler, trace);
3130 return;
3131 }
3132
3133 AssertGuardsMentionRegisters(trace);
3134
3135 LimitResult limit_result = LimitVersions(compiler, trace);
3136 if (limit_result == DONE) return;
3137 ASSERT(limit_result == CONTINUE);
3138
3139 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3140 // other choice nodes we only flush if we are out of code size budget.
3141 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3142 trace->Flush(compiler, successor: this);
3143 return;
3144 }
3145
3146 RecursionCheck rc(compiler);
3147
3148 PreloadState preload;
3149 preload.init();
3150 GreedyLoopState greedy_loop_state(not_at_start());
3151
3152 intptr_t text_length =
3153 GreedyLoopTextLengthForAlternative(alternative: &alternatives_->At(index: 0));
3154 AlternativeGenerationList alt_gens(choice_count);
3155
3156 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3157 trace = EmitGreedyLoop(compiler, trace, alt_gens: &alt_gens, preloads: &preload,
3158 greedy_loop_state: &greedy_loop_state, text_length);
3159 } else {
3160 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3161 // match the traces produced pre-cleanup.
3162 BlockLabel second_choice;
3163 compiler->macro_assembler()->BindBlock(label: &second_choice);
3164
3165 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3166
3167 EmitChoices(compiler, alt_gens: &alt_gens, first_choice: 0, trace, preloads: &preload);
3168 }
3169
3170 // At this point we need to generate slow checks for the alternatives where
3171 // the quick check was inlined. We can recognize these because the associated
3172 // label was bound.
3173 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3174 for (intptr_t i = 0; i < choice_count; i++) {
3175 AlternativeGeneration* alt_gen = alt_gens.at(i);
3176 Trace new_trace(*trace);
3177 // If there are actions to be flushed we have to limit how many times
3178 // they are flushed. Take the budget of the parent trace and distribute
3179 // it fairly amongst the children.
3180 if (new_trace.actions() != nullptr) {
3181 new_trace.set_flush_budget(new_flush_budget);
3182 }
3183 bool next_expects_preload =
3184 i == choice_count - 1 ? false : alt_gens.at(i: i + 1)->expects_preload;
3185 EmitOutOfLineContinuation(compiler, trace: &new_trace, alternative: alternatives_->At(index: i),
3186 alt_gen, preload_characters: preload.preload_characters_,
3187 next_expects_preload);
3188 }
3189}
3190
3191Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3192 Trace* trace,
3193 AlternativeGenerationList* alt_gens,
3194 PreloadState* preload,
3195 GreedyLoopState* greedy_loop_state,
3196 intptr_t text_length) {
3197 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3198 // Here we have special handling for greedy loops containing only text nodes
3199 // and other simple nodes. These are handled by pushing the current
3200 // position on the stack and then incrementing the current position each
3201 // time around the switch. On backtrack we decrement the current position
3202 // and check it against the pushed value. This avoids pushing backtrack
3203 // information for each iteration of the loop, which could take up a lot of
3204 // space.
3205 ASSERT(trace->stop_node() == nullptr);
3206 macro_assembler->PushCurrentPosition();
3207 BlockLabel greedy_match_failed;
3208 Trace greedy_match_trace;
3209 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3210 greedy_match_trace.set_backtrack(&greedy_match_failed);
3211 BlockLabel loop_label;
3212 macro_assembler->BindBlock(label: &loop_label);
3213 macro_assembler->CheckPreemption(/*is_backtrack=*/false);
3214 greedy_match_trace.set_stop_node(this);
3215 greedy_match_trace.set_loop_label(&loop_label);
3216 (*alternatives_)[0].node()->Emit(compiler, trace: &greedy_match_trace);
3217 macro_assembler->BindBlock(label: &greedy_match_failed);
3218
3219 BlockLabel second_choice; // For use in greedy matches.
3220 macro_assembler->BindBlock(label: &second_choice);
3221
3222 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3223
3224 EmitChoices(compiler, alt_gens, first_choice: 1, trace: new_trace, preloads: preload);
3225
3226 macro_assembler->BindBlock(label: greedy_loop_state->label());
3227 // If we have unwound to the bottom then backtrack.
3228 macro_assembler->CheckGreedyLoop(on_tos_equals_current_position: trace->backtrack());
3229 // Otherwise try the second priority at an earlier position.
3230 macro_assembler->AdvanceCurrentPosition(by: -text_length);
3231 macro_assembler->GoTo(to: &second_choice);
3232 return new_trace;
3233}
3234
3235intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3236 Trace* trace) {
3237 intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3238 if (alternatives_->length() != 2) return eats_at_least;
3239
3240 GuardedAlternative alt1 = alternatives_->At(index: 1);
3241 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3242 return eats_at_least;
3243 }
3244 RegExpNode* eats_anything_node = alt1.node();
3245 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3246 return eats_at_least;
3247 }
3248
3249 // Really we should be creating a new trace when we execute this function,
3250 // but there is no need, because the code it generates cannot backtrack, and
3251 // we always arrive here with a trivial trace (since it's the entry to a
3252 // loop. That also implies that there are no preloaded characters, which is
3253 // good, because it means we won't be violating any assumptions by
3254 // overwriting those characters with new load instructions.
3255 ASSERT(trace->is_trivial());
3256
3257 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3258 // At this point we know that we are at a non-greedy loop that will eat
3259 // any character one at a time. Any non-anchored regexp has such a
3260 // loop prepended to it in order to find where it starts. We look for
3261 // a pattern of the form ...abc... where we can look 6 characters ahead
3262 // and step forwards 3 if the character is not one of abc. Abc need
3263 // not be atoms, they can be any reasonably limited character class or
3264 // small alternation.
3265 BoyerMooreLookahead* bm = bm_info(not_at_start: false);
3266 if (bm == nullptr) {
3267 eats_at_least = Utils::Minimum(
3268 x: kMaxLookaheadForBoyerMoore,
3269 y: EatsAtLeast(still_to_find: kMaxLookaheadForBoyerMoore, budget: kRecursionBudget, not_at_start: false));
3270 if (eats_at_least >= 1) {
3271 bm = new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
3272 GuardedAlternative alt0 = alternatives_->At(index: 0);
3273 alt0.node()->FillInBMInfo(offset: 0, budget: kRecursionBudget, bm, not_at_start: false);
3274 }
3275 }
3276 if (bm != nullptr) {
3277 bm->EmitSkipInstructions(masm: macro_assembler);
3278 }
3279 return eats_at_least;
3280}
3281
3282void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3283 AlternativeGenerationList* alt_gens,
3284 intptr_t first_choice,
3285 Trace* trace,
3286 PreloadState* preload) {
3287 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3288 SetUpPreLoad(compiler, current_trace: trace, state: preload);
3289
3290 // For now we just call all choices one after the other. The idea ultimately
3291 // is to use the Dispatch table to try only the relevant ones.
3292 intptr_t choice_count = alternatives_->length();
3293
3294 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3295
3296 for (intptr_t i = first_choice; i < choice_count; i++) {
3297 bool is_last = i == choice_count - 1;
3298 bool fall_through_on_failure = !is_last;
3299 GuardedAlternative alternative = alternatives_->At(index: i);
3300 AlternativeGeneration* alt_gen = alt_gens->at(i);
3301 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3302 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3303 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3304 Trace new_trace(*trace);
3305 new_trace.set_characters_preloaded(
3306 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3307 if (preload->preload_has_checked_bounds_) {
3308 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3309 }
3310 new_trace.quick_check_performed()->Clear();
3311 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3312 if (!is_last) {
3313 new_trace.set_backtrack(&alt_gen->after);
3314 }
3315 alt_gen->expects_preload = preload->preload_is_current_;
3316 bool generate_full_check_inline = false;
3317 if (kRegexpOptimization &&
3318 try_to_emit_quick_check_for_alternative(is_first: i == 0) &&
3319 alternative.node()->EmitQuickCheck(
3320 compiler, bounds_check_trace: trace, trace: &new_trace, preload_has_checked_bounds: preload->preload_has_checked_bounds_,
3321 on_possible_success: &alt_gen->possible_success, details: &alt_gen->quick_check_details,
3322 fall_through_on_failure)) {
3323 // Quick check was generated for this choice.
3324 preload->preload_is_current_ = true;
3325 preload->preload_has_checked_bounds_ = true;
3326 // If we generated the quick check to fall through on possible success,
3327 // we now need to generate the full check inline.
3328 if (!fall_through_on_failure) {
3329 macro_assembler->BindBlock(label: &alt_gen->possible_success);
3330 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3331 new_trace.set_characters_preloaded(preload->preload_characters_);
3332 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3333 generate_full_check_inline = true;
3334 }
3335 } else if (alt_gen->quick_check_details.cannot_match()) {
3336 if (!fall_through_on_failure) {
3337 macro_assembler->GoTo(to: trace->backtrack());
3338 }
3339 continue;
3340 } else {
3341 // No quick check was generated. Put the full code here.
3342 // If this is not the first choice then there could be slow checks from
3343 // previous cases that go here when they fail. There's no reason to
3344 // insist that they preload characters since the slow check we are about
3345 // to generate probably can't use it.
3346 if (i != first_choice) {
3347 alt_gen->expects_preload = false;
3348 new_trace.InvalidateCurrentCharacter();
3349 }
3350 generate_full_check_inline = true;
3351 }
3352 if (generate_full_check_inline) {
3353 if (new_trace.actions() != nullptr) {
3354 new_trace.set_flush_budget(new_flush_budget);
3355 }
3356 for (intptr_t j = 0; j < guard_count; j++) {
3357 GenerateGuard(macro_assembler, guard: guards->At(index: j), trace: &new_trace);
3358 }
3359 alternative.node()->Emit(compiler, trace: &new_trace);
3360 preload->preload_is_current_ = false;
3361 }
3362 macro_assembler->BindBlock(label: &alt_gen->after);
3363 }
3364}
3365
3366void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3367 Trace* trace,
3368 GuardedAlternative alternative,
3369 AlternativeGeneration* alt_gen,
3370 intptr_t preload_characters,
3371 bool next_expects_preload) {
3372 if (!alt_gen->possible_success.is_linked()) return;
3373
3374 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3375 macro_assembler->BindBlock(label: &alt_gen->possible_success);
3376 Trace out_of_line_trace(*trace);
3377 out_of_line_trace.set_characters_preloaded(preload_characters);
3378 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3379 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3380 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3381 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3382 if (next_expects_preload) {
3383 BlockLabel reload_current_char;
3384 out_of_line_trace.set_backtrack(&reload_current_char);
3385 for (intptr_t j = 0; j < guard_count; j++) {
3386 GenerateGuard(macro_assembler, guard: guards->At(index: j), trace: &out_of_line_trace);
3387 }
3388 alternative.node()->Emit(compiler, trace: &out_of_line_trace);
3389 macro_assembler->BindBlock(label: &reload_current_char);
3390 // Reload the current character, since the next quick check expects that.
3391 // We don't need to check bounds here because we only get into this
3392 // code through a quick check which already did the checked load.
3393 macro_assembler->LoadCurrentCharacter(cp_offset: trace->cp_offset(), on_end_of_input: nullptr, check_bounds: false,
3394 characters: preload_characters);
3395 macro_assembler->GoTo(to: &(alt_gen->after));
3396 } else {
3397 out_of_line_trace.set_backtrack(&(alt_gen->after));
3398 for (intptr_t j = 0; j < guard_count; j++) {
3399 GenerateGuard(macro_assembler, guard: guards->At(index: j), trace: &out_of_line_trace);
3400 }
3401 alternative.node()->Emit(compiler, trace: &out_of_line_trace);
3402 }
3403}
3404
3405void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3406 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3407 LimitResult limit_result = LimitVersions(compiler, trace);
3408 if (limit_result == DONE) return;
3409 ASSERT(limit_result == CONTINUE);
3410
3411 RecursionCheck rc(compiler);
3412
3413 switch (action_type_) {
3414 case STORE_POSITION: {
3415 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3416 data_.u_position_register.is_capture,
3417 trace);
3418 Trace new_trace = *trace;
3419 new_trace.add_action(new_action: &new_capture);
3420 on_success()->Emit(compiler, trace: &new_trace);
3421 break;
3422 }
3423 case INCREMENT_REGISTER: {
3424 Trace::DeferredIncrementRegister new_increment(
3425 data_.u_increment_register.reg);
3426 Trace new_trace = *trace;
3427 new_trace.add_action(new_action: &new_increment);
3428 on_success()->Emit(compiler, trace: &new_trace);
3429 break;
3430 }
3431 case SET_REGISTER: {
3432 Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
3433 data_.u_store_register.value);
3434 Trace new_trace = *trace;
3435 new_trace.add_action(new_action: &new_set);
3436 on_success()->Emit(compiler, trace: &new_trace);
3437 break;
3438 }
3439 case CLEAR_CAPTURES: {
3440 Trace::DeferredClearCaptures new_capture(Interval(
3441 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3442 Trace new_trace = *trace;
3443 new_trace.add_action(new_action: &new_capture);
3444 on_success()->Emit(compiler, trace: &new_trace);
3445 break;
3446 }
3447 case BEGIN_SUBMATCH:
3448 if (!trace->is_trivial()) {
3449 trace->Flush(compiler, successor: this);
3450 } else {
3451 assembler->WriteCurrentPositionToRegister(
3452 reg: data_.u_submatch.current_position_register, cp_offset: 0);
3453 assembler->WriteStackPointerToRegister(
3454 reg: data_.u_submatch.stack_pointer_register);
3455 on_success()->Emit(compiler, trace);
3456 }
3457 break;
3458 case EMPTY_MATCH_CHECK: {
3459 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
3460 intptr_t stored_pos = 0;
3461 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
3462 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3463 bool know_dist = trace->GetStoredPosition(reg: start_pos_reg, cp_offset: &stored_pos);
3464 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3465 // If we know we haven't advanced and there is no minimum we
3466 // can just backtrack immediately.
3467 assembler->GoTo(to: trace->backtrack());
3468 } else if (know_dist && stored_pos < trace->cp_offset()) {
3469 // If we know we've advanced we can generate the continuation
3470 // immediately.
3471 on_success()->Emit(compiler, trace);
3472 } else if (!trace->is_trivial()) {
3473 trace->Flush(compiler, successor: this);
3474 } else {
3475 BlockLabel skip_empty_check;
3476 // If we have a minimum number of repetitions we check the current
3477 // number first and skip the empty check if it's not enough.
3478 if (has_minimum) {
3479 intptr_t limit = data_.u_empty_match_check.repetition_limit;
3480 assembler->IfRegisterLT(reg: rep_reg, comparand: limit, if_lt: &skip_empty_check);
3481 }
3482 // If the match is empty we bail out, otherwise we fall through
3483 // to the on-success continuation.
3484 assembler->IfRegisterEqPos(reg: data_.u_empty_match_check.start_register,
3485 if_eq: trace->backtrack());
3486 assembler->BindBlock(label: &skip_empty_check);
3487 on_success()->Emit(compiler, trace);
3488 }
3489 break;
3490 }
3491 case POSITIVE_SUBMATCH_SUCCESS: {
3492 if (!trace->is_trivial()) {
3493 trace->Flush(compiler, successor: this);
3494 return;
3495 }
3496 assembler->ReadCurrentPositionFromRegister(
3497 reg: data_.u_submatch.current_position_register);
3498 assembler->ReadStackPointerFromRegister(
3499 reg: data_.u_submatch.stack_pointer_register);
3500 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
3501 if (clear_register_count == 0) {
3502 on_success()->Emit(compiler, trace);
3503 return;
3504 }
3505 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
3506 BlockLabel clear_registers_backtrack;
3507 Trace new_trace = *trace;
3508 new_trace.set_backtrack(&clear_registers_backtrack);
3509 on_success()->Emit(compiler, trace: &new_trace);
3510
3511 assembler->BindBlock(label: &clear_registers_backtrack);
3512 intptr_t clear_registers_to =
3513 clear_registers_from + clear_register_count - 1;
3514 assembler->ClearRegisters(reg_from: clear_registers_from, reg_to: clear_registers_to);
3515
3516 ASSERT(trace->backtrack() == nullptr);
3517 assembler->Backtrack();
3518 return;
3519 }
3520 default:
3521 UNREACHABLE();
3522 }
3523}
3524
3525void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3526 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3527 if (!trace->is_trivial()) {
3528 trace->Flush(compiler, successor: this);
3529 return;
3530 }
3531
3532 LimitResult limit_result = LimitVersions(compiler, trace);
3533 if (limit_result == DONE) return;
3534 ASSERT(limit_result == CONTINUE);
3535
3536 RecursionCheck rc(compiler);
3537
3538 ASSERT(start_reg_ + 1 == end_reg_);
3539 if (flags_.IgnoreCase()) {
3540 assembler->CheckNotBackReferenceIgnoreCase(
3541 start_reg: start_reg_, read_backward: read_backward(), unicode: flags_.IsUnicode(), on_no_match: trace->backtrack());
3542 } else {
3543 assembler->CheckNotBackReference(start_reg: start_reg_, read_backward: read_backward(),
3544 on_no_match: trace->backtrack());
3545 }
3546 // We are going to advance backward, so we may end up at the start.
3547 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3548
3549 // Check that the back reference does not end inside a surrogate pair.
3550 if (flags_.IsUnicode() && !compiler->one_byte()) {
3551 assembler->CheckNotInSurrogatePair(cp_offset: trace->cp_offset(), on_failure: trace->backtrack());
3552 }
3553
3554 on_success()->Emit(compiler, trace);
3555}
3556
3557// -------------------------------------------------------------------
3558// Dot/dotty output
3559
3560#ifdef DEBUG
3561
3562class DotPrinter : public NodeVisitor {
3563 public:
3564 explicit DotPrinter(bool ignore_case) {}
3565 void PrintNode(const char* label, RegExpNode* node);
3566 void Visit(RegExpNode* node);
3567 void PrintAttributes(RegExpNode* from);
3568 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3569#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
3570 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3571#undef DECLARE_VISIT
3572};
3573
3574void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3575 OS::PrintErr("digraph G {\n graph [label=\"");
3576 for (intptr_t i = 0; label[i] != '\0'; i++) {
3577 switch (label[i]) {
3578 case '\\':
3579 OS::PrintErr("\\\\");
3580 break;
3581 case '"':
3582 OS::PrintErr("\"");
3583 break;
3584 default:
3585 OS::PrintErr("%c", label[i]);
3586 break;
3587 }
3588 }
3589 OS::PrintErr("\"];\n");
3590 Visit(node);
3591 OS::PrintErr("}\n");
3592}
3593
3594void DotPrinter::Visit(RegExpNode* node) {
3595 if (node->info()->visited) return;
3596 node->info()->visited = true;
3597 node->Accept(this);
3598}
3599
3600void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3601 OS::PrintErr(" n%p -> n%p [style=dotted];\n", from, on_failure);
3602 Visit(on_failure);
3603}
3604
3605class AttributePrinter : public ValueObject {
3606 public:
3607 AttributePrinter() : first_(true) {}
3608 void PrintSeparator() {
3609 if (first_) {
3610 first_ = false;
3611 } else {
3612 OS::PrintErr("|");
3613 }
3614 }
3615 void PrintBit(const char* name, bool value) {
3616 if (!value) return;
3617 PrintSeparator();
3618 OS::PrintErr("{%s}", name);
3619 }
3620 void PrintPositive(const char* name, intptr_t value) {
3621 if (value < 0) return;
3622 PrintSeparator();
3623 OS::PrintErr("{%s|%" Pd "}", name, value);
3624 }
3625
3626 private:
3627 bool first_;
3628};
3629
3630void DotPrinter::PrintAttributes(RegExpNode* that) {
3631 OS::PrintErr(
3632 " a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3633 "margin=0.1, fontsize=10, label=\"{",
3634 that);
3635 AttributePrinter printer;
3636 NodeInfo* info = that->info();
3637 printer.PrintBit("NI", info->follows_newline_interest);
3638 printer.PrintBit("WI", info->follows_word_interest);
3639 printer.PrintBit("SI", info->follows_start_interest);
3640 BlockLabel* label = that->label();
3641 if (label->is_bound()) printer.PrintPositive("@", label->pos());
3642 OS::PrintErr(
3643 "}\"];\n"
3644 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
3645 that, that);
3646}
3647
3648void DotPrinter::VisitChoice(ChoiceNode* that) {
3649 OS::PrintErr(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3650 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3651 GuardedAlternative alt = that->alternatives()->At(i);
3652 OS::PrintErr(" n%p -> n%p", that, alt.node());
3653 }
3654 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3655 GuardedAlternative alt = that->alternatives()->At(i);
3656 alt.node()->Accept(this);
3657 }
3658}
3659
3660void DotPrinter::VisitText(TextNode* that) {
3661 OS::PrintErr(" n%p [label=\"", that);
3662 for (intptr_t i = 0; i < that->elements()->length(); i++) {
3663 if (i > 0) OS::PrintErr(" ");
3664 TextElement elm = that->elements()->At(i);
3665 switch (elm.text_type()) {
3666 case TextElement::ATOM: {
3667 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
3668 for (intptr_t i = 0; i < data->length(); i++) {
3669 OS::PrintErr("%c", static_cast<char>(data->At(i)));
3670 }
3671 break;
3672 }
3673 case TextElement::CHAR_CLASS: {
3674 RegExpCharacterClass* node = elm.char_class();
3675 OS::PrintErr("[");
3676 if (node->is_negated()) OS::PrintErr("^");
3677 for (intptr_t j = 0; j < node->ranges()->length(); j++) {
3678 CharacterRange range = node->ranges()->At(j);
3679 PrintUtf16(range.from());
3680 OS::PrintErr("-");
3681 PrintUtf16(range.to());
3682 }
3683 OS::PrintErr("]");
3684 break;
3685 }
3686 default:
3687 UNREACHABLE();
3688 }
3689 }
3690 OS::PrintErr("\", shape=box, peripheries=2];\n");
3691 PrintAttributes(that);
3692 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3693 Visit(that->on_success());
3694}
3695
3696void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3697 OS::PrintErr(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
3698 that, that->start_register(), that->end_register());
3699 PrintAttributes(that);
3700 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3701 Visit(that->on_success());
3702}
3703
3704void DotPrinter::VisitEnd(EndNode* that) {
3705 OS::PrintErr(" n%p [style=bold, shape=point];\n", that);
3706 PrintAttributes(that);
3707}
3708
3709void DotPrinter::VisitAssertion(AssertionNode* that) {
3710 OS::PrintErr(" n%p [", that);
3711 switch (that->assertion_type()) {
3712 case AssertionNode::AT_END:
3713 OS::PrintErr("label=\"$\", shape=septagon");
3714 break;
3715 case AssertionNode::AT_START:
3716 OS::PrintErr("label=\"^\", shape=septagon");
3717 break;
3718 case AssertionNode::AT_BOUNDARY:
3719 OS::PrintErr("label=\"\\b\", shape=septagon");
3720 break;
3721 case AssertionNode::AT_NON_BOUNDARY:
3722 OS::PrintErr("label=\"\\B\", shape=septagon");
3723 break;
3724 case AssertionNode::AFTER_NEWLINE:
3725 OS::PrintErr("label=\"(?<=\\n)\", shape=septagon");
3726 break;
3727 }
3728 OS::PrintErr("];\n");
3729 PrintAttributes(that);
3730 RegExpNode* successor = that->on_success();
3731 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3732 Visit(successor);
3733}
3734
3735void DotPrinter::VisitAction(ActionNode* that) {
3736 OS::PrintErr(" n%p [", that);
3737 switch (that->action_type_) {
3738 case ActionNode::SET_REGISTER:
3739 OS::PrintErr("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
3740 that->data_.u_store_register.reg,
3741 that->data_.u_store_register.value);
3742 break;
3743 case ActionNode::INCREMENT_REGISTER:
3744 OS::PrintErr("label=\"$%" Pd "++\", shape=octagon",
3745 that->data_.u_increment_register.reg);
3746 break;
3747 case ActionNode::STORE_POSITION:
3748 OS::PrintErr("label=\"$%" Pd ":=$pos\", shape=octagon",
3749 that->data_.u_position_register.reg);
3750 break;
3751 case ActionNode::BEGIN_SUBMATCH:
3752 OS::PrintErr("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
3753 that->data_.u_submatch.current_position_register);
3754 break;
3755 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3756 OS::PrintErr("label=\"escape\", shape=septagon");
3757 break;
3758 case ActionNode::EMPTY_MATCH_CHECK:
3759 OS::PrintErr("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
3760 that->data_.u_empty_match_check.start_register,
3761 that->data_.u_empty_match_check.repetition_register,
3762 that->data_.u_empty_match_check.repetition_limit);
3763 break;
3764 case ActionNode::CLEAR_CAPTURES: {
3765 OS::PrintErr("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
3766 that->data_.u_clear_captures.range_from,
3767 that->data_.u_clear_captures.range_to);
3768 break;
3769 }
3770 }
3771 OS::PrintErr("];\n");
3772 PrintAttributes(that);
3773 RegExpNode* successor = that->on_success();
3774 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3775 Visit(successor);
3776}
3777
3778void RegExpEngine::DotPrint(const char* label,
3779 RegExpNode* node,
3780 bool ignore_case) {
3781 DotPrinter printer(ignore_case);
3782 printer.PrintNode(label, node);
3783}
3784
3785#endif // DEBUG
3786
3787// -------------------------------------------------------------------
3788// Tree to graph conversion
3789
3790// The zone in which we allocate graph nodes.
3791#define OZ (on_success->zone())
3792
3793RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3794 RegExpNode* on_success) {
3795 ZoneGrowableArray<TextElement>* elms =
3796 new (OZ) ZoneGrowableArray<TextElement>(1);
3797 elms->Add(value: TextElement::Atom(atom: this));
3798 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3799}
3800
3801RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3802 RegExpNode* on_success) {
3803 ZoneGrowableArray<TextElement>* elms =
3804 new (OZ) ZoneGrowableArray<TextElement>(1);
3805 for (intptr_t i = 0; i < elements()->length(); i++) {
3806 elms->Add(value: elements()->At(index: i));
3807 }
3808 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3809}
3810
3811static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,
3812 const int32_t* special_class,
3813 intptr_t length) {
3814 length--; // Remove final kRangeEndMarker.
3815 ASSERT(special_class[length] == kRangeEndMarker);
3816 ASSERT(ranges->length() != 0);
3817 ASSERT(length != 0);
3818 ASSERT(special_class[0] != 0);
3819 if (ranges->length() != (length >> 1) + 1) {
3820 return false;
3821 }
3822 CharacterRange range = ranges->At(index: 0);
3823 if (range.from() != 0) {
3824 return false;
3825 }
3826 for (intptr_t i = 0; i < length; i += 2) {
3827 if (special_class[i] != (range.to() + 1)) {
3828 return false;
3829 }
3830 range = ranges->At(index: (i >> 1) + 1);
3831 if (special_class[i + 1] != range.from()) {
3832 return false;
3833 }
3834 }
3835 if (range.to() != Utf::kMaxCodePoint) {
3836 return false;
3837 }
3838 return true;
3839}
3840
3841static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges,
3842 const int32_t* special_class,
3843 intptr_t length) {
3844 length--; // Remove final kRangeEndMarker.
3845 ASSERT(special_class[length] == kRangeEndMarker);
3846 if (ranges->length() * 2 != length) {
3847 return false;
3848 }
3849 for (intptr_t i = 0; i < length; i += 2) {
3850 CharacterRange range = ranges->At(index: i >> 1);
3851 if (range.from() != special_class[i] ||
3852 range.to() != special_class[i + 1] - 1) {
3853 return false;
3854 }
3855 }
3856 return true;
3857}
3858
3859bool RegExpCharacterClass::is_standard() {
3860 // TODO(lrn): Remove need for this function, by not throwing away information
3861 // along the way.
3862 if (is_negated()) {
3863 return false;
3864 }
3865 if (set_.is_standard()) {
3866 return true;
3867 }
3868 if (CompareRanges(ranges: set_.ranges(), special_class: kSpaceRanges, length: kSpaceRangeCount)) {
3869 set_.set_standard_set_type('s');
3870 return true;
3871 }
3872 if (CompareInverseRanges(ranges: set_.ranges(), special_class: kSpaceRanges, length: kSpaceRangeCount)) {
3873 set_.set_standard_set_type('S');
3874 return true;
3875 }
3876 if (CompareInverseRanges(ranges: set_.ranges(), special_class: kLineTerminatorRanges,
3877 length: kLineTerminatorRangeCount)) {
3878 set_.set_standard_set_type('.');
3879 return true;
3880 }
3881 if (CompareRanges(ranges: set_.ranges(), special_class: kLineTerminatorRanges,
3882 length: kLineTerminatorRangeCount)) {
3883 set_.set_standard_set_type('n');
3884 return true;
3885 }
3886 if (CompareRanges(ranges: set_.ranges(), special_class: kWordRanges, length: kWordRangeCount)) {
3887 set_.set_standard_set_type('w');
3888 return true;
3889 }
3890 if (CompareInverseRanges(ranges: set_.ranges(), special_class: kWordRanges, length: kWordRangeCount)) {
3891 set_.set_standard_set_type('W');
3892 return true;
3893 }
3894 return false;
3895}
3896
3897UnicodeRangeSplitter::UnicodeRangeSplitter(
3898 Zone* zone,
3899 ZoneGrowableArray<CharacterRange>* base)
3900 : zone_(zone),
3901 table_(zone),
3902 bmp_(nullptr),
3903 lead_surrogates_(nullptr),
3904 trail_surrogates_(nullptr),
3905 non_bmp_(nullptr) {
3906 // The unicode range splitter categorizes given character ranges into:
3907 // - Code points from the BMP representable by one code unit.
3908 // - Code points outside the BMP that need to be split into surrogate pairs.
3909 // - Lone lead surrogates.
3910 // - Lone trail surrogates.
3911 // Lone surrogates are valid code points, even though no actual characters.
3912 // They require special matching to make sure we do not split surrogate pairs.
3913 // We use the dispatch table to accomplish this. The base range is split up
3914 // by the table by the overlay ranges, and the Call callback is used to
3915 // filter and collect ranges for each category.
3916 for (intptr_t i = 0; i < base->length(); i++) {
3917 table_.AddRange(range: base->At(index: i), value: kBase, zone: zone_);
3918 }
3919 // Add overlay ranges.
3920 table_.AddRange(range: CharacterRange::Range(from: 0, to: Utf16::kLeadSurrogateStart - 1),
3921 value: kBmpCodePoints, zone: zone_);
3922 table_.AddRange(range: CharacterRange::Range(from: Utf16::kLeadSurrogateStart,
3923 to: Utf16::kLeadSurrogateEnd),
3924 value: kLeadSurrogates, zone: zone_);
3925 table_.AddRange(range: CharacterRange::Range(from: Utf16::kTrailSurrogateStart,
3926 to: Utf16::kTrailSurrogateEnd),
3927 value: kTrailSurrogates, zone: zone_);
3928 table_.AddRange(
3929 range: CharacterRange::Range(from: Utf16::kTrailSurrogateEnd + 1, to: Utf16::kMaxCodeUnit),
3930 value: kBmpCodePoints, zone: zone_);
3931 table_.AddRange(
3932 range: CharacterRange::Range(from: Utf16::kMaxCodeUnit + 1, to: Utf::kMaxCodePoint),
3933 value: kNonBmpCodePoints, zone: zone_);
3934 table_.ForEach(callback: this);
3935}
3936
3937void UnicodeRangeSplitter::Call(uint32_t from, ChoiceTable::Entry entry) {
3938 OutSet* outset = entry.out_set();
3939 if (!outset->Get(value: kBase)) return;
3940 ZoneGrowableArray<CharacterRange>** target = nullptr;
3941 if (outset->Get(value: kBmpCodePoints)) {
3942 target = &bmp_;
3943 } else if (outset->Get(value: kLeadSurrogates)) {
3944 target = &lead_surrogates_;
3945 } else if (outset->Get(value: kTrailSurrogates)) {
3946 target = &trail_surrogates_;
3947 } else {
3948 ASSERT(outset->Get(kNonBmpCodePoints));
3949 target = &non_bmp_;
3950 }
3951 if (*target == nullptr) {
3952 *target = new (zone_) ZoneGrowableArray<CharacterRange>(2);
3953 }
3954 (*target)->Add(value: CharacterRange::Range(from: entry.from(), to: entry.to()));
3955}
3956
3957void AddBmpCharacters(RegExpCompiler* compiler,
3958 ChoiceNode* result,
3959 RegExpNode* on_success,
3960 UnicodeRangeSplitter* splitter) {
3961 ZoneGrowableArray<CharacterRange>* bmp = splitter->bmp();
3962 if (bmp == nullptr) return;
3963 result->AddAlternative(node: GuardedAlternative(TextNode::CreateForCharacterRanges(
3964 ranges: bmp, read_backward: compiler->read_backward(), on_success, flags: RegExpFlags())));
3965}
3966
3967void AddNonBmpSurrogatePairs(RegExpCompiler* compiler,
3968 ChoiceNode* result,
3969 RegExpNode* on_success,
3970 UnicodeRangeSplitter* splitter) {
3971 ZoneGrowableArray<CharacterRange>* non_bmp = splitter->non_bmp();
3972 if (non_bmp == nullptr) return;
3973 ASSERT(!compiler->one_byte());
3974 CharacterRange::Canonicalize(ranges: non_bmp);
3975 for (int i = 0; i < non_bmp->length(); i++) {
3976 // Match surrogate pair.
3977 // E.g. [\u10005-\u11005] becomes
3978 // \ud800[\udc05-\udfff]|
3979 // [\ud801-\ud803][\udc00-\udfff]|
3980 // \ud804[\udc00-\udc05]
3981 uint32_t from = non_bmp->At(index: i).from();
3982 uint32_t to = non_bmp->At(index: i).to();
3983 uint16_t from_points[2];
3984 Utf16::Encode(codepoint: from, dst: from_points);
3985 uint16_t to_points[2];
3986 Utf16::Encode(codepoint: to, dst: to_points);
3987 if (from_points[0] == to_points[0]) {
3988 // The lead surrogate is the same.
3989 result->AddAlternative(
3990 node: GuardedAlternative(TextNode::CreateForSurrogatePair(
3991 lead: CharacterRange::Singleton(value: from_points[0]),
3992 trail: CharacterRange::Range(from: from_points[1], to: to_points[1]),
3993 read_backward: compiler->read_backward(), on_success, flags: RegExpFlags())));
3994 } else {
3995 if (from_points[1] != Utf16::kTrailSurrogateStart) {
3996 // Add [from_l][from_t-\udfff]
3997 result->AddAlternative(
3998 node: GuardedAlternative(TextNode::CreateForSurrogatePair(
3999 lead: CharacterRange::Singleton(value: from_points[0]),
4000 trail: CharacterRange::Range(from: from_points[1],
4001 to: Utf16::kTrailSurrogateEnd),
4002 read_backward: compiler->read_backward(), on_success, flags: RegExpFlags())));
4003 from_points[0]++;
4004 }
4005 if (to_points[1] != Utf16::kTrailSurrogateEnd) {
4006 // Add [to_l][\udc00-to_t]
4007 result->AddAlternative(
4008 node: GuardedAlternative(TextNode::CreateForSurrogatePair(
4009 lead: CharacterRange::Singleton(value: to_points[0]),
4010 trail: CharacterRange::Range(from: Utf16::kTrailSurrogateStart,
4011 to: to_points[1]),
4012 read_backward: compiler->read_backward(), on_success, flags: RegExpFlags())));
4013 to_points[0]--;
4014 }
4015 if (from_points[0] <= to_points[0]) {
4016 // Add [from_l-to_l][\udc00-\udfff]
4017 result->AddAlternative(
4018 node: GuardedAlternative(TextNode::CreateForSurrogatePair(
4019 lead: CharacterRange::Range(from: from_points[0], to: to_points[0]),
4020 trail: CharacterRange::Range(from: Utf16::kTrailSurrogateStart,
4021 to: Utf16::kTrailSurrogateEnd),
4022 read_backward: compiler->read_backward(), on_success, flags: RegExpFlags())));
4023 }
4024 }
4025 }
4026}
4027
4028RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4029 RegExpCompiler* compiler,
4030 ZoneGrowableArray<CharacterRange>* lookbehind,
4031 ZoneGrowableArray<CharacterRange>* match,
4032 RegExpNode* on_success,
4033 bool read_backward,
4034 RegExpFlags flags) {
4035 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4036 ranges: match, read_backward, on_success, flags);
4037 int stack_register = compiler->UnicodeLookaroundStackRegister();
4038 int position_register = compiler->UnicodeLookaroundPositionRegister();
4039 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4040 position_register);
4041 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4042 ranges: lookbehind, read_backward: !read_backward, on_success: lookaround.on_match_success(), flags);
4043 return lookaround.ForMatch(match: negative_match);
4044}
4045
4046RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4047 RegExpCompiler* compiler,
4048 ZoneGrowableArray<CharacterRange>* match,
4049 ZoneGrowableArray<CharacterRange>* lookahead,
4050 RegExpNode* on_success,
4051 bool read_backward,
4052 RegExpFlags flags) {
4053 int stack_register = compiler->UnicodeLookaroundStackRegister();
4054 int position_register = compiler->UnicodeLookaroundPositionRegister();
4055 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4056 position_register);
4057 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4058 ranges: lookahead, read_backward, on_success: lookaround.on_match_success(), flags);
4059 return TextNode::CreateForCharacterRanges(
4060 ranges: match, read_backward, on_success: lookaround.ForMatch(match: negative_match), flags);
4061}
4062
4063void AddLoneLeadSurrogates(RegExpCompiler* compiler,
4064 ChoiceNode* result,
4065 RegExpNode* on_success,
4066 UnicodeRangeSplitter* splitter) {
4067 auto lead_surrogates = splitter->lead_surrogates();
4068 if (lead_surrogates == nullptr) return;
4069 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4070 auto trail_surrogates = CharacterRange::List(
4071 zone: on_success->zone(), range: CharacterRange::Range(from: Utf16::kTrailSurrogateStart,
4072 to: Utf16::kTrailSurrogateEnd));
4073
4074 RegExpNode* match;
4075 if (compiler->read_backward()) {
4076 // Reading backward. Assert that reading forward, there is no trail
4077 // surrogate, and then backward match the lead surrogate.
4078 match = NegativeLookaroundAgainstReadDirectionAndMatch(
4079 compiler, lookbehind: trail_surrogates, match: lead_surrogates, on_success, read_backward: true,
4080 flags: RegExpFlags());
4081 } else {
4082 // Reading forward. Forward match the lead surrogate and assert that
4083 // no trail surrogate follows.
4084 match = MatchAndNegativeLookaroundInReadDirection(
4085 compiler, match: lead_surrogates, lookahead: trail_surrogates, on_success, read_backward: false,
4086 flags: RegExpFlags());
4087 }
4088 result->AddAlternative(node: GuardedAlternative(match));
4089}
4090
4091void AddLoneTrailSurrogates(RegExpCompiler* compiler,
4092 ChoiceNode* result,
4093 RegExpNode* on_success,
4094 UnicodeRangeSplitter* splitter) {
4095 auto trail_surrogates = splitter->trail_surrogates();
4096 if (trail_surrogates == nullptr) return;
4097 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4098 auto lead_surrogates = CharacterRange::List(
4099 zone: on_success->zone(), range: CharacterRange::Range(from: Utf16::kLeadSurrogateStart,
4100 to: Utf16::kLeadSurrogateEnd));
4101
4102 RegExpNode* match;
4103 if (compiler->read_backward()) {
4104 // Reading backward. Backward match the trail surrogate and assert that no
4105 // lead surrogate precedes it.
4106 match = MatchAndNegativeLookaroundInReadDirection(
4107 compiler, match: trail_surrogates, lookahead: lead_surrogates, on_success, read_backward: true,
4108 flags: RegExpFlags());
4109 } else {
4110 // Reading forward. Assert that reading backward, there is no lead
4111 // surrogate, and then forward match the trail surrogate.
4112 match = NegativeLookaroundAgainstReadDirectionAndMatch(
4113 compiler, lookbehind: lead_surrogates, match: trail_surrogates, on_success, read_backward: false,
4114 flags: RegExpFlags());
4115 }
4116 result->AddAlternative(node: GuardedAlternative(match));
4117}
4118
4119RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
4120 RegExpNode* on_success) {
4121 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
4122 ASSERT(!compiler->read_backward());
4123 // Advance any character. If the character happens to be a lead surrogate and
4124 // we advanced into the middle of a surrogate pair, it will work out, as
4125 // nothing will match from there. We will have to advance again, consuming
4126 // the associated trail surrogate.
4127 auto range = CharacterRange::List(
4128 zone: on_success->zone(), range: CharacterRange::Range(from: 0, to: Utf16::kMaxCodeUnit));
4129 return TextNode::CreateForCharacterRanges(ranges: range, read_backward: false, on_success,
4130 flags: RegExpFlags());
4131}
4132
4133void AddUnicodeCaseEquivalents(ZoneGrowableArray<CharacterRange>* ranges) {
4134 ASSERT(CharacterRange::IsCanonical(ranges));
4135
4136 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
4137 // See also https://crbug.com/v8/6727.
4138 // TODO(sstrickl): This only covers the special case of the {0,0x10FFFF}
4139 // range, which we use frequently internally. But large ranges can also easily
4140 // be created by the user. We might want to have a more general caching
4141 // mechanism for such ranges.
4142 if (ranges->length() == 1 && ranges->At(index: 0).IsEverything(max: Utf::kMaxCodePoint)) {
4143 return;
4144 }
4145
4146 icu::UnicodeSet set;
4147 for (int i = 0; i < ranges->length(); i++) {
4148 set.add(start: ranges->At(index: i).from(), end: ranges->At(index: i).to());
4149 }
4150 ranges->Clear();
4151 set.closeOver(attribute: USET_CASE_INSENSITIVE);
4152 // Full case mapping map single characters to multiple characters.
4153 // Those are represented as strings in the set. Remove them so that
4154 // we end up with only simple and common case mappings.
4155 set.removeAllStrings();
4156 for (int i = 0; i < set.getRangeCount(); i++) {
4157 ranges->Add(
4158 value: CharacterRange::Range(from: set.getRangeStart(index: i), to: set.getRangeEnd(index: i)));
4159 }
4160 // No errors and everything we collected have been ranges.
4161 CharacterRange::Canonicalize(ranges);
4162}
4163
4164RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4165 RegExpNode* on_success) {
4166 set_.Canonicalize();
4167 ZoneGrowableArray<CharacterRange>* ranges = this->ranges();
4168 if (flags_.NeedsUnicodeCaseEquivalents()) {
4169 AddUnicodeCaseEquivalents(ranges);
4170 }
4171 if (flags_.IsUnicode() && !compiler->one_byte() &&
4172 !contains_split_surrogate()) {
4173 if (is_negated()) {
4174 ZoneGrowableArray<CharacterRange>* negated =
4175 new ZoneGrowableArray<CharacterRange>(2);
4176 CharacterRange::Negate(src: ranges, dst: negated);
4177 ranges = negated;
4178 }
4179 if (ranges->length() == 0) {
4180 RegExpCharacterClass* fail =
4181 new RegExpCharacterClass(ranges, RegExpFlags());
4182 return new TextNode(fail, compiler->read_backward(), on_success);
4183 }
4184 if (standard_type() == '*') {
4185 return UnanchoredAdvance(compiler, on_success);
4186 } else {
4187 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4188 UnicodeRangeSplitter splitter(OZ, ranges);
4189 AddBmpCharacters(compiler, result, on_success, splitter: &splitter);
4190 AddNonBmpSurrogatePairs(compiler, result, on_success, splitter: &splitter);
4191 AddLoneLeadSurrogates(compiler, result, on_success, splitter: &splitter);
4192 AddLoneTrailSurrogates(compiler, result, on_success, splitter: &splitter);
4193 return result;
4194 }
4195 } else {
4196 return new TextNode(this, compiler->read_backward(), on_success);
4197 }
4198 return new (OZ) TextNode(this, compiler->read_backward(), on_success);
4199}
4200
4201RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4202 RegExpNode* on_success) {
4203 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
4204 intptr_t length = alternatives->length();
4205 ChoiceNode* result = new (OZ) ChoiceNode(length, OZ);
4206 for (intptr_t i = 0; i < length; i++) {
4207 GuardedAlternative alternative(
4208 alternatives->At(index: i)->ToNode(compiler, on_success));
4209 result->AddAlternative(node: alternative);
4210 }
4211 return result;
4212}
4213
4214RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4215 RegExpNode* on_success) {
4216 return ToNode(min: min(), max: max(), is_greedy: is_greedy(), body: body(), compiler, on_success);
4217}
4218
4219// Scoped object to keep track of how much we unroll quantifier loops in the
4220// regexp graph generator.
4221class RegExpExpansionLimiter : public ValueObject {
4222 public:
4223 static constexpr intptr_t kMaxExpansionFactor = 6;
4224 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)
4225 : compiler_(compiler),
4226 saved_expansion_factor_(compiler->current_expansion_factor()),
4227 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4228 ASSERT(factor > 0);
4229 if (ok_to_expand_) {
4230 if (factor > kMaxExpansionFactor) {
4231 // Avoid integer overflow of the current expansion factor.
4232 ok_to_expand_ = false;
4233 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4234 } else {
4235 intptr_t new_factor = saved_expansion_factor_ * factor;
4236 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4237 compiler->set_current_expansion_factor(new_factor);
4238 }
4239 }
4240 }
4241
4242 ~RegExpExpansionLimiter() {
4243 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4244 }
4245
4246 bool ok_to_expand() { return ok_to_expand_; }
4247
4248 private:
4249 RegExpCompiler* compiler_;
4250 intptr_t saved_expansion_factor_;
4251 bool ok_to_expand_;
4252
4253 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4254};
4255
4256RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
4257 intptr_t max,
4258 bool is_greedy,
4259 RegExpTree* body,
4260 RegExpCompiler* compiler,
4261 RegExpNode* on_success,
4262 bool not_at_start) {
4263 // x{f, t} becomes this:
4264 //
4265 // (r++)<-.
4266 // | `
4267 // | (x)
4268 // v ^
4269 // (r=0)-->(?)---/ [if r < t]
4270 // |
4271 // [if r >= f] \----> ...
4272 //
4273
4274 // 15.10.2.5 RepeatMatcher algorithm.
4275 // The parser has already eliminated the case where max is 0. In the case
4276 // where max_match is zero the parser has removed the quantifier if min was
4277 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4278
4279 // If we know that we cannot match zero length then things are a little
4280 // simpler since we don't need to make the special zero length match check
4281 // from step 2.1. If the min and max are small we can unroll a little in
4282 // this case.
4283 // Unroll (foo)+ and (foo){3,}
4284 const intptr_t kMaxUnrolledMinMatches = 3;
4285 // Unroll (foo)? and (foo){x,3}
4286 const intptr_t kMaxUnrolledMaxMatches = 3;
4287 if (max == 0) return on_success; // This can happen due to recursion.
4288 bool body_can_be_empty = (body->min_match() == 0);
4289 intptr_t body_start_reg = RegExpCompiler::kNoRegister;
4290 Interval capture_registers = body->CaptureRegisters();
4291 bool needs_capture_clearing = !capture_registers.is_empty();
4292 Zone* zone = compiler->zone();
4293
4294 if (body_can_be_empty) {
4295 body_start_reg = compiler->AllocateRegister();
4296 } else if (kRegexpOptimization && !needs_capture_clearing) {
4297 // Only unroll if there are no captures and the body can't be
4298 // empty.
4299 {
4300 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
4301 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4302 intptr_t new_max = (max == kInfinity) ? max : max - min;
4303 // Recurse once to get the loop or optional matches after the fixed
4304 // ones.
4305 RegExpNode* answer =
4306 ToNode(min: 0, max: new_max, is_greedy, body, compiler, on_success, not_at_start: true);
4307 // Unroll the forced matches from 0 to min. This can cause chains of
4308 // TextNodes (which the parser does not generate). These should be
4309 // combined if it turns out they hinder good code generation.
4310 for (intptr_t i = 0; i < min; i++) {
4311 answer = body->ToNode(compiler, on_success: answer);
4312 }
4313 return answer;
4314 }
4315 }
4316 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4317 ASSERT(max > 0); // Due to the 'if' above.
4318 RegExpExpansionLimiter limiter(compiler, max);
4319 if (limiter.ok_to_expand()) {
4320 // Unroll the optional matches up to max.
4321 RegExpNode* answer = on_success;
4322 for (intptr_t i = 0; i < max; i++) {
4323 ChoiceNode* alternation = new (zone) ChoiceNode(2, zone);
4324 if (is_greedy) {
4325 alternation->AddAlternative(
4326 node: GuardedAlternative(body->ToNode(compiler, on_success: answer)));
4327 alternation->AddAlternative(node: GuardedAlternative(on_success));
4328 } else {
4329 alternation->AddAlternative(node: GuardedAlternative(on_success));
4330 alternation->AddAlternative(
4331 node: GuardedAlternative(body->ToNode(compiler, on_success: answer)));
4332 }
4333 answer = alternation;
4334 if (not_at_start && !compiler->read_backward()) {
4335 alternation->set_not_at_start();
4336 }
4337 }
4338 return answer;
4339 }
4340 }
4341 }
4342 bool has_min = min > 0;
4343 bool has_max = max < RegExpTree::kInfinity;
4344 bool needs_counter = has_min || has_max;
4345 intptr_t reg_ctr = needs_counter ? compiler->AllocateRegister()
4346 : RegExpCompiler::kNoRegister;
4347 LoopChoiceNode* center = new (zone)
4348 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
4349 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
4350 RegExpNode* loop_return =
4351 needs_counter ? static_cast<RegExpNode*>(
4352 ActionNode::IncrementRegister(reg: reg_ctr, on_success: center))
4353 : static_cast<RegExpNode*>(center);
4354 if (body_can_be_empty) {
4355 // If the body can be empty we need to check if it was and then
4356 // backtrack.
4357 loop_return =
4358 ActionNode::EmptyMatchCheck(start_register: body_start_reg, repetition_register: reg_ctr, repetition_limit: min, on_success: loop_return);
4359 }
4360 RegExpNode* body_node = body->ToNode(compiler, on_success: loop_return);
4361 if (body_can_be_empty) {
4362 // If the body can be empty we need to store the start position
4363 // so we can bail out if it was empty.
4364 body_node = ActionNode::StorePosition(reg: body_start_reg, is_capture: false, on_success: body_node);
4365 }
4366 if (needs_capture_clearing) {
4367 // Before entering the body of this loop we need to clear captures.
4368 body_node = ActionNode::ClearCaptures(range: capture_registers, on_success: body_node);
4369 }
4370 GuardedAlternative body_alt(body_node);
4371 if (has_max) {
4372 Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max);
4373 body_alt.AddGuard(guard: body_guard, zone);
4374 }
4375 GuardedAlternative rest_alt(on_success);
4376 if (has_min) {
4377 Guard* rest_guard = new (zone) Guard(reg_ctr, Guard::GEQ, min);
4378 rest_alt.AddGuard(guard: rest_guard, zone);
4379 }
4380 if (is_greedy) {
4381 center->AddLoopAlternative(alt: body_alt);
4382 center->AddContinueAlternative(alt: rest_alt);
4383 } else {
4384 center->AddContinueAlternative(alt: rest_alt);
4385 center->AddLoopAlternative(alt: body_alt);
4386 }
4387 if (needs_counter) {
4388 return ActionNode::SetRegister(reg: reg_ctr, val: 0, on_success: center);
4389 } else {
4390 return center;
4391 }
4392}
4393
4394namespace {
4395// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
4396// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
4397RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
4398 RegExpNode* on_success,
4399 RegExpAssertion::AssertionType type,
4400 RegExpFlags flags) {
4401 ASSERT(flags.NeedsUnicodeCaseEquivalents());
4402 ZoneGrowableArray<CharacterRange>* word_range =
4403 new ZoneGrowableArray<CharacterRange>(2);
4404 CharacterRange::AddClassEscape(type: 'w', ranges: word_range, add_unicode_case_equivalents: true);
4405 int stack_register = compiler->UnicodeLookaroundStackRegister();
4406 int position_register = compiler->UnicodeLookaroundPositionRegister();
4407 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4408 // Add two choices. The (non-)boundary could start with a word or
4409 // a non-word-character.
4410 for (int i = 0; i < 2; i++) {
4411 bool lookbehind_for_word = i == 0;
4412 bool lookahead_for_word =
4413 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
4414 // Look to the left.
4415 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
4416 stack_register, position_register);
4417 RegExpNode* backward = TextNode::CreateForCharacterRanges(
4418 ranges: word_range, read_backward: true, on_success: lookbehind.on_match_success(), flags);
4419 // Look to the right.
4420 RegExpLookaround::Builder lookahead(lookahead_for_word,
4421 lookbehind.ForMatch(match: backward),
4422 stack_register, position_register);
4423 RegExpNode* forward = TextNode::CreateForCharacterRanges(
4424 ranges: word_range, read_backward: false, on_success: lookahead.on_match_success(), flags);
4425 result->AddAlternative(node: GuardedAlternative(lookahead.ForMatch(match: forward)));
4426 }
4427 return result;
4428}
4429} // anonymous namespace
4430
4431RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
4432 RegExpNode* on_success) {
4433 switch (assertion_type()) {
4434 case START_OF_LINE:
4435 return AssertionNode::AfterNewline(on_success);
4436 case START_OF_INPUT:
4437 return AssertionNode::AtStart(on_success);
4438 case BOUNDARY:
4439 return flags_.NeedsUnicodeCaseEquivalents()
4440 ? BoundaryAssertionAsLookaround(compiler, on_success, type: BOUNDARY,
4441 flags: flags_)
4442 : AssertionNode::AtBoundary(on_success);
4443 case NON_BOUNDARY:
4444 return flags_.NeedsUnicodeCaseEquivalents()
4445 ? BoundaryAssertionAsLookaround(compiler, on_success,
4446 type: NON_BOUNDARY, flags: flags_)
4447 : AssertionNode::AtNonBoundary(on_success);
4448 case END_OF_INPUT:
4449 return AssertionNode::AtEnd(on_success);
4450 case END_OF_LINE: {
4451 // Compile $ in multiline regexps as an alternation with a positive
4452 // lookahead in one side and an end-of-input on the other side.
4453 // We need two registers for the lookahead.
4454 intptr_t stack_pointer_register = compiler->AllocateRegister();
4455 intptr_t position_register = compiler->AllocateRegister();
4456 // The ChoiceNode to distinguish between a newline and end-of-input.
4457 ChoiceNode* result = new ChoiceNode(2, on_success->zone());
4458 // Create a newline atom.
4459 ZoneGrowableArray<CharacterRange>* newline_ranges =
4460 new ZoneGrowableArray<CharacterRange>(3);
4461 CharacterRange::AddClassEscape(type: 'n', ranges: newline_ranges);
4462 RegExpCharacterClass* newline_atom =
4463 new RegExpCharacterClass('n', RegExpFlags());
4464 TextNode* newline_matcher =
4465 new TextNode(newline_atom, /*read_backwards=*/false,
4466 ActionNode::PositiveSubmatchSuccess(
4467 stack_reg: stack_pointer_register, position_reg: position_register,
4468 clear_register_count: 0, // No captures inside.
4469 clear_register_from: -1, // Ignored if no captures.
4470 on_success));
4471 // Create an end-of-input matcher.
4472 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4473 stack_reg: stack_pointer_register, position_reg: position_register, on_success: newline_matcher);
4474 // Add the two alternatives to the ChoiceNode.
4475 GuardedAlternative eol_alternative(end_of_line);
4476 result->AddAlternative(node: eol_alternative);
4477 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4478 result->AddAlternative(node: end_alternative);
4479 return result;
4480 }
4481 default:
4482 UNREACHABLE();
4483 }
4484 return on_success;
4485}
4486
4487RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
4488 RegExpNode* on_success) {
4489 return new (OZ) BackReferenceNode(RegExpCapture::StartRegister(index: index()),
4490 RegExpCapture::EndRegister(index: index()), flags_,
4491 compiler->read_backward(), on_success);
4492}
4493
4494RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
4495 RegExpNode* on_success) {
4496 return on_success;
4497}
4498
4499RegExpLookaround::Builder::Builder(bool is_positive,
4500 RegExpNode* on_success,
4501 intptr_t stack_pointer_register,
4502 intptr_t position_register,
4503 intptr_t capture_register_count,
4504 intptr_t capture_register_start)
4505 : is_positive_(is_positive),
4506 on_success_(on_success),
4507 stack_pointer_register_(stack_pointer_register),
4508 position_register_(position_register) {
4509 if (is_positive_) {
4510 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
4511 stack_reg: stack_pointer_register, position_reg: position_register, clear_register_count: capture_register_count,
4512 clear_register_from: capture_register_start, on_success);
4513 } else {
4514 on_match_success_ = new (OZ) NegativeSubmatchSuccess(
4515 stack_pointer_register, position_register, capture_register_count,
4516 capture_register_start, OZ);
4517 }
4518}
4519
4520RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
4521 if (is_positive_) {
4522 return ActionNode::BeginSubmatch(stack_reg: stack_pointer_register_,
4523 position_reg: position_register_, on_success: match);
4524 } else {
4525 Zone* zone = on_success_->zone();
4526 // We use a ChoiceNode to represent the negative lookaround. The first
4527 // alternative is the negative match. On success, the end node backtracks.
4528 // On failure, the second alternative is tried and leads to success.
4529 // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
4530 // first exit when calculating quick checks.
4531 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
4532 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
4533 return ActionNode::BeginSubmatch(stack_reg: stack_pointer_register_,
4534 position_reg: position_register_, on_success: choice_node);
4535 }
4536}
4537
4538RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
4539 RegExpNode* on_success) {
4540 intptr_t stack_pointer_register = compiler->AllocateRegister();
4541 intptr_t position_register = compiler->AllocateRegister();
4542
4543 const intptr_t registers_per_capture = 2;
4544 const intptr_t register_of_first_capture = 2;
4545 intptr_t register_count = capture_count_ * registers_per_capture;
4546 intptr_t register_start =
4547 register_of_first_capture + capture_from_ * registers_per_capture;
4548
4549 RegExpNode* result;
4550 bool was_reading_backward = compiler->read_backward();
4551 compiler->set_read_backward(type() == LOOKBEHIND);
4552 Builder builder(is_positive(), on_success, stack_pointer_register,
4553 position_register, register_count, register_start);
4554 RegExpNode* match = body_->ToNode(compiler, on_success: builder.on_match_success());
4555 result = builder.ForMatch(match);
4556 compiler->set_read_backward(was_reading_backward);
4557 return result;
4558}
4559
4560RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
4561 RegExpNode* on_success) {
4562 return ToNode(body: body(), index: index(), compiler, on_success);
4563}
4564
4565RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4566 intptr_t index,
4567 RegExpCompiler* compiler,
4568 RegExpNode* on_success) {
4569 ASSERT(body != nullptr);
4570 intptr_t start_reg = RegExpCapture::StartRegister(index);
4571 intptr_t end_reg = RegExpCapture::EndRegister(index);
4572 if (compiler->read_backward()) {
4573 intptr_t tmp = end_reg;
4574 end_reg = start_reg;
4575 start_reg = tmp;
4576 }
4577 RegExpNode* store_end = ActionNode::StorePosition(reg: end_reg, is_capture: true, on_success);
4578 RegExpNode* body_node = body->ToNode(compiler, on_success: store_end);
4579 return ActionNode::StorePosition(reg: start_reg, is_capture: true, on_success: body_node);
4580}
4581
4582RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
4583 RegExpNode* on_success) {
4584 ZoneGrowableArray<RegExpTree*>* children = nodes();
4585 RegExpNode* current = on_success;
4586 if (compiler->read_backward()) {
4587 for (intptr_t i = 0; i < children->length(); i++) {
4588 current = children->At(index: i)->ToNode(compiler, on_success: current);
4589 }
4590 } else {
4591 for (intptr_t i = children->length() - 1; i >= 0; i--) {
4592 current = children->At(index: i)->ToNode(compiler, on_success: current);
4593 }
4594 }
4595 return current;
4596}
4597
4598static void AddClass(const int32_t* elmv,
4599 intptr_t elmc,
4600 ZoneGrowableArray<CharacterRange>* ranges) {
4601 elmc--;
4602 ASSERT(elmv[elmc] == kRangeEndMarker);
4603 for (intptr_t i = 0; i < elmc; i += 2) {
4604 ASSERT(elmv[i] < elmv[i + 1]);
4605 ranges->Add(value: CharacterRange(elmv[i], elmv[i + 1] - 1));
4606 }
4607}
4608
4609static void AddClassNegated(const int32_t* elmv,
4610 intptr_t elmc,
4611 ZoneGrowableArray<CharacterRange>* ranges) {
4612 elmc--;
4613 ASSERT(elmv[elmc] == kRangeEndMarker);
4614 ASSERT(elmv[0] != 0x0000);
4615 ASSERT(elmv[elmc - 1] != Utf::kMaxCodePoint);
4616 uint16_t last = 0x0000;
4617 for (intptr_t i = 0; i < elmc; i += 2) {
4618 ASSERT(last <= elmv[i] - 1);
4619 ASSERT(elmv[i] < elmv[i + 1]);
4620 ranges->Add(value: CharacterRange(last, elmv[i] - 1));
4621 last = elmv[i + 1];
4622 }
4623 ranges->Add(value: CharacterRange(last, Utf::kMaxCodePoint));
4624}
4625
4626void CharacterRange::AddClassEscape(uint16_t type,
4627 ZoneGrowableArray<CharacterRange>* ranges,
4628 bool add_unicode_case_equivalents) {
4629 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
4630 // See #sec-runtime-semantics-wordcharacters-abstract-operation
4631 // In case of unicode and ignore_case, we need to create the closure over
4632 // case equivalent characters before negating.
4633 ZoneGrowableArray<CharacterRange>* new_ranges =
4634 new ZoneGrowableArray<CharacterRange>(2);
4635 AddClass(elmv: kWordRanges, elmc: kWordRangeCount, ranges: new_ranges);
4636 AddUnicodeCaseEquivalents(ranges: new_ranges);
4637 if (type == 'W') {
4638 ZoneGrowableArray<CharacterRange>* negated =
4639 new ZoneGrowableArray<CharacterRange>(2);
4640 CharacterRange::Negate(src: new_ranges, dst: negated);
4641 new_ranges = negated;
4642 }
4643 ranges->AddArray(src: *new_ranges);
4644 return;
4645 }
4646 AddClassEscape(type, ranges);
4647}
4648
4649void CharacterRange::AddClassEscape(uint16_t type,
4650 ZoneGrowableArray<CharacterRange>* ranges) {
4651 switch (type) {
4652 case 's':
4653 AddClass(elmv: kSpaceRanges, elmc: kSpaceRangeCount, ranges);
4654 break;
4655 case 'S':
4656 AddClassNegated(elmv: kSpaceRanges, elmc: kSpaceRangeCount, ranges);
4657 break;
4658 case 'w':
4659 AddClass(elmv: kWordRanges, elmc: kWordRangeCount, ranges);
4660 break;
4661 case 'W':
4662 AddClassNegated(elmv: kWordRanges, elmc: kWordRangeCount, ranges);
4663 break;
4664 case 'd':
4665 AddClass(elmv: kDigitRanges, elmc: kDigitRangeCount, ranges);
4666 break;
4667 case 'D':
4668 AddClassNegated(elmv: kDigitRanges, elmc: kDigitRangeCount, ranges);
4669 break;
4670 case '.':
4671 AddClassNegated(elmv: kLineTerminatorRanges, elmc: kLineTerminatorRangeCount, ranges);
4672 break;
4673 // This is not a character range as defined by the spec but a
4674 // convenient shorthand for a character class that matches any
4675 // character.
4676 case '*':
4677 ranges->Add(value: CharacterRange::Everything());
4678 break;
4679 // This is the set of characters matched by the $ and ^ symbols
4680 // in multiline mode.
4681 case 'n':
4682 AddClass(elmv: kLineTerminatorRanges, elmc: kLineTerminatorRangeCount, ranges);
4683 break;
4684 default:
4685 UNREACHABLE();
4686 }
4687}
4688
4689void CharacterRange::AddCaseEquivalents(
4690 ZoneGrowableArray<CharacterRange>* ranges,
4691 bool is_one_byte,
4692 Zone* zone) {
4693 CharacterRange::Canonicalize(ranges);
4694 int range_count = ranges->length();
4695 for (intptr_t i = 0; i < range_count; i++) {
4696 CharacterRange range = ranges->At(index: i);
4697 int32_t bottom = range.from();
4698 if (bottom > Utf16::kMaxCodeUnit) continue;
4699 int32_t top = Utils::Minimum(x: range.to(), y: Utf16::kMaxCodeUnit);
4700 // Nothing to be done for surrogates
4701 if (bottom >= Utf16::kLeadSurrogateStart &&
4702 top <= Utf16::kTrailSurrogateEnd) {
4703 continue;
4704 }
4705 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
4706 if (bottom > Symbols::kMaxOneCharCodeSymbol) continue;
4707 if (top > Symbols::kMaxOneCharCodeSymbol) {
4708 top = Symbols::kMaxOneCharCodeSymbol;
4709 }
4710 }
4711
4712 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
4713 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;
4714 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4715 if (top == bottom) {
4716 // If this is a singleton we just expand the one character.
4717 intptr_t length = jsregexp_uncanonicalize.get(c: bottom, n: '\0', result: chars);
4718 for (intptr_t i = 0; i < length; i++) {
4719 int32_t chr = chars[i];
4720 if (chr != bottom) {
4721 ranges->Add(value: CharacterRange::Singleton(value: chars[i]));
4722 }
4723 }
4724 } else {
4725 // If this is a range we expand the characters block by block,
4726 // expanding contiguous subranges (blocks) one at a time.
4727 // The approach is as follows. For a given start character we
4728 // look up the remainder of the block that contains it (represented
4729 // by the end point), for instance we find 'z' if the character
4730 // is 'c'. A block is characterized by the property
4731 // that all characters uncanonicalize in the same way, except that
4732 // each entry in the result is incremented by the distance from the first
4733 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A']
4734 // and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4735 // Once we've found the end point we look up its uncanonicalization
4736 // and produce a range for each element. For instance for [c-f]
4737 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4738 // add a range if it is not already contained in the input, so [c-f]
4739 // will be skipped but [C-F] will be added. If this range is not
4740 // completely contained in a block we do this for all the blocks
4741 // covered by the range (handling characters that is not in a block
4742 // as a "singleton block").
4743 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4744 intptr_t pos = bottom;
4745 while (pos <= top) {
4746 intptr_t length = jsregexp_canonrange.get(c: pos, n: '\0', result: range);
4747 int32_t block_end;
4748 if (length == 0) {
4749 block_end = pos;
4750 } else {
4751 ASSERT(length == 1);
4752 block_end = range[0];
4753 }
4754 intptr_t end = (block_end > top) ? top : block_end;
4755 length = jsregexp_uncanonicalize.get(c: block_end, n: '\0', result: range);
4756 for (intptr_t i = 0; i < length; i++) {
4757 int32_t c = range[i];
4758 int32_t range_from = c - (block_end - pos);
4759 int32_t range_to = c - (block_end - end);
4760 if (!(bottom <= range_from && range_to <= top)) {
4761 ranges->Add(value: CharacterRange(range_from, range_to));
4762 }
4763 }
4764 pos = end + 1;
4765 }
4766 }
4767 }
4768}
4769
4770bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) {
4771 ASSERT(ranges != nullptr);
4772 intptr_t n = ranges->length();
4773 if (n <= 1) return true;
4774 intptr_t max = ranges->At(index: 0).to();
4775 for (intptr_t i = 1; i < n; i++) {
4776 CharacterRange next_range = ranges->At(index: i);
4777 if (next_range.from() <= max + 1) return false;
4778 max = next_range.to();
4779 }
4780 return true;
4781}
4782
4783ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() {
4784 if (ranges_ == nullptr) {
4785 ranges_ = new ZoneGrowableArray<CharacterRange>(2);
4786 CharacterRange::AddClassEscape(type: standard_set_type_, ranges: ranges_);
4787 }
4788 return ranges_;
4789}
4790
4791// Move a number of elements in a zone array to another position
4792// in the same array. Handles overlapping source and target areas.
4793static void MoveRanges(ZoneGrowableArray<CharacterRange>* list,
4794 intptr_t from,
4795 intptr_t to,
4796 intptr_t count) {
4797 // Ranges are potentially overlapping.
4798 if (from < to) {
4799 for (intptr_t i = count - 1; i >= 0; i--) {
4800 (*list)[to + i] = list->At(index: from + i);
4801 }
4802 } else {
4803 for (intptr_t i = 0; i < count; i++) {
4804 (*list)[to + i] = list->At(index: from + i);
4805 }
4806 }
4807}
4808
4809static intptr_t InsertRangeInCanonicalList(
4810 ZoneGrowableArray<CharacterRange>* list,
4811 intptr_t count,
4812 CharacterRange insert) {
4813 // Inserts a range into list[0..count[, which must be sorted
4814 // by from value and non-overlapping and non-adjacent, using at most
4815 // list[0..count] for the result. Returns the number of resulting
4816 // canonicalized ranges. Inserting a range may collapse existing ranges into
4817 // fewer ranges, so the return value can be anything in the range 1..count+1.
4818 int32_t from = insert.from();
4819 int32_t to = insert.to();
4820 intptr_t start_pos = 0;
4821 intptr_t end_pos = count;
4822 for (intptr_t i = count - 1; i >= 0; i--) {
4823 CharacterRange current = list->At(index: i);
4824 if (current.from() > to + 1) {
4825 end_pos = i;
4826 } else if (current.to() + 1 < from) {
4827 start_pos = i + 1;
4828 break;
4829 }
4830 }
4831
4832 // Inserted range overlaps, or is adjacent to, ranges at positions
4833 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4834 // not affected by the insertion.
4835 // If start_pos == end_pos, the range must be inserted before start_pos.
4836 // if start_pos < end_pos, the entire range from start_pos to end_pos
4837 // must be merged with the insert range.
4838
4839 if (start_pos == end_pos) {
4840 // Insert between existing ranges at position start_pos.
4841 if (start_pos < count) {
4842 MoveRanges(list, from: start_pos, to: start_pos + 1, count: count - start_pos);
4843 }
4844 (*list)[start_pos] = insert;
4845 return count + 1;
4846 }
4847 if (start_pos + 1 == end_pos) {
4848 // Replace single existing range at position start_pos.
4849 CharacterRange to_replace = list->At(index: start_pos);
4850 intptr_t new_from = Utils::Minimum(x: to_replace.from(), y: from);
4851 intptr_t new_to = Utils::Maximum(x: to_replace.to(), y: to);
4852 (*list)[start_pos] = CharacterRange(new_from, new_to);
4853 return count;
4854 }
4855 // Replace a number of existing ranges from start_pos to end_pos - 1.
4856 // Move the remaining ranges down.
4857
4858 intptr_t new_from = Utils::Minimum(x: list->At(index: start_pos).from(), y: from);
4859 intptr_t new_to = Utils::Maximum(x: list->At(index: end_pos - 1).to(), y: to);
4860 if (end_pos < count) {
4861 MoveRanges(list, from: end_pos, to: start_pos + 1, count: count - end_pos);
4862 }
4863 (*list)[start_pos] = CharacterRange(new_from, new_to);
4864 return count - (end_pos - start_pos) + 1;
4865}
4866
4867void CharacterSet::Canonicalize() {
4868 // Special/default classes are always considered canonical. The result
4869 // of calling ranges() will be sorted.
4870 if (ranges_ == nullptr) return;
4871 CharacterRange::Canonicalize(ranges: ranges_);
4872}
4873
4874void CharacterRange::Canonicalize(
4875 ZoneGrowableArray<CharacterRange>* character_ranges) {
4876 if (character_ranges->length() <= 1) return;
4877 // Check whether ranges are already canonical (increasing, non-overlapping,
4878 // non-adjacent).
4879 intptr_t n = character_ranges->length();
4880 intptr_t max = character_ranges->At(index: 0).to();
4881 intptr_t i = 1;
4882 while (i < n) {
4883 CharacterRange current = character_ranges->At(index: i);
4884 if (current.from() <= max + 1) {
4885 break;
4886 }
4887 max = current.to();
4888 i++;
4889 }
4890 // Canonical until the i'th range. If that's all of them, we are done.
4891 if (i == n) return;
4892
4893 // The ranges at index i and forward are not canonicalized. Make them so by
4894 // doing the equivalent of insertion sort (inserting each into the previous
4895 // list, in order).
4896 // Notice that inserting a range can reduce the number of ranges in the
4897 // result due to combining of adjacent and overlapping ranges.
4898 intptr_t read = i; // Range to insert.
4899 intptr_t num_canonical = i; // Length of canonicalized part of list.
4900 do {
4901 num_canonical = InsertRangeInCanonicalList(list: character_ranges, count: num_canonical,
4902 insert: character_ranges->At(index: read));
4903 read++;
4904 } while (read < n);
4905 character_ranges->TruncateTo(length: num_canonical);
4906
4907 ASSERT(CharacterRange::IsCanonical(character_ranges));
4908}
4909
4910void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges,
4911 ZoneGrowableArray<CharacterRange>* negated_ranges) {
4912 ASSERT(CharacterRange::IsCanonical(ranges));
4913 ASSERT(negated_ranges->length() == 0);
4914 intptr_t range_count = ranges->length();
4915 uint32_t from = 0;
4916 intptr_t i = 0;
4917 if (range_count > 0 && ranges->At(index: 0).from() == 0) {
4918 from = ranges->At(index: 0).to();
4919 i = 1;
4920 }
4921 while (i < range_count) {
4922 CharacterRange range = ranges->At(index: i);
4923 negated_ranges->Add(value: CharacterRange(from + 1, range.from() - 1));
4924 from = range.to();
4925 i++;
4926 }
4927 if (from < Utf::kMaxCodePoint) {
4928 negated_ranges->Add(value: CharacterRange(from + 1, Utf::kMaxCodePoint));
4929 }
4930}
4931
4932// -------------------------------------------------------------------
4933// Splay tree
4934
4935// Workaround for the fact that ZoneGrowableArray does not have contains().
4936static bool ArrayContains(ZoneGrowableArray<unsigned>* array, unsigned value) {
4937 for (intptr_t i = 0; i < array->length(); i++) {
4938 if (array->At(index: i) == value) {
4939 return true;
4940 }
4941 }
4942 return false;
4943}
4944
4945OutSet* OutSet::Extend(unsigned value, Zone* zone) {
4946 if (Get(value)) return this;
4947 if (successors() != nullptr) {
4948 for (int i = 0; i < successors()->length(); i++) {
4949 OutSet* successor = successors()->At(index: i);
4950 if (successor->Get(value)) return successor;
4951 }
4952 } else {
4953 successors_ = new (zone) ZoneGrowableArray<OutSet*>(2);
4954 }
4955 OutSet* result = new (zone) OutSet(first_, remaining_);
4956 result->Set(value, zone);
4957 successors()->Add(value: result);
4958 return result;
4959}
4960
4961void OutSet::Set(unsigned value, Zone* zone) {
4962 if (value < kFirstLimit) {
4963 first_ |= (1 << value);
4964 } else {
4965 if (remaining_ == nullptr)
4966 remaining_ = new (zone) ZoneGrowableArray<unsigned>(1);
4967
4968 bool remaining_contains_value = ArrayContains(array: remaining_, value);
4969 if (remaining_->is_empty() || !remaining_contains_value) {
4970 remaining_->Add(value);
4971 }
4972 }
4973}
4974
4975bool OutSet::Get(unsigned value) const {
4976 if (value < kFirstLimit) {
4977 return (first_ & (1 << value)) != 0;
4978 } else if (remaining_ == nullptr) {
4979 return false;
4980 } else {
4981 return ArrayContains(array: remaining_, value);
4982 }
4983}
4984
4985const int32_t ChoiceTable::Config::kNoKey = Utf::kInvalidChar;
4986
4987void ChoiceTable::AddRange(CharacterRange full_range,
4988 int32_t value,
4989 Zone* zone) {
4990 CharacterRange current = full_range;
4991 if (tree()->is_empty()) {
4992 // If this is the first range we just insert into the table.
4993 ZoneSplayTree<Config>::Locator loc;
4994 bool inserted = tree()->Insert(key: current.from(), locator: &loc);
4995 ASSERT(inserted);
4996 USE(inserted);
4997 loc.set_value(
4998 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
4999 return;
5000 }
5001 // First see if there is a range to the left of this one that
5002 // overlaps.
5003 ZoneSplayTree<Config>::Locator loc;
5004 if (tree()->FindGreatestLessThan(key: current.from(), locator: &loc)) {
5005 Entry* entry = &loc.value();
5006 // If we've found a range that overlaps with this one, and it
5007 // starts strictly to the left of this one, we have to fix it
5008 // because the following code only handles ranges that start on
5009 // or after the start point of the range we're adding.
5010 if (entry->from() < current.from() && entry->to() >= current.from()) {
5011 // Snap the overlapping range in half around the start point of
5012 // the range we're adding.
5013 CharacterRange left =
5014 CharacterRange::Range(from: entry->from(), to: current.from() - 1);
5015 CharacterRange right = CharacterRange::Range(from: current.from(), to: entry->to());
5016 // The left part of the overlapping range doesn't overlap.
5017 // Truncate the whole entry to be just the left part.
5018 entry->set_to(left.to());
5019 // The right part is the one that overlaps. We add this part
5020 // to the map and let the next step deal with merging it with
5021 // the range we're adding.
5022 ZoneSplayTree<Config>::Locator loc;
5023 bool inserted = tree()->Insert(key: right.from(), locator: &loc);
5024 ASSERT(inserted);
5025 USE(inserted);
5026 loc.set_value(Entry(right.from(), right.to(), entry->out_set()));
5027 }
5028 }
5029 while (current.is_valid()) {
5030 if (tree()->FindLeastGreaterThan(key: current.from(), locator: &loc) &&
5031 (loc.value().from() <= current.to()) &&
5032 (loc.value().to() >= current.from())) {
5033 Entry* entry = &loc.value();
5034 // We have overlap. If there is space between the start point of
5035 // the range we're adding and where the overlapping range starts
5036 // then we have to add a range covering just that space.
5037 if (current.from() < entry->from()) {
5038 ZoneSplayTree<Config>::Locator ins;
5039 bool inserted = tree()->Insert(key: current.from(), locator: &ins);
5040 ASSERT(inserted);
5041 USE(inserted);
5042 ins.set_value(Entry(current.from(), entry->from() - 1,
5043 empty()->Extend(value, zone)));
5044 current.set_from(entry->from());
5045 }
5046 ASSERT(current.from() == entry->from());
5047 // If the overlapping range extends beyond the one we want to add
5048 // we have to snap the right part off and add it separately.
5049 if (entry->to() > current.to()) {
5050 ZoneSplayTree<Config>::Locator ins;
5051 bool inserted = tree()->Insert(key: current.to() + 1, locator: &ins);
5052 ASSERT(inserted);
5053 USE(inserted);
5054 ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set()));
5055 entry->set_to(current.to());
5056 }
5057 ASSERT(entry->to() <= current.to());
5058 // The overlapping range is now completely contained by the range
5059 // we're adding so we can just update it and move the start point
5060 // of the range we're adding just past it.
5061 entry->AddValue(value, zone);
5062 ASSERT(entry->to() + 1 > current.from());
5063 current.set_from(entry->to() + 1);
5064 } else {
5065 // There is no overlap so we can just add the range
5066 ZoneSplayTree<Config>::Locator ins;
5067 bool inserted = tree()->Insert(key: current.from(), locator: &ins);
5068 ASSERT(inserted);
5069 USE(inserted);
5070 ins.set_value(
5071 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
5072 break;
5073 }
5074 }
5075}
5076
5077OutSet* ChoiceTable::Get(int32_t value) {
5078 ZoneSplayTree<Config>::Locator loc;
5079 if (!tree()->FindGreatestLessThan(key: value, locator: &loc)) return empty();
5080 Entry* entry = &loc.value();
5081 if (value <= entry->to())
5082 return entry->out_set();
5083 else
5084 return empty();
5085}
5086
5087// -------------------------------------------------------------------
5088// Analysis
5089
5090void Analysis::EnsureAnalyzed(RegExpNode* that) {
5091 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
5092 that->info()->being_analyzed = true;
5093 that->Accept(visitor: this);
5094 that->info()->being_analyzed = false;
5095 that->info()->been_analyzed = true;
5096}
5097
5098void Analysis::VisitEnd(EndNode* that) {
5099 // nothing to do
5100}
5101
5102void TextNode::CalculateOffsets() {
5103 intptr_t element_count = elements()->length();
5104 // Set up the offsets of the elements relative to the start. This is a fixed
5105 // quantity since a TextNode can only contain fixed-width things.
5106 intptr_t cp_offset = 0;
5107 for (intptr_t i = 0; i < element_count; i++) {
5108 TextElement& elm = (*elements())[i];
5109 elm.set_cp_offset(cp_offset);
5110 cp_offset += elm.length();
5111 }
5112}
5113
5114void Analysis::VisitText(TextNode* that) {
5115 that->MakeCaseIndependent(is_one_byte: is_one_byte_);
5116 EnsureAnalyzed(that: that->on_success());
5117 if (!has_failed()) {
5118 that->CalculateOffsets();
5119 }
5120}
5121
5122void Analysis::VisitAction(ActionNode* that) {
5123 RegExpNode* target = that->on_success();
5124 EnsureAnalyzed(that: target);
5125 if (!has_failed()) {
5126 // If the next node is interested in what it follows then this node
5127 // has to be interested too so it can pass the information on.
5128 that->info()->AddFromFollowing(that: target->info());
5129 }
5130}
5131
5132void Analysis::VisitChoice(ChoiceNode* that) {
5133 NodeInfo* info = that->info();
5134 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5135 RegExpNode* node = (*that->alternatives())[i].node();
5136 EnsureAnalyzed(that: node);
5137 if (has_failed()) return;
5138 // Anything the following nodes need to know has to be known by
5139 // this node also, so it can pass it on.
5140 info->AddFromFollowing(that: node->info());
5141 }
5142}
5143
5144void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5145 NodeInfo* info = that->info();
5146 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5147 RegExpNode* node = (*that->alternatives())[i].node();
5148 if (node != that->loop_node()) {
5149 EnsureAnalyzed(that: node);
5150 if (has_failed()) return;
5151 info->AddFromFollowing(that: node->info());
5152 }
5153 }
5154 // Check the loop last since it may need the value of this node
5155 // to get a correct result.
5156 EnsureAnalyzed(that: that->loop_node());
5157 if (!has_failed()) {
5158 info->AddFromFollowing(that: that->loop_node()->info());
5159 }
5160}
5161
5162void Analysis::VisitBackReference(BackReferenceNode* that) {
5163 EnsureAnalyzed(that: that->on_success());
5164}
5165
5166void Analysis::VisitAssertion(AssertionNode* that) {
5167 EnsureAnalyzed(that: that->on_success());
5168}
5169
5170void BackReferenceNode::FillInBMInfo(intptr_t offset,
5171 intptr_t budget,
5172 BoyerMooreLookahead* bm,
5173 bool not_at_start) {
5174 // Working out the set of characters that a backreference can match is too
5175 // hard, so we just say that any character can match.
5176 bm->SetRest(offset);
5177 SaveBMInfo(bm, not_at_start, offset);
5178}
5179
5180COMPILE_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5181 RegExpMacroAssembler::kTableSize);
5182
5183void ChoiceNode::FillInBMInfo(intptr_t offset,
5184 intptr_t budget,
5185 BoyerMooreLookahead* bm,
5186 bool not_at_start) {
5187 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
5188 budget = (budget - 1) / alts->length();
5189 for (intptr_t i = 0; i < alts->length(); i++) {
5190 GuardedAlternative& alt = (*alts)[i];
5191 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
5192 bm->SetRest(offset); // Give up trying to fill in info.
5193 SaveBMInfo(bm, not_at_start, offset);
5194 return;
5195 }
5196 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5197 }
5198 SaveBMInfo(bm, not_at_start, offset);
5199}
5200
5201void TextNode::FillInBMInfo(intptr_t initial_offset,
5202 intptr_t budget,
5203 BoyerMooreLookahead* bm,
5204 bool not_at_start) {
5205 if (initial_offset >= bm->length()) return;
5206 intptr_t offset = initial_offset;
5207 intptr_t max_char = bm->max_char();
5208 for (intptr_t i = 0; i < elements()->length(); i++) {
5209 if (offset >= bm->length()) {
5210 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5211 return;
5212 }
5213 TextElement text = elements()->At(index: i);
5214 if (text.text_type() == TextElement::ATOM) {
5215 RegExpAtom* atom = text.atom();
5216 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
5217 if (offset >= bm->length()) {
5218 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5219 return;
5220 }
5221 uint16_t character = atom->data()->At(index: j);
5222 if (atom->flags().IgnoreCase()) {
5223 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5224 intptr_t length = GetCaseIndependentLetters(
5225 character, one_byte_subject: bm->max_char() == Symbols::kMaxOneCharCodeSymbol,
5226 letters: chars);
5227 for (intptr_t j = 0; j < length; j++) {
5228 bm->Set(map_number: offset, character: chars[j]);
5229 }
5230 } else {
5231 if (character <= max_char) bm->Set(map_number: offset, character);
5232 }
5233 }
5234 } else {
5235 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
5236 RegExpCharacterClass* char_class = text.char_class();
5237 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
5238 if (char_class->is_negated()) {
5239 bm->SetAll(offset);
5240 } else {
5241 for (intptr_t k = 0; k < ranges->length(); k++) {
5242 const CharacterRange& range = ranges->At(index: k);
5243 if (range.from() > max_char) continue;
5244 intptr_t to =
5245 Utils::Minimum(x: max_char, y: static_cast<intptr_t>(range.to()));
5246 bm->SetInterval(map_number: offset, interval: Interval(range.from(), to));
5247 }
5248 }
5249 offset++;
5250 }
5251 }
5252 if (offset >= bm->length()) {
5253 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5254 return;
5255 }
5256 on_success()->FillInBMInfo(offset, budget: budget - 1, bm,
5257 not_at_start: true); // Not at start after a text node.
5258 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5259}
5260
5261RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
5262 RegExpNode* on_success,
5263 RegExpFlags flags) {
5264 // If the regexp matching starts within a surrogate pair, step back
5265 // to the lead surrogate and start matching from there.
5266 ASSERT(!compiler->read_backward());
5267 Zone* zone = compiler->zone();
5268
5269 auto lead_surrogates = CharacterRange::List(
5270 zone: on_success->zone(), range: CharacterRange::Range(from: Utf16::kLeadSurrogateStart,
5271 to: Utf16::kLeadSurrogateEnd));
5272 auto trail_surrogates = CharacterRange::List(
5273 zone: on_success->zone(), range: CharacterRange::Range(from: Utf16::kTrailSurrogateStart,
5274 to: Utf16::kTrailSurrogateEnd));
5275
5276 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
5277
5278 int stack_register = compiler->UnicodeLookaroundStackRegister();
5279 int position_register = compiler->UnicodeLookaroundPositionRegister();
5280 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
5281 ranges: lead_surrogates, /*read_backward=*/true, on_success, flags);
5282 RegExpLookaround::Builder builder(/*is_positive=*/true, step_back,
5283 stack_register, position_register);
5284 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
5285 ranges: trail_surrogates, /*read_backward=*/false, on_success: builder.on_match_success(),
5286 flags);
5287
5288 optional_step_back->AddAlternative(
5289 node: GuardedAlternative(builder.ForMatch(match: match_trail)));
5290 optional_step_back->AddAlternative(node: GuardedAlternative(on_success));
5291
5292 return optional_step_back;
5293}
5294
5295#if !defined(DART_PRECOMPILED_RUNTIME)
5296RegExpEngine::CompilationResult RegExpEngine::CompileIR(
5297 RegExpCompileData* data,
5298 const ParsedFunction* parsed_function,
5299 const ZoneGrowableArray<const ICData*>& ic_data_array,
5300 intptr_t osr_id) {
5301 ASSERT(!FLAG_interpret_irregexp);
5302 Zone* zone = Thread::Current()->zone();
5303
5304 const Function& function = parsed_function->function();
5305 const intptr_t specialization_cid = function.string_specialization_cid();
5306 const bool is_sticky = function.is_sticky_specialization();
5307 const bool is_one_byte = (specialization_cid == kOneByteStringCid ||
5308 specialization_cid == kExternalOneByteStringCid);
5309 RegExp& regexp = RegExp::Handle(zone, ptr: function.regexp());
5310 const String& pattern = String::Handle(zone, ptr: regexp.pattern());
5311
5312 ASSERT(!regexp.IsNull());
5313 ASSERT(!pattern.IsNull());
5314
5315 const bool is_global = regexp.flags().IsGlobal();
5316 const bool is_unicode = regexp.flags().IsUnicode();
5317
5318 RegExpCompiler compiler(data->capture_count, is_one_byte);
5319
5320 // TODO(zerny): Frequency sampling is currently disabled because of several
5321 // issues. We do not want to store subject strings in the regexp object since
5322 // they might be long and we should not prevent their garbage collection.
5323 // Passing them to this function explicitly does not help, since we must
5324 // generate exactly the same IR for both the unoptimizing and optimizing
5325 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5326 // An option would be to store sampling results in the regexp object, but
5327 // I'm not sure the performance gains are relevant enough.
5328
5329 // Wrap the body of the regexp in capture #0.
5330 RegExpNode* captured_body =
5331 RegExpCapture::ToNode(body: data->tree, index: 0, compiler: &compiler, on_success: compiler.accept());
5332
5333 RegExpNode* node = captured_body;
5334 const bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5335 const bool is_start_anchored = data->tree->IsAnchoredAtStart();
5336 intptr_t max_length = data->tree->max_match();
5337 if (!is_start_anchored && !is_sticky) {
5338 // Add a .*? at the beginning, outside the body capture, unless
5339 // this expression is anchored at the beginning or is sticky.
5340 RegExpNode* loop_node = RegExpQuantifier::ToNode(
5341 min: 0, max: RegExpTree::kInfinity, is_greedy: false,
5342 body: new (zone) RegExpCharacterClass('*', RegExpFlags()), compiler: &compiler,
5343 on_success: captured_body, not_at_start: data->contains_anchor);
5344
5345 if (data->contains_anchor) {
5346 // Unroll loop once, to take care of the case that might start
5347 // at the start of input.
5348 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5349 first_step_node->AddAlternative(node: GuardedAlternative(captured_body));
5350 first_step_node->AddAlternative(node: GuardedAlternative(new (zone) TextNode(
5351 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5352 /*read_backwards=*/false, loop_node)));
5353 node = first_step_node;
5354 } else {
5355 node = loop_node;
5356 }
5357 }
5358 if (is_one_byte) {
5359 node = node->FilterOneByte(depth: RegExpCompiler::kMaxRecursion);
5360 // Do it again to propagate the new nodes to places where they were not
5361 // put because they had not been calculated yet.
5362 if (node != nullptr) {
5363 node = node->FilterOneByte(depth: RegExpCompiler::kMaxRecursion);
5364 }
5365 } else if (is_unicode && (is_global || is_sticky)) {
5366 node = OptionallyStepBackToLeadSurrogate(compiler: &compiler, on_success: node, flags: regexp.flags());
5367 }
5368
5369 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5370 data->node = node;
5371 Analysis analysis(is_one_byte);
5372 analysis.EnsureAnalyzed(that: node);
5373 if (analysis.has_failed()) {
5374 const char* error_message = analysis.error_message();
5375 return CompilationResult(error_message);
5376 }
5377
5378 // Native regexp implementation.
5379
5380 IRRegExpMacroAssembler* macro_assembler = new (zone)
5381 IRRegExpMacroAssembler(specialization_cid, data->capture_count,
5382 parsed_function, ic_data_array, osr_id, zone);
5383
5384 // Inserted here, instead of in Assembler, because it depends on information
5385 // in the AST that isn't replicated in the Node structure.
5386 const intptr_t kMaxBacksearchLimit = 1024;
5387 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5388 max_length < kMaxBacksearchLimit) {
5389 macro_assembler->SetCurrentPositionFromEnd(max_length);
5390 }
5391
5392 if (is_global) {
5393 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
5394 if (data->tree->min_match() > 0) {
5395 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
5396 } else if (is_unicode) {
5397 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
5398 }
5399 macro_assembler->set_global_mode(mode);
5400 }
5401
5402 RegExpEngine::CompilationResult result =
5403 compiler.Assemble(macro_assembler, start: node, capture_count: data->capture_count, pattern);
5404
5405 if (FLAG_trace_irregexp) {
5406 macro_assembler->PrintBlocks();
5407 }
5408
5409 return result;
5410}
5411#endif // !defined(DART_PRECOMPILED_RUNTIME)
5412
5413RegExpEngine::CompilationResult RegExpEngine::CompileBytecode(
5414 RegExpCompileData* data,
5415 const RegExp& regexp,
5416 bool is_one_byte,
5417 bool is_sticky,
5418 Zone* zone) {
5419 ASSERT(FLAG_interpret_irregexp);
5420 const String& pattern = String::Handle(zone, ptr: regexp.pattern());
5421
5422 ASSERT(!regexp.IsNull());
5423 ASSERT(!pattern.IsNull());
5424
5425 const bool is_global = regexp.flags().IsGlobal();
5426 const bool is_unicode = regexp.flags().IsUnicode();
5427
5428 RegExpCompiler compiler(data->capture_count, is_one_byte);
5429
5430 // TODO(zerny): Frequency sampling is currently disabled because of several
5431 // issues. We do not want to store subject strings in the regexp object since
5432 // they might be long and we should not prevent their garbage collection.
5433 // Passing them to this function explicitly does not help, since we must
5434 // generate exactly the same IR for both the unoptimizing and optimizing
5435 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5436 // An option would be to store sampling results in the regexp object, but
5437 // I'm not sure the performance gains are relevant enough.
5438
5439 // Wrap the body of the regexp in capture #0.
5440 RegExpNode* captured_body =
5441 RegExpCapture::ToNode(body: data->tree, index: 0, compiler: &compiler, on_success: compiler.accept());
5442
5443 RegExpNode* node = captured_body;
5444 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5445 bool is_start_anchored = data->tree->IsAnchoredAtStart();
5446 intptr_t max_length = data->tree->max_match();
5447 if (!is_start_anchored && !is_sticky) {
5448 // Add a .*? at the beginning, outside the body capture, unless
5449 // this expression is anchored at the beginning.
5450 RegExpNode* loop_node = RegExpQuantifier::ToNode(
5451 min: 0, max: RegExpTree::kInfinity, is_greedy: false,
5452 body: new (zone) RegExpCharacterClass('*', RegExpFlags()), compiler: &compiler,
5453 on_success: captured_body, not_at_start: data->contains_anchor);
5454
5455 if (data->contains_anchor) {
5456 // Unroll loop once, to take care of the case that might start
5457 // at the start of input.
5458 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5459 first_step_node->AddAlternative(node: GuardedAlternative(captured_body));
5460 first_step_node->AddAlternative(node: GuardedAlternative(new (zone) TextNode(
5461 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5462 /*read_backwards=*/false, loop_node)));
5463 node = first_step_node;
5464 } else {
5465 node = loop_node;
5466 }
5467 }
5468 if (is_one_byte) {
5469 node = node->FilterOneByte(depth: RegExpCompiler::kMaxRecursion);
5470 // Do it again to propagate the new nodes to places where they were not
5471 // put because they had not been calculated yet.
5472 if (node != nullptr) {
5473 node = node->FilterOneByte(depth: RegExpCompiler::kMaxRecursion);
5474 }
5475 } else if (is_unicode && (is_global || is_sticky)) {
5476 node = OptionallyStepBackToLeadSurrogate(compiler: &compiler, on_success: node, flags: regexp.flags());
5477 }
5478
5479 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5480 data->node = node;
5481 Analysis analysis(is_one_byte);
5482 analysis.EnsureAnalyzed(that: node);
5483 if (analysis.has_failed()) {
5484 const char* error_message = analysis.error_message();
5485 return CompilationResult(error_message);
5486 }
5487
5488 // Bytecode regexp implementation.
5489
5490 ZoneGrowableArray<uint8_t> buffer(zone, 1024);
5491 BytecodeRegExpMacroAssembler* macro_assembler =
5492 new (zone) BytecodeRegExpMacroAssembler(&buffer, zone);
5493
5494 // Inserted here, instead of in Assembler, because it depends on information
5495 // in the AST that isn't replicated in the Node structure.
5496 const intptr_t kMaxBacksearchLimit = 1024;
5497 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5498 max_length < kMaxBacksearchLimit) {
5499 macro_assembler->SetCurrentPositionFromEnd(max_length);
5500 }
5501
5502 if (is_global) {
5503 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
5504 if (data->tree->min_match() > 0) {
5505 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
5506 } else if (is_unicode) {
5507 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
5508 }
5509 macro_assembler->set_global_mode(mode);
5510 }
5511
5512 RegExpEngine::CompilationResult result =
5513 compiler.Assemble(macro_assembler, start: node, capture_count: data->capture_count, pattern);
5514
5515 if (FLAG_trace_irregexp) {
5516 macro_assembler->PrintBlocks();
5517 }
5518
5519 return result;
5520}
5521
5522void CreateSpecializedFunction(Thread* thread,
5523 Zone* zone,
5524 const RegExp& regexp,
5525 intptr_t specialization_cid,
5526 bool sticky,
5527 const Object& owner) {
5528 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
5529
5530 const FunctionType& signature =
5531 FunctionType::Handle(zone, ptr: FunctionType::New());
5532 const String& pattern = String::Handle(zone, ptr: regexp.pattern());
5533 Function& fn =
5534 Function::Handle(zone, ptr: Function::New(signature, name: pattern,
5535 kind: UntaggedFunction::kIrregexpFunction,
5536 is_static: true, // Static.
5537 is_const: false, // Not const.
5538 is_abstract: false, // Not abstract.
5539 is_external: false, // Not external.
5540 is_native: false, // Not native.
5541 owner, token_pos: TokenPosition::kMinSource));
5542
5543 // TODO(zerny): Share these arrays between all irregexp functions.
5544 // TODO(regis): Better, share a common signature.
5545 signature.set_num_fixed_parameters(kParamCount);
5546 signature.set_parameter_types(
5547 Array::Handle(zone, ptr: Array::New(len: kParamCount, space: Heap::kOld)));
5548 fn.CreateNameArray();
5549 signature.SetParameterTypeAt(index: RegExpMacroAssembler::kParamRegExpIndex,
5550 value: Object::dynamic_type());
5551 fn.SetParameterNameAt(index: RegExpMacroAssembler::kParamRegExpIndex,
5552 value: Symbols::This());
5553 signature.SetParameterTypeAt(index: RegExpMacroAssembler::kParamStringIndex,
5554 value: Object::dynamic_type());
5555 fn.SetParameterNameAt(index: RegExpMacroAssembler::kParamStringIndex,
5556 value: Symbols::string_param());
5557 signature.SetParameterTypeAt(index: RegExpMacroAssembler::kParamStartOffsetIndex,
5558 value: Object::dynamic_type());
5559 fn.SetParameterNameAt(index: RegExpMacroAssembler::kParamStartOffsetIndex,
5560 value: Symbols::start_index_param());
5561 signature.set_result_type(Type::Handle(zone, ptr: Type::ArrayType()));
5562
5563 // Cache the result.
5564 regexp.set_function(cid: specialization_cid, sticky, value: fn);
5565
5566 fn.SetRegExpData(regexp, string_specialization_cid: specialization_cid, sticky);
5567 fn.set_is_debuggable(false);
5568
5569 // The function is compiled lazily during the first call.
5570}
5571
5572RegExpPtr RegExpEngine::CreateRegExp(Thread* thread,
5573 const String& pattern,
5574 RegExpFlags flags) {
5575 Zone* zone = thread->zone();
5576 const RegExp& regexp = RegExp::Handle(ptr: RegExp::New(zone));
5577
5578 regexp.set_pattern(pattern);
5579 regexp.set_flags(flags);
5580
5581 // TODO(zerny): We might want to use normal string searching algorithms
5582 // for simple patterns.
5583 regexp.set_is_complex();
5584 regexp.set_is_global(); // All dart regexps are global.
5585
5586 if (!FLAG_interpret_irregexp) {
5587 const Library& lib = Library::Handle(zone, ptr: Library::CoreLibrary());
5588 const Class& owner =
5589 Class::Handle(zone, ptr: lib.LookupClass(name: Symbols::RegExp()));
5590
5591 for (intptr_t cid = kOneByteStringCid; cid <= kExternalTwoByteStringCid;
5592 cid++) {
5593 CreateSpecializedFunction(thread, zone, regexp, specialization_cid: cid, /*sticky=*/false,
5594 owner);
5595 CreateSpecializedFunction(thread, zone, regexp, specialization_cid: cid, /*sticky=*/true,
5596 owner);
5597 }
5598 }
5599
5600 return regexp.ptr();
5601}
5602
5603} // namespace dart
5604

source code of flutter_engine/third_party/dart/runtime/vm/regexp.cc