Import Upstream version 0.14

2022-09-29 15:22:54 +08:00 · 2022-09-29 15:22:54 +08:00 · f03dd0cdde
commit f03dd0cdde
143 changed files with 47281 additions and 0 deletions
--- a/63
+++ b/63
@ -0,0 +1,63 @@
+0.14 2021-01-31
+- Lexical $_ has been removed (Perl 5.24)  (PR #1, thanks Tim Heaney)
+- Expose the NamedCapturingGroups method (PR #2, thanks rouzier)
+- Fix build on macOS by defaulting RE2 to use C++11 ("tr1/unordered_set" isn't provided by the clang C++ library
+  anymore and it's 2021)
+- Link to GitHub issue tracker as CPAN RT is going away.
+
+0.13 2015-01-18
+- Unbreak Windows build
+- Up minimum perl to 5.12
+
+0.12 2015-01-17
+- Fix for `"" =~ {}` crashing
+- Fix for building on perl >= 5.20 (RT #95144, thanks Tony C. for the patch)
+- Fix build with -Werror=format-security (RT #96338)
+
+0.11 2012-07-29
+- Support named capture groups
+- Support perl >= 5.17.1; add a nulled out op_comp to engine struct
+
+0.10 2012-07-24
+- Add missing compat-rx.h file
+
+0.09 2012-04-01 (Brought to you from the 2012 QA Hackathon in Paris)
+- Thread destruction fixes
+
+0.08 2011-04-18
+- Add files I forgot to add
+
+0.07_01 2011-04-16
+- Use cophh API
+- Support -strict mode
+
+0.07 2011-04-11
+- RT #67192: Fix /s support
+- Attempt to compile with -O3 as RE2 does
+- Fix leak in possible_match_range
+- Fix compilation on gcc 4.6 (RE2 issue 35)
+
+0.06 2011-04-02
+- RT #67153: Fix interpolation of RE2 into RE2
+  (qr// stringification included the x flag which RE2 doesn't support)
+
+0.05 2011-02-06
+- Allow setting of RE2's max_mem to control memory bound
+- Improve documentation
+
+0.04 2011-01-29
+- Remove various UNIXisms from RE2, now builds under Win32/Strawberry
+  (still needs gmake installed, which comes with Strawberry)
+
+0.03 2011-01-23
+- Pass more options from MakeMaker to RE2, should now work on x86_64 again
+- Run RE2's own test suite as part of build if we can
+
+0.02 2011-01-22
+- Use ExtUtils::CppGuess and try to find GNU make
+
+0.01 2011-01-16
+- Fixes for //g, captures, generally many things
+
+0.01_01 2010-07-25
+- Initial dev. version
--- a/143
+++ b/143
@ -0,0 +1,143 @@
+Changes
+compat-cophh.h
+compat-rx.h
+lib/re/engine/RE2.pm
+Makefile.PL
+MANIFEST			This list of files
+MANIFEST.SKIP
+ppport.h
+RE2.xs
+re2/.hgignore
+re2/AUTHORS
+re2/CONTRIBUTORS
+re2/libre2.symbols
+re2/libre2.symbols.darwin
+re2/LICENSE
+re2/Makefile
+re2/re2/bitstate.cc
+re2/re2/compile.cc
+re2/re2/dfa.cc
+re2/re2/filtered_re2.cc
+re2/re2/filtered_re2.h
+re2/re2/make_perl_groups.pl
+re2/re2/make_unicode_casefold.py
+re2/re2/make_unicode_groups.py
+re2/re2/Makefile
+re2/re2/mimics_pcre.cc
+re2/re2/nfa.cc
+re2/re2/onepass.cc
+re2/re2/parse.cc
+re2/re2/perl_groups.cc
+re2/re2/prefilter.cc
+re2/re2/prefilter.h
+re2/re2/prefilter_tree.cc
+re2/re2/prefilter_tree.h
+re2/re2/prog.cc
+re2/re2/prog.h
+re2/re2/re2.cc
+re2/re2/re2.h
+re2/re2/regexp.cc
+re2/re2/regexp.h
+re2/re2/set.cc
+re2/re2/set.h
+re2/re2/simplify.cc
+re2/re2/stringpiece.h
+re2/re2/testing/backtrack.cc
+re2/re2/testing/charclass_test.cc
+re2/re2/testing/compile_test.cc
+re2/re2/testing/dfa_test.cc
+re2/re2/testing/dump.cc
+re2/re2/testing/exhaustive1_test.cc
+re2/re2/testing/exhaustive2_test.cc
+re2/re2/testing/exhaustive3_test.cc
+re2/re2/testing/exhaustive_test.cc
+re2/re2/testing/exhaustive_tester.cc
+re2/re2/testing/exhaustive_tester.h
+re2/re2/testing/filtered_re2_test.cc
+re2/re2/testing/mimics_pcre_test.cc
+re2/re2/testing/null_walker.cc
+re2/re2/testing/parse_test.cc
+re2/re2/testing/possible_match_test.cc
+re2/re2/testing/random_test.cc
+re2/re2/testing/re2_arg_test.cc
+re2/re2/testing/re2_test.cc
+re2/re2/testing/regexp_benchmark.cc
+re2/re2/testing/regexp_generator.cc
+re2/re2/testing/regexp_generator.h
+re2/re2/testing/regexp_test.cc
+re2/re2/testing/required_prefix_test.cc
+re2/re2/testing/search_test.cc
+re2/re2/testing/set_test.cc
+re2/re2/testing/simplify_test.cc
+re2/re2/testing/string_generator.cc
+re2/re2/testing/string_generator.h
+re2/re2/testing/string_generator_test.cc
+re2/re2/testing/tester.cc
+re2/re2/testing/tester.h
+re2/re2/testing/unicode_test.py
+re2/re2/tostring.cc
+re2/re2/unicode.py
+re2/re2/unicode_casefold.cc
+re2/re2/unicode_casefold.h
+re2/re2/unicode_groups.cc
+re2/re2/unicode_groups.h
+re2/re2/variadic_function.h
+re2/re2/walker-inl.h
+re2/README
+re2/runtests
+re2/testinstall.cc
+re2/util/arena.cc
+re2/util/arena.h
+re2/util/atomicops.h
+re2/util/benchmark.cc
+re2/util/benchmark.h
+re2/util/flags.h
+re2/util/hash.cc
+re2/util/logging.h
+re2/util/mutex.h
+re2/util/pcre.cc
+re2/util/pcre.h
+re2/util/random.cc
+re2/util/random.h
+re2/util/rune.cc
+re2/util/sparse_array.h
+re2/util/sparse_array_test.cc
+re2/util/sparse_set.h
+re2/util/stringpiece.cc
+re2/util/stringprintf.cc
+re2/util/strutil.cc
+re2/util/test.cc
+re2/util/test.h
+re2/util/thread.cc
+re2/util/thread.h
+re2/util/utf.h
+re2/util/util.h
+re2/util/valgrind.cc
+re2/util/valgrind.h
+re2_xs.cc
+re2_xs.h
+README
+t/00.compile.t
+t/00.re2-tests.t
+t/01.basic.t
+t/02.chars.t
+t/03.modifiers.t
+t/04.multiline.t
+t/05.url.t
+t/06.matchrange.t
+t/07.utf8.t
+t/08.pos.t
+t/09.mem.t
+t/10.options.t
+t/ree-pcre/capture.t
+t/ree-pcre/import.t
+t/ree-pcre/match.t
+t/ree-pcre/qr.t
+t/ree-pcre/s.t
+t/ree-pcre/split-null.t
+t/ree-pcre/split.t
+t/ree-pcre/subexp.t
+t/ree-pcre/unimport.t
+TODO
+META.yml                                 Module YAML meta-data (added by MakeMaker)
+META.json                                Module JSON meta-data (added by MakeMaker)
--- a/MANIFEST.SKIP
+++ b/MANIFEST.SKIP
@ -0,0 +1,46 @@
+#!start included /Users/dgl/.perl5/lib/perl5/ExtUtils/MANIFEST.SKIP
+# Avoid version control files.
+\bRCS\b
+\bCVS\b
+\bSCCS\b
+,v$
+\B\.svn\b
+\B\.git\b
+\B\.gitignore\b
+\b_darcs\b
+
+# Avoid Makemaker generated and utility files.
+\bMANIFEST\.bak
+^Makefile$
+\bblib/
+\bMakeMaker-\d
+\bpm_to_blib\.ts$
+\bpm_to_blib$
+\bblibdirs\.ts$         # 6.18 through 6.25 generated this
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\b_build/
+
+# Avoid temp and backup files.
+~$
+\.old$
+\#$
+\b\.#
+\.bak$
+
+# Avoid Devel::Cover files.
+\bcover_db\b
+#!end included /Users/dgl/.perl5/lib/perl5/ExtUtils/MANIFEST.SKIP
+
+.*\.o$
+.*\.c$
+.*\.bs$
+.*\.gz$
+.*\.tar$
+
+.*\.so$
+MYMETA.yml
+
+^misc/
+^re2/obj/
--- a/META.json
+++ b/META.json
@ -0,0 +1,43 @@
+{
+   "abstract" : "RE2 regex engine",
+   "author" : [
+      "David Leadbeater <dgl@dgl.cx>"
+   ],
+   "dynamic_config" : 1,
+   "generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010",
+   "license" : [
+      "perl_5"
+   ],
+   "meta-spec" : {
+      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+      "version" : 2
+   },
+   "name" : "re-engine-RE2",
+   "no_index" : {
+      "directory" : [
+         "t",
+         "inc"
+      ]
+   },
+   "prereqs" : {
+      "build" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "configure" : {
+         "requires" : {
+            "ExtUtils::CppGuess" : "0",
+            "Test::More" : "0.88"
+         }
+      }
+   },
+   "release_status" : "stable",
+   "resources" : {
+      "bugtracker" : {
+         "web" : "https://github.com/dgl/re-engine-RE2/issues"
+      }
+   },
+   "version" : "0.14",
+   "x_serialization_backend" : "JSON::PP version 2.97001"
+}
--- a/META.yml
+++ b/META.yml
@ -0,0 +1,24 @@
+---
+abstract: 'RE2 regex engine'
+author:
+  - 'David Leadbeater <dgl@dgl.cx>'
+build_requires:
+  ExtUtils::MakeMaker: '0'
+configure_requires:
+  ExtUtils::CppGuess: '0'
+  Test::More: '0.88'
+dynamic_config: 1
+generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010'
+license: perl
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: '1.4'
+name: re-engine-RE2
+no_index:
+  directory:
+    - t
+    - inc
+resources:
+  bugtracker: https://github.com/dgl/re-engine-RE2/issues
+version: '0.14'
+x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
--- a/Makefile.PL
+++ b/Makefile.PL
@ -0,0 +1,132 @@
+use 5.012;
+use strict;
+use warnings;
+
+use Config;
+use ExtUtils::MakeMaker;
+use ExtUtils::CppGuess;
+
+# TODO: Optionally use system libre2, via ExtUtils::Liblist?
+
+# Objects linked into the extension: the compiled XS glue, the C++ wrapper and
+# the bundled RE2 static library (built by the re2/obj/libre2.a rule emitted
+# from MY::postamble).
+my @objects = qw(RE2.o re2_xs.o re2/obj/libre2.a);
+
+# ExtUtils::CppGuess provides the MakeMaker options for compiling C++.
+my $guess = ExtUtils::CppGuess->new;
+
+# Arguments eventually passed to WriteMakefile; the sections below add
+# META_MERGE, OPTIMIZE and DEFINE to this hash.
+my %opt = (
+  NAME               => 're::engine::RE2',
+  AUTHOR             => 'David Leadbeater <dgl@dgl.cx>',
+  VERSION_FROM       => 'lib/re/engine/RE2.pm',
+  ABSTRACT_FROM      => 'lib/re/engine/RE2.pm',
+  LICENSE            => 'perl',
+  INC                => '-Ire2',
+  PMLIBDIRS          => ["lib"],
+  OBJECT             => join(" ", @objects),
+  test               => {TESTS => 't/*.t t/ree-pcre/*.t'},
+  CONFIGURE_REQUIRES => {
+    "ExtUtils::CppGuess" => 0,
+    "Test::More"         => 0.88,
+  },
+  # Listed last, so any key CppGuess returns takes precedence over an
+  # identically named key above.
+  $guess->makemaker_options
+);
+
+# Only add the extra distribution metadata (repository and bug tracker links)
+# when the installed ExtUtils::MakeMaker is at least version 6.46; the eval
+# swallows the exception VERSION throws on older releases.
+if(eval { ExtUtils::MakeMaker->VERSION(6.46) }) {
+  $opt{META_MERGE} = {
+    'meta-spec' => { version => 2 },
+    resources => {
+      repository => 'https://github.com/dgl/re-engine-RE2',
+      bugtracker => {
+        web => 'https://github.com/dgl/re-engine-RE2/issues',
+      }
+    }
+  }
+}
+
+# If the user didn't explicitly provide optimisation settings, we'll try to do
+# it ourselves, but only for gcc.
+#
+# The compiler is taken from a CC=... argument on the command line when one is
+# given, otherwise from perl's own configuration. gcc_version() returns 0 for
+# anything that does not report itself as gcc, which skips this whole section.
+
+my $cc = (map +(/^CC=(.*)/i), @ARGV)[0] || $Config{cc};
+if(!grep(/^OPTIMIZE=/i, @ARGV)
+    and my $gcc_version = gcc_version($cc)) {
+  say "Compiling on gcc $gcc_version";
+  my $optimize = $Config{optimize};
+
+  # RE2 itself builds with -O3, so raise a lower explicit level to match.
+  $optimize =~ s/-O[s0-2]/-O3/ and say "Optimize level set to -O3";
+
+  # Attempt to work out if we have a gcc that is likely to support -flto.
+  # This is probably a lot of work for a minimal gain, but it's worth a try.
+  if($gcc_version >= 4.5) {
+    my $try_optimize = "$optimize -flto";
+    # Try to use this flag
+    if(gcc_try(cc => $cc, %opt, OPTIMIZE => $try_optimize)) {
+      $optimize = $try_optimize;
+
+      # gcc 4.9 needs this otherwise it gets rid of nearly everything in
+      # libre2.a. The flag only has an effect together with -flto, so it is
+      # only probed once -flto itself has been accepted.
+      $try_optimize = "$optimize -ffat-lto-objects";
+      if(gcc_try(cc => $cc, %opt, OPTIMIZE => $try_optimize)) {
+        $optimize = $try_optimize;
+      }
+    }
+  }
+
+  say "OPTIMIZE is now: $optimize";
+  $opt{OPTIMIZE} = $optimize;
+}
+
+# Choose the threading defines; they reach both the XS code and the RE2 build
+# (via $(DEFINE) in RE2_FLAGS).
+my $threaded_perl = defined $Config{usethreads} && $Config{usethreads} eq 'define';
+my $have_pthread  = defined $Config{i_pthread}  && $Config{i_pthread}  eq 'define';
+
+# A threaded perl without pthread support still gets NO_THREADS. For now this
+# allows compilation under Win32/Strawberry, but might cause weird crashes on
+# thread destruction...
+$opt{DEFINE} = ($threaded_perl && $have_pthread)
+  ? "-DHAVE_PTHREAD -pthread"
+  : "-DNO_THREADS";
+
+# RE2's own makefile requires GNU make. Probe the usual command names and
+# remember the first one that identifies itself as GNU Make; MY::postamble
+# interpolates the result into the generated rules. (Ideally the RE2 makefile
+# would be rewritten so that GNU make is not needed.)
+our $MAKE;
+foreach my $candidate (qw(make gmake)) {
+  my $banner = qx{$candidate --version 2>&1};
+  next unless $banner =~ /GNU Make/i;
+  $MAKE = $candidate;
+  last;
+}
+
+die "RE2 currently needs GNU Make, please install gmake.\n"
+  unless $MAKE;
+
+WriteMakefile(%opt);
+
+# Run "<compiler> -v" and return the "major.minor" number from a
+# "gcc version X.Y" banner, or 0 when the output contains no such banner.
+sub gcc_version {
+  my $compiler = shift;
+  my $banner = qx{$compiler -v 2>&1};
+
+  return $1 if $banner =~ /gcc version (\d+\.\d+)/;
+  return 0;
+}
+
+# Check whether the compiler accepts a set of flags by compiling /dev/null
+# with them. Takes a hash with the keys cc, CCFLAGS and OPTIMIZE and returns
+# true when the compiler exits successfully.
+#
+# This is highly gcc and unix specific, but that's where I care about
+# optimising this anyway.
+sub gcc_try {
+  my %args = @_;
+  my $probe = "$args{cc} $args{CCFLAGS} $args{OPTIMIZE} -c -o /dev/null /dev/null >/dev/null 2>&1";
+
+  system($probe);
+  return $? == 0;
+}
+
+# Extra rules appended to the generated Makefile:
+#   re2/obj/libre2.a - builds the bundled RE2 static library
+#   re2-tests        - runs RE2's "static-test" target
+# The heredoc interpolates the Perl variable $MAKE (the GNU make command found
+# above); every \$(...) is escaped so that it reaches the Makefile as a make
+# variable. The compiler and flags chosen by MakeMaker are forwarded to RE2's
+# makefile through RE2_FLAGS.
+sub MY::postamble {
+  return <<MAKE_FRAG;
+
+RE2_FLAGS = CC="\$(CC)" CXXFLAGS="\$(CCFLAGS) \$(CCCDLFLAGS) \$(OPTIMIZE) \$(DEFINE) -DUSE_CXX0X" LDFLAGS="\$(OTHERLDFLAGS) \$(LDLOADLIBS)"
+
+re2/obj/libre2.a: re2/Makefile
+	$MAKE -C re2 obj/libre2.a \$(RE2_FLAGS)
+
+re2-tests:
+	$MAKE -C re2 static-test \$(RE2_FLAGS) LDFLAGS="\$(OTHERLDFLAGS) \$(LDLOADLIBS) -lm -lpthread"
+
+MAKE_FRAG
+}
--- a/RE2.xs
+++ b/RE2.xs
@ -0,0 +1,52 @@
+#include "re2_xs.h"
+#include "ppport.h"
+
+MODULE = re::engine::RE2 PACKAGE = re::engine::RE2
+PROTOTYPES: ENABLE
+
+# Returns the address of re2_engine as an integer. The Perl side stores this
+# value in $^H{regcomp} to select the engine.
+void
+ENGINE(...)
+PROTOTYPE:
+PPCODE:
+	XPUSHs(sv_2mortal(newSViv(PTR2IV(&re2_engine))));
+
+# Returns two values, the minimum and maximum strings computed by
+# RE2_possible_match_range for the regexp, limited to "len" (default 10).
+# Croaks unless self is a reference blessed into exactly "re::engine::RE2".
+#
+# Use a typemap for this maybe, especially if we add more methods like it!
+void
+possible_match_range(SV *self, STRLEN len = 10)
+PROTOTYPE:
+PPCODE:
+        REGEXP* rx;
+        SV *possible_min, *possible_max;
+
+        /* Reject anything that is not a qr// object compiled by this engine. */
+        if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
+                croak("qr// reference to a re::engine::RE2 instance required");
+        rx = SvRX(self);
+
+        RE2_possible_match_range(aTHX_ rx, len, &possible_min, &possible_max);
+
+        /* Both results are pushed as mortals. */
+        mXPUSHs(possible_min);
+        mXPUSHs(possible_max);
+
+# Returns the hash produced by RE2_named_captures for the regexp; per the
+# module documentation it maps capture names to group indexes.
+# Croaks unless self is a reference blessed into exactly "re::engine::RE2".
+HV*
+named_captures(SV *self)
+PROTOTYPE:
+CODE:
+        REGEXP* rx;
+        /* Reject anything that is not a qr// object compiled by this engine. */
+        if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
+                croak("qr// reference to a re::engine::RE2 instance required");
+        rx = SvRX(self);
+        RETVAL = RE2_named_captures(aTHX_ rx);
+OUTPUT:
+        RETVAL
+
+# Returns the integer reported by RE2_number_of_capture_groups for the regexp.
+# Croaks unless self is a reference blessed into exactly "re::engine::RE2".
+int
+number_of_capture_groups(SV *self)
+PROTOTYPE:
+CODE:
+        REGEXP* rx;
+        /* Reject anything that is not a qr// object compiled by this engine. */
+        if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
+                croak("qr// reference to a re::engine::RE2 instance required");
+        rx = SvRX(self);
+        RETVAL = RE2_number_of_capture_groups(aTHX_ rx);
+OUTPUT:
+        RETVAL
--- a/145
+++ b/145
@ -0,0 +1,145 @@
+NAME
+    re::engine::RE2 - RE2 regex engine
+
+SYNOPSIS
+        use re::engine::RE2;
+
+        if ("Hello, world" =~ /Hello, (world)/) {
+            print "Greetings, $1!";
+        }
+
+DESCRIPTION
+    This module replaces perl's regex engine in a given lexical scope with
+    RE2.
+
+    RE2 is a primarily DFA based regexp engine from Google that is very fast
+    at matching large amounts of text. However it does not support look
+    behind and some other Perl regular expression features. See
+    http://code.google.com/p/re2 for more information.
+
+    Fallback to normal Perl regexp is implemented by this module. If RE2 is
+    unable to compile a regexp it will use Perl instead, therefore features
+    not implemented by RE2 don't suddenly stop working, they will just use
+    Perl's regexp implementation.
+
+METHODS
+    To access extra functionality of RE2, methods can be called on a compiled
+    regular expression (i.e. a "qr//").
+
+    *   "possible_match_range([length = 10])"
+
+        Returns an array of two strings: where the expression will start
+        matching and just after where it will finish matching. See RE2's
+        documentation on PossibleMatchRange for further details.
+
+        Example:
+
+            my($min, $max) = qr/^(a|b)/->possible_match_range;
+            is $min, 'a';
+            is $max, 'c';
+
+PERFORMANCE
+    Performance is really the primary reason for using RE2, so here's some
+    benchmarks. Like any benchmark take them with a pinch of salt.
+
+  Simple matching
+      my $foo = "foo bar baz";
+      $foo =~ /foo/;
+      $foo =~ /foox/;
+
+    On this very simple match RE2 is actually slower:
+
+               Rate  re2   re
+      re2  674634/s   -- -76%
+      re  2765739/s 310%   --
+
+  URL matching
+    Matching "m{([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^
+    @]+)}" against a several KB file:
+
+            Rate    re   re2
+      re  35.2/s    --  -99%
+      re2 2511/s 7037%    --
+
+  Many alternatives
+    Matching a string against a regexp with 17,576 alternatives ("aaa ..
+    zzz").
+
+    This uses trie matching on Perl (obviously RE2 does similar by default).
+
+      $ perl misc/altern.pl
+              Rate   re  re2
+      re   52631/s   -- -91%
+      re2 554938/s 954%   --
+
+NOTES
+    *   No support for "m//x"
+
+        The "/x" modifier is not supported. (There's no particular reason
+        for this, just RE2 itself doesn't support it). Fallback to Perl
+        regexp will happen automatically if "//x" is used.
+
+    *   "re2/dfa.cc:447: DFA out of memory: prog size xxx mem yyy"
+
+        If you attempt to compile a really large regular expression you may
+        get this error. RE2 has an internal limit on memory consumption for
+        the DFA state tables. By default this is 8 MiB.
+
+        If you need to increase this size then use the max_mem parameter:
+
+          use re::engine::RE2 -max_mem => 8<<23; # 64MiB
+
+    *   How do I tell if RE2 will be used?
+
+        See if your regexp is matching quickly or slowly ;).
+
+        Alternatively normal OO concepts apply and you may examine the
+        object returned by "qr//":
+
+          use re::engine::RE2;
+
+          ok qr/foo/->isa("re::engine::RE2");
+
+          # Perl Regexp used instead
+          ok not qr/(?<=foo)bar/->isa("re::engine::RE2");
+
+BUGS
+    Known issues:
+
+    *   Unicode handling
+
+        Currently the Unicode handling of re::engine::RE2 does not fully
+        match Perl's behaviour.
+
+        The UTF-8 flag of the regexp currently determines how the string is
+        matched. This is obviously broken, so will be fixed at some point.
+
+    *   Final newline matching differs to Perl
+
+          "\n" =~ /$/
+
+        The above is true in Perl, false in RE2. To work around the issue
+        you can write "\n?\z" when you mean Perl's "$".
+
+    Please report bugs or provide patches at
+    <https://github.com/dgl/re-engine-RE2>.
+
+AUTHORS
+    David Leadbeater <dgl[at]dgl[dot]cx>
+
+COPYRIGHT
+    Copyright 2010 David Leadbeater.
+
+    Based on re::engine::PCRE:
+
+    Copyright 2007 Ævar Arnfjörð Bjarmason.
+
+    The original version was copyright 2006 Audrey Tang <cpan@audreyt.org>
+    and Yves Orton.
+
+    This program is free software; you can redistribute it and/or modify it
+    under the same terms as Perl itself.
+
+    (However the bundled copy of RE2 has a different copyright owner and is
+    under a BSD-like license, see re2/LICENSE.)
+
--- a/40
+++ b/40
@ -0,0 +1,40 @@
+# -*- mode: org -*-
+
+* Fix UTF-8 support
+  This turns out to be harder than I was thinking. The first step is to compile
+  two versions of the regexp, one for matching UTF-8 and one for matching
+  Latin1 (maybe on demand).
+
+  RE2 won't accept \x{...} escapes that are greater than the current character
+  set. I was hoping it would be possible to give a string containing these to
+  RE2 then let RE2 realise part of it won't match (e.g. (?:foo|\x{1234}) will
+  still match foo, even if the input string isn't UTF-8).
+
+  (I'm only talking about \x{...}; this is the only case I have to
+  care about, \p{...} *are* accepted by RE2 regardless. Due to Perl's
+  behaviour we can't have raw UTF-8 in the string if the UTF-8 flag
+  isn't on.)
+
+  The approach for now will probably be to replace \x{nnn} in strings (where
+  nnn>0xFF) with something that won't match (maybe [^\x00-\xff]), but allows
+  the other branches to match.
+** Think about supporting perl 5.14's unicode regexp flags
+  At least at the top level, implementing within RE2 would be silly.
+
+  RE2 doesn't have all the behaviours perl does (i.e. /a is implied
+  for \d, etc.). Might just be a case of documenting what RE2 does,
+  once UTF-8 is working to some extent.  An alternative could be to
+  make things explicit (e.g. you need to say "no feature
+  'unicode_strings'" if you happen to have enabled them to use RE2).
+* Switch to dzil
+* Support more options
+** never_nl could be useful for cpangrep optimisations
+* Support RE2::Set functionality
+  i.e. a Regexp::RE2::Set class that can have RE2 regexps added into it
+  then either a match method or maybe overload ~~?
+* Improve tests
+** See if t/re/re_tests from Perl can be used.
+** Improve performance comparisons
+   See maybe https://github.com/axiak/pyre2/blob/master/tests/performance.py
+* Support /x (probably needs RE2 changes to do properly)
+* Both Perl and RE2 store the stringification of the regexp, can we avoid this?
--- a/compat-cophh.h
+++ b/compat-cophh.h
@ -0,0 +1,10 @@
+/* Compatibility for bits of the cophh API which was added in 5.13.7.
+ * This uses refcounted_he_* functions that are not part of the public perl
+ * API, therefore won't work on platforms with strict linkers (Windows, AIX).
+ */
+#if PERL_VERSION < 13 || (PERL_VERSION == 13 && PERL_SUBVERSION < 7)
+
+/* "key" must be a string literal: its length is taken as sizeof(key) - 1. */
+#define cophh_fetch_pvs(cophh, key, flags) \
+    Perl_refcounted_he_fetch(aTHX_ cophh, NULL, key, sizeof(key) - 1, 0, flags)
+
+#endif
--- a/compat-rx.h
+++ b/compat-rx.h
@ -0,0 +1,7 @@
+/* Compatibility for RX_* macros added around 5.10.1. */
+
+/* Only defined when the perl headers do not already provide RX_WRAPPED; both
+ * fallbacks read the field directly from the regexp structure. */
+#ifndef RX_WRAPPED
+#define RX_WRAPPED(prog) ((prog)->wrapped)
+#define RX_WRAPLEN(prog) ((prog)->wraplen)
+#endif
+
--- a/lib/re/engine/RE2.pm
+++ b/lib/re/engine/RE2.pm
@ -0,0 +1,270 @@
+package re::engine::RE2;
+use 5.012;
+
+# The version is assigned inside BEGIN so that it is already set when the
+# BEGIN block further down hands it to XSLoader::load.
+BEGIN {
+  $re::engine::RE2::VERSION = "0.14";
+}
+
+use XSLoader ();
+
+# All engines should subclass the core Regexp package
+our @ISA = 'Regexp';
+
+# Load the XS half of the module at compile time. import/unimport below call
+# ENGINE as a bareword; NOTE(review): presumably that needs the XS sub to
+# exist before they are compiled - confirm.
+BEGIN
+{
+    XSLoader::load __PACKAGE__, $re::engine::RE2::VERSION;
+}
+
+# Enable the engine for the caller's scope and record any pragma options in
+# %^H.
+#
+# Called as:
+#   use re::engine::RE2 -max_mem => N, -strict => 1,
+#                       -longest_match => 1, -never_nl => 1;
+sub import
+{
+    my ($class, @options) = @_;
+
+    $^H{regcomp} = ENGINE;
+
+    if (@options) {
+        my %args = @options;
+
+        # Pragma option name => suffix of the hint key it is stored under
+        # (the key is this package name followed by the suffix).
+        my @hint_for = (
+            [ "-max_mem",       "::max-mem"       ],
+            [ "-strict",        "::strict"        ],
+            [ "-longest_match", "::longest-match" ],
+            [ "-never_nl",      "::never-nl"      ],
+        );
+
+        foreach my $entry (@hint_for) {
+            my ($option, $suffix) = @$entry;
+            next unless exists $args{$option};
+            $^H{__PACKAGE__ . $suffix} = $args{$option};
+        }
+    }
+}
+
+# Remove the regcomp hint again, but only when it currently refers to this
+# engine.
+sub unimport
+{
+    if ($^H{regcomp} == ENGINE) {
+        delete $^H{regcomp};
+    }
+}
+
+1;
+
+__END__
+
+=encoding utf8
+
+=head1 NAME
+
+re::engine::RE2 - RE2 regex engine
+
+=head1 SYNOPSIS
+
+    use re::engine::RE2;
+
+    if ("Hello, world" =~ /Hello, (world)/) {
+        print "Greetings, $1!";
+    }
+
+=head1 DESCRIPTION
+
+This module replaces perl's regex engine in a given lexical scope with RE2.
+
+RE2 is a primarily DFA based regexp engine from Google that is very fast at
+matching large amounts of text. However it does not support look behind and
+some other Perl regular expression features. See
+L<RE2's website|http://code.google.com/p/re2> for more information.
+
+Fallback to normal Perl regexp is implemented by this module. If RE2 is unable
+to compile a regexp it will use Perl instead, therefore features not
+implemented by RE2 don't suddenly stop working, they will just use Perl's
+regexp implementation.
+
+=head1 METHODS
+
+To access extra functionality of RE2, methods can be called on a compiled
+regular expression (i.e. a C<qr//>).
+
+=over 4
+
+=item * C<possible_match_range([length = 10])>
+
+Returns an array of two strings: where the expression will start matching and
+just after where it will finish matching. See RE2's documentation on
+PossibleMatchRange for further details.
+
+Example:
+
+    my($min, $max) = qr/^(a|b)/->possible_match_range;
+    is $min, 'a';
+    is $max, 'c';
+
+=item * C<named_captures()>
+
+Returns a hash reference mapping each named capture group to its index.
+
+Example:
+
+    my $named_captures = qr/(?P<a>\w+) (?P<d>\w+)/->named_captures;
+    is $named_captures->{a}, 1;
+    is $named_captures->{d}, 2;
+
+=item * C<number_of_capture_groups()>
+
+Returns the number of capture groups.
+
+Example:
+
+    my $captures = qr/(Hello), (world)/->number_of_capture_groups;
+    is $captures, 2;
+
+=back
+
+=head1 PRAGMA OPTIONS
+
+Various options can be set by providing options to the C<use> line. These will
+be pragma scoped.
+
+=over 4
+
+=item * C<< -max_mem => 1<<24 >>
+
+Configure RE2's memory limit.
+
+=item * C<< -strict => 1 >>
+
+Be strict, i.e. don't allow regexps that are not supported by RE2.
+
+=item * C<< -longest_match => 1 >>
+
+Match on the longest match in alternations. For example with this option set
+matching C<"abc"> against C<(a|abc)> will match C<"abc">, without depending on
+order.
+
+=item * C<< -never_nl => 1 >>
+
+Never match a newline (C<"\n">) even if the provided regexp contains it.
+
+=back
+
+=head1 PERFORMANCE
+
+Performance is really the primary reason for using RE2, so here's some
+benchmarks. Like any benchmark take them with a pinch of salt.
+
+=head2 Simple matching
+
+  my $foo = "foo bar baz";
+  $foo =~ /foo/;
+  $foo =~ /foox/;
+
+On this very simple match RE2 is actually slower:
+
+           Rate  re2   re
+  re2  674634/s   -- -76%
+  re  2765739/s 310%   --
+
+=head2 URL matching
+
+Matching C<m{([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^
+@]+)}> against a several KB file:
+
+        Rate    re   re2
+  re  35.2/s    --  -99%
+  re2 2511/s 7037%    --
+
+=head2 Many alternatives
+
+Matching a string against a regexp with 17,576 alternatives (C<aaa .. zzz>).
+
+This uses trie matching on Perl (obviously RE2 does similar by default).
+
+  $ perl misc/altern.pl
+          Rate   re  re2
+  re   52631/s   -- -91%
+  re2 554938/s 954%   --
+
+=head1 NOTES
+
+=over 4
+
+=item * No support for C<m//x>
+
+The C</x> modifier is not supported. (There's no particular reason for this,
+just RE2 itself doesn't support it). Fallback to Perl regexp will happen
+automatically if C<//x> is used.
+
+=item * "re2/dfa.cc:447: DFA out of memory: prog size xxx mem yyy"
+
+If you attempt to compile a really large regular expression you may get this
+error. RE2 has an internal limit on memory consumption for the DFA state
+tables. By default this is 8 MiB.
+
+If you need to increase this size then use the max_mem parameter:
+
+  use re::engine::RE2 -max_mem => 8<<23; # 64MiB
+
+=item * How do I tell if RE2 will be used?
+
+See if your regexp is matching quickly or slowly ;).
+
+Alternatively normal OO concepts apply and you may examine the object returned
+by C<qr//>:
+
+  use re::engine::RE2;
+
+  ok qr/foo/->isa("re::engine::RE2");
+
+  # Perl Regexp used instead
+  ok not qr/(?<=foo)bar/->isa("re::engine::RE2");
+
+If you wish to force RE2, use the C<-strict> option.
+
+=back
+
+=head1 BUGS
+
+Known issues:
+
+=over 4
+
+=item * Unicode handling
+
+Currently the Unicode handling of re::engine::RE2 does not fully match Perl's
+behaviour.
+
+The UTF-8 flag of the regexp currently determines how the string is matched.
+This is obviously broken, so will be fixed at some point.
+
+=item * Final newline matching differs to Perl
+
+  "\n" =~ /$/
+
+The above is true in Perl, false in RE2. To work around the issue you can write
+C<\n?\z> when you mean Perl's C<$>.
+
+=back
+
+Please report bugs or provide patches at <https://github.com/dgl/re-engine-RE2>.
+
+=head1 AUTHORS
+
+David Leadbeater E<lt>dgl[at]dgl[dot]cxE<gt>
+
+=head1 COPYRIGHT
+
+Copyright 2010 David Leadbeater.
+
+Based on L<re::engine::PCRE>:
+
+Copyright 2007 E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason.
+
+The original version was copyright 2006 Audrey Tang
+E<lt>cpan@audreyt.orgE<gt> and Yves Orton.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+(However the bundled copy of RE2 has a different copyright owner and is under a
+BSD-like license, see F<re2/LICENSE>.)
+
+=cut
--- a/ppport.h
+++ b/ppport.h
--- a/re2/.hgignore
+++ b/re2/.hgignore
@ -0,0 +1,7 @@
+syntax:glob
+*.pyc
+*.orig
+core
+
+syntax:regexp
+^obj/
--- a/re2/AUTHORS
+++ b/re2/AUTHORS
@ -0,0 +1,12 @@
+# This is the official list of RE2 authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Google Inc.
+Stefano Rivera <stefano.rivera@gmail.com>
--- a/re2/CONTRIBUTORS
+++ b/re2/CONTRIBUTORS
@ -0,0 +1,33 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the RE2 repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+#     http://code.google.com/legal/individual-cla-v1.0.html
+#     http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+#     Name <email address>
+
+# Please keep the list sorted.
+
+Rob Pike <r@google.com>
+Russ Cox <rsc@swtch.com>
+Sanjay Ghemawat <sanjay@google.com>
+Stefano Rivera <stefano.rivera@gmail.com>
+Srinivasan Venkatachary <vsri@google.com>
--- a/re2/LICENSE
+++ b/re2/LICENSE
@ -0,0 +1,27 @@
+// Copyright (c) 2009 The RE2 Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/re2/Makefile
+++ b/re2/Makefile
@ -0,0 +1,287 @@
+# Copyright 2009 The RE2 Authors.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+all: obj/libre2.a obj/so/libre2.so
+
+# to build against PCRE for testing or benchmarking,
+# uncomment the next two lines
+# CCPCRE=-I/usr/local/include -DUSEPCRE
+# LDPCRE=-L/usr/local/lib -lpcre
+
+#CC=g++
+#CXXFLAGS=-Wall -O3 -g -pthread # can override
+RE2_CXXFLAGS=-Wno-sign-compare -c -I. $(CCPCRE)  # required
+#LDFLAGS=-pthread
+AR=ar
+ARFLAGS=rsc
+NM=nm
+NMFLAGS=-p
+
+# Variables mandated by GNU, the arbiter of all good taste on the internet.
+# http://www.gnu.org/prep/standards/standards.html
+prefix=/usr/local
+exec_prefix=$(prefix)
+bindir=$(exec_prefix)/bin
+includedir=$(prefix)/include
+libdir=$(exec_prefix)/lib
+INSTALL=install
+INSTALL_PROGRAM=$(INSTALL)
+INSTALL_DATA=$(INSTALL) -m 644
+
+# ABI version
+# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
+SONAME=0
+
+# To rebuild the Tables generated by Perl and Python scripts (requires Internet
+# access for Unicode data), uncomment the following line:
+# REBUILD_TABLES=1
+
+#ifeq ($(shell uname),Darwin)
+#MAKE_SHARED_LIBRARY=g++ -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
+#else
+#MAKE_SHARED_LIBRARY=g++ -shared -Wl,-soname,libre2.so.0,--version-script=libre2.symbols $(LDFLAGS)
+#endif
+
+INSTALL_HFILES=\
+	re2/re2.h\
+	re2/set.h\
+	re2/stringpiece.h\
+	re2/variadic_function.h\
+
+HFILES=\
+	util/arena.h\
+	util/atomicops.h\
+	util/benchmark.h\
+	util/flags.h\
+	util/logging.h\
+	util/mutex.h\
+	util/pcre.h\
+	util/random.h\
+	util/sparse_array.h\
+	util/sparse_set.h\
+	util/test.h\
+	util/utf.h\
+	util/util.h\
+	util/valgrind.h\
+	re2/filtered_re2.h\
+	re2/prefilter.h\
+	re2/prefilter_tree.h\
+	re2/prog.h\
+	re2/re2.h\
+	re2/regexp.h\
+	re2/set.h\
+	re2/stringpiece.h\
+	re2/testing/exhaustive_tester.h\
+	re2/testing/regexp_generator.h\
+	re2/testing/string_generator.h\
+	re2/testing/tester.h\
+	re2/unicode_casefold.h\
+	re2/unicode_groups.h\
+	re2/variadic_function.h\
+	re2/walker-inl.h\
+
+OFILES=\
+	obj/util/arena.o\
+	obj/util/hash.o\
+	obj/util/rune.o\
+	obj/util/stringpiece.o\
+	obj/util/stringprintf.o\
+	obj/util/strutil.o\
+	obj/util/valgrind.o\
+	obj/re2/bitstate.o\
+	obj/re2/compile.o\
+	obj/re2/dfa.o\
+	obj/re2/filtered_re2.o\
+	obj/re2/mimics_pcre.o\
+	obj/re2/nfa.o\
+	obj/re2/onepass.o\
+	obj/re2/parse.o\
+	obj/re2/perl_groups.o\
+	obj/re2/prefilter.o\
+	obj/re2/prefilter_tree.o\
+	obj/re2/prog.o\
+	obj/re2/re2.o\
+	obj/re2/regexp.o\
+	obj/re2/set.o\
+	obj/re2/simplify.o\
+	obj/re2/tostring.o\
+	obj/re2/unicode_casefold.o\
+	obj/re2/unicode_groups.o\
+
+TESTOFILES=\
+	obj/util/pcre.o\
+	obj/util/random.o\
+	obj/util/thread.o\
+	obj/re2/testing/backtrack.o\
+	obj/re2/testing/dump.o\
+	obj/re2/testing/exhaustive_tester.o\
+	obj/re2/testing/null_walker.o\
+	obj/re2/testing/regexp_generator.o\
+	obj/re2/testing/string_generator.o\
+	obj/re2/testing/tester.o\
+
+TESTS=\
+	obj/test/charclass_test\
+	obj/test/compile_test\
+	obj/test/filtered_re2_test\
+	obj/test/mimics_pcre_test\
+	obj/test/parse_test\
+	obj/test/possible_match_test\
+	obj/test/re2_test\
+	obj/test/re2_arg_test\
+	obj/test/regexp_test\
+	obj/test/required_prefix_test\
+	obj/test/search_test\
+	obj/test/set_test\
+	obj/test/simplify_test\
+	obj/test/string_generator_test\
+
+BIGTESTS=\
+	obj/test/dfa_test\
+	obj/test/exhaustive1_test\
+	obj/test/exhaustive2_test\
+	obj/test/exhaustive3_test\
+	obj/test/exhaustive_test\
+	obj/test/random_test\
+
+SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
+STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES))
+STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
+SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
+
+DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
+DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
+DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
+DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
+
+obj:
+	mkdir $@
+
+obj/re2: obj
+	cd obj && mkdir re2 || echo Okay
+
+obj/util: obj
+	cd obj && mkdir util || echo Okay
+
+obj/test: obj
+	cd obj && mkdir test || echo Okay
+
+obj/re2/testing: obj/re2
+	cd obj/re2 && mkdir testing || echo Okay
+
+obj/%.o: obj/re2 obj/re2/testing obj/util %.cc $(HFILES)
+	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
+
+obj/dbg/%.o: obj/dbg %.cc $(HFILES)
+	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
+
+obj/so/%.o: obj/so %.cc $(HFILES)
+	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
+
+obj/%.o: obj %.c $(HFILES)
+	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
+
+obj/dbg/%.o: obj/dbg %.c $(HFILES)
+	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) $*.c
+
+obj/so/%.o: obj/so %.c $(HFILES)
+	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
+
+obj/libre2.a: $(OFILES)
+	$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
+
+obj/dbg/libre2.a: obj/dbg $(DOFILES)
+	$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
+
+obj/so/libre2.so: obj/so $(SOFILES)
+	$(MAKE_SHARED_LIBRARY) -o $@.0 $(SOFILES)
+	ln -sf libre2.so.0 $@
+
+obj/test/%: obj/test obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
+	$(CC) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
+
+obj/dbg/test/%: obj/dbg/test obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
+	$(CC) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
+
+obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
+	$(CC) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
+
+obj/test/regexp_benchmark: obj/test obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
+	$(CC) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
+
+ifdef REBUILD_TABLES
+re2/perl_groups.cc: re2/make_perl_groups.pl
+	perl $< > $@
+
+re2/unicode_%.cc: re2/make_unicode_%.py
+	python $< > $@
+endif
+
+distclean: clean
+	rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
+
+clean:
+	rm -rf obj
+	rm -f re2/*.pyc
+
+testofiles: $(TESTOFILES)
+
+test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
+
+debug-test: $(DTESTS)
+	@echo
+	@echo Running debug binary tests.
+	@echo
+	@./runtests $(DTESTS)
+
+static-test: $(TESTS)
+	@echo
+	@echo Running static binary tests.
+	@echo
+	@./runtests $(TESTS)
+
+shared-test: $(STESTS)
+	@echo
+	@echo Running dynamic binary tests.
+	@echo
+	@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS)
+
+debug-bigtest: $(DTESTS) $(DBIGTESTS)
+	@./runtests $(DTESTS) $(DBIGTESTS)
+
+static-bigtest: $(TESTS) $(BIGTESTS)
+	@./runtests $(TESTS) $(BIGTESTS)
+
+shared-bigtest: $(STESTS) $(SBIGTESTS)
+	@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS)
+
+benchmark: obj/test/regexp_benchmark
+
+# Install the public headers, the static archive and the versioned shared
+# library.  DESTDIR is a staging prefix: it is prepended to every
+# destination path (never to the files being copied) so that packagers
+# can install into a scratch root.
+install: obj/libre2.a obj/so/libre2.so.0
+	mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)
+	$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
+	$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
+	$(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
+	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
+	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so
+
+testinstall:
+	@mkdir -p obj
+	cp testinstall.cc obj
+	(cd obj && g++ -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
+	LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
+
+benchlog: obj/test/regexp_benchmark
+	(echo '==BENCHMARK==' `hostname` `date`; \
+	  (uname -a; g++ --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
+	  echo; \
+	  ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
+
+# Keep gmake from deleting intermediate files it creates.
+# This makes repeated builds faster and preserves debug info on OS X.
+
+.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
+	obj/dbg/libre2.a obj/so/libre2.a \
+	obj/test/% obj/so/test/% obj/dbg/test/%
+
--- a/re2/README
+++ b/re2/README
@ -0,0 +1,19 @@
+This is the source code repository for RE2, a regular expression library.
+
+For documentation about how to install and use RE2,
+visit http://code.google.com/p/re2/.
+
+The short version is:
+
+make
+make test
+make install
+make testinstall
+
+Unless otherwise noted, the RE2 source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+RE2's native language is C++.
+An Inferno wrapper is at http://code.google.com/p/inferno-re2/.
+A Python wrapper is at http://github.com/facebook/pyre2/.
+A Ruby wrapper is at http://github.com/axic/rre2/.
--- a/re2/libre2.symbols
+++ b/re2/libre2.symbols
@ -0,0 +1,15 @@
+{
+	global:
+		# re2::RE2*
+		_ZN3re23RE2*;
+		_ZNK3re23RE2*;
+		# re2::StringPiece*
+		_ZN3re211StringPiece*;
+		_ZNK3re211StringPiece*;
+		# operator==(re2::StringPiece const&, re2::StringPiece const&)
+		_ZeqRKN3re211StringPieceES2_;
+		# operator<<(std::ostream&, re2::StringPiece const&)
+		_ZlsRSoRKN3re211StringPieceE;
+	local:
+		*;
+};
--- a/re2/libre2.symbols.darwin
+++ b/re2/libre2.symbols.darwin
@ -0,0 +1,11 @@
+# Linker doesn't like these unmangled:
+# re2::RE2*
+__ZN3re23RE2*
+__ZNK3re23RE2*
+# re2::StringPiece*
+__ZN3re211StringPiece*
+__ZNK3re211StringPiece*
+# operator==(re2::StringPiece const&, re2::StringPiece const&)
+__ZeqRKN3re211StringPieceES2_
+# operator<<(std::ostream&, re2::StringPiece const&)
+__ZlsRSoRKN3re211StringPieceE
--- a/re2/re2/Makefile
+++ b/re2/re2/Makefile
@ -0,0 +1 @@
+
--- a/re2/re2/bitstate.cc
+++ b/re2/re2/bitstate.cc
@ -0,0 +1,378 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+
+// Prog::SearchBitState is a regular expression search with submatch
+// tracking for small regular expressions and texts.  Like
+// testing/backtrack.cc, it allocates a bit vector with (length of
+// text) * (length of prog) bits, to make sure it never explores the
+// same (character position, instruction) state multiple times.  This
+// limits the search to run in time linear in the length of the text.
+//
+// Unlike testing/backtrack.cc, SearchBitState is not recursive
+// on the text.
+//
+// SearchBitState is a fast replacement for the NFA code on small
+// regexps and texts when SearchOnePass cannot be used.
+
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One entry on BitState's explicit backtracking stack.
+struct Job {
+  int id;  // instruction id, as passed to Prog::inst()
+  int arg;  // 0 for a first visit; 1 when resuming an Alt or Capture
+  const char* p;  // text position (for a Capture resume: the saved register value)
+};
+
+// State for a single backtracking search over one Prog.
+// The scratch arrays (visited_, job_, cap_) are allocated inside Search()
+// and released by the destructor, which is why Search() may only be
+// called once per object.
+class BitState {
+ public:
+  explicit BitState(Prog* prog);
+  ~BitState();
+
+  // The usual Search prototype.
+  // Can only call Search once per BitState.
+  bool Search(const StringPiece& text, const StringPiece& context,
+              bool anchored, bool longest,
+              StringPiece* submatch, int nsubmatch);
+
+ private:
+  // Marks (id, p) as visited; returns false if it already was.
+  inline bool ShouldVisit(int id, const char* p);
+  // Pushes a job onto job_, growing the stack when it is full.
+  void Push(int id, const char* p, int arg);
+  // Doubles the capacity of job_.
+  bool GrowStack();
+  // Runs the search from instruction id at text position p.
+  bool TrySearch(int id, const char* p);
+
+  // Search parameters
+  Prog* prog_;              // program being run
+  StringPiece text_;        // text being searched
+  StringPiece context_;     // greater context of text being searched
+  bool anchored_;           // whether search is anchored at text.begin()
+  bool longest_;            // whether search wants leftmost-longest match
+  bool endmatch_;           // whether match must end at text.end()
+  StringPiece *submatch_;   // submatches to fill in
+  int nsubmatch_;           //   # of submatches to fill in
+
+  // Search state
+  const char** cap_;        // capture registers
+  int ncap_;                //   # of capture registers (at least 2)
+
+  static const int VisitedBits = 32;
+  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
+  int nvisited_;            //   # of words in bitmap
+
+  Job *job_;                // stack of text positions to explore
+  int njob_;                //   # of jobs currently on the stack
+  int maxjob_;              //   capacity of job_
+};
+
+// Creates an idle matcher for prog.  Every scratch buffer starts out
+// null/empty; the buffers are allocated later by Search().
+BitState::BitState(Prog* prog)
+  : prog_(prog),
+    anchored_(false),
+    longest_(false),
+    endmatch_(false),
+    submatch_(NULL),
+    nsubmatch_(0),
+    cap_(NULL),
+    ncap_(0),
+    visited_(NULL),
+    nvisited_(0),
+    job_(NULL),
+    njob_(0),
+    maxjob_(0) {
+}
+
+// Frees the scratch arrays owned by this object.  Each pointer is either
+// NULL (Search never ran) or the result of new[], so delete[] is safe.
+BitState::~BitState() {
+  delete[] cap_;
+  delete[] job_;
+  delete[] visited_;
+}
+
+// Should the search visit the pair (id, p)?
+// The first query for a given pair records it in visited_ and returns
+// true; every later query for the same pair returns false, so each
+// (instruction, text position) state is explored at most once.
+bool BitState::ShouldVisit(int id, const char* p) {
+  uint n = id * (text_.size() + 1) + (p - text_.begin());
+  // Build the mask from an unsigned 1: with a signed literal, selecting
+  // bit 31 would shift into the sign bit of int.
+  uint32 mask = 1U << (n & (VisitedBits-1));
+  uint32* word = &visited_[n/VisitedBits];
+  if (*word & mask)
+    return false;
+  *word |= mask;
+  return true;
+}
+
+// Doubles the capacity of the job stack, carrying over the njob_ live
+// entries.  Returns false (after logging) if the stack is still full
+// once it has been reallocated.
+bool BitState::GrowStack() {
+  // VLOG(0) << "Reallocate.";
+  maxjob_ *= 2;
+  Job* bigger = new Job[maxjob_];
+  memmove(bigger, job_, njob_*sizeof(Job));
+  delete[] job_;
+  job_ = bigger;
+  if (njob_ < maxjob_)
+    return true;
+  LOG(DFATAL) << "Job stack overflow.";
+  return false;
+}
+
+// Pushes the job (id, p, arg) onto the stack, growing it first if it is
+// full.  Nothing is pushed when growth fails, when id names a Fail
+// instruction, or when a fresh visit (arg == 0) has already been seen.
+void BitState::Push(int id, const char* p, int arg) {
+  if (njob_ >= maxjob_ && !GrowStack())
+    return;
+
+  if (prog_->inst(id)->opcode() == kInstFail)
+    return;
+
+  // A non-zero arg resumes an earlier visit, so only first visits are
+  // checked against (and recorded in) the visited bitmap.
+  const bool first_visit = (arg == 0);
+  if (first_visit && !ShouldVisit(id, p))
+    return;
+
+  Job& slot = job_[njob_++];
+  slot.id = id;
+  slot.p = p;
+  slot.arg = arg;
+}
+
+// Try a search from instruction id0 with the text pointer at p0, using
+// job_ as an explicit backtracking stack.
+// Returns whether a match was found.  When nsubmatch_ > 0 the best match
+// seen so far is copied into submatch_[]; states already recorded in
+// visited_ are never explored again.
+bool BitState::TrySearch(int id0, const char* p0) {
+  bool matched = false;
+  const char* end = text_.end();
+  njob_ = 0;
+  Push(id0, p0, 0);
+  while (njob_ > 0) {
+    // Pop job off stack.
+    --njob_;
+    int id = job_[njob_].id;
+    const char* p = job_[njob_].p;
+    int arg = job_[njob_].arg;
+
+    // Optimization: rather than push and pop,
+    // code that is going to Push and continue
+    // the loop simply updates id, p, and arg
+    // and jumps to CheckAndLoop.  We have to
+    // do the ShouldVisit check that Push
+    // would have, but we avoid the stack
+    // manipulation.
+    // The if (0) guard keeps a freshly popped job from repeating the
+    // check: only a goto can enter the block.
+    if (0) {
+    CheckAndLoop:
+      if (!ShouldVisit(id, p))
+        continue;
+    }
+
+    // Visit id, p.
+    // VLOG(0) << "Job: " << ip->id() << " "
+    //         << (p - text_.begin()) << " " << arg;
+    Prog::Inst* ip = prog_->inst(id);
+    switch (ip->opcode()) {
+      case kInstFail:
+      default:
+        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
+        return false;
+
+      case kInstAlt:
+        // Cannot just
+        //   Push(ip->out1(), p, 0);
+        //   Push(ip->out(), p, 0);
+        // If, during the processing of ip->out(), we encounter
+        // ip->out1() via another path, we want to process it then.
+        // Pushing it here will inhibit that.  Instead, re-push
+        // ip with arg==1 as a reminder to push ip->out1() later.
+        switch (arg) {
+          case 0:
+            Push(id, p, 1);  // come back when we're done
+            id = ip->out();
+            goto CheckAndLoop;
+
+          case 1:
+            // Finished ip->out(); try ip->out1().
+            arg = 0;
+            id = ip->out1();
+            goto CheckAndLoop;
+        }
+        // Reached only when arg is neither 0 nor 1.
+        LOG(DFATAL) << "Bad arg in kInstAlt: " << arg;
+        continue;
+
+      case kInstAltMatch:
+        // One opcode is byte range; the other leads to match.
+        if (ip->greedy(prog_)) {
+          // out1 is the match
+          Push(ip->out1(), p, 0);
+          id = ip->out1();
+          p = end;
+          goto CheckAndLoop;
+        }
+        // out is the match - non-greedy
+        Push(ip->out(), end, 0);
+        id = ip->out();
+        goto CheckAndLoop;
+
+      case kInstByteRange: {
+        // c stays -1 at end of text, where there is no byte to read.
+        int c = -1;
+        if (p < end)
+          c = *p & 0xFF;
+        if (ip->Matches(c)) {
+          id = ip->out();
+          p++;
+          goto CheckAndLoop;
+        }
+        continue;
+      }
+
+      case kInstCapture:
+        switch (arg) {
+          case 0:
+            if (0 <= ip->cap() && ip->cap() < ncap_) {
+              // Capture p to register, but save old value.
+              Push(id, cap_[ip->cap()], 1);  // come back when we're done
+              cap_[ip->cap()] = p;
+            }
+            // Continue on.
+            id = ip->out();
+            goto CheckAndLoop;
+          case 1:
+            // Finished ip->out(); restore the old value.
+            // (For this job, p carries the saved register contents.)
+            cap_[ip->cap()] = p;
+            continue;
+        }
+        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
+        continue;
+
+      case kInstEmptyWidth:
+        // Give up on this path if any required empty-width flag is
+        // not satisfied at p.
+        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+          continue;
+        id = ip->out();
+        goto CheckAndLoop;
+
+      case kInstNop:
+        id = ip->out();
+        goto CheckAndLoop;
+
+      case kInstMatch: {
+        if (endmatch_ && p != text_.end())
+          continue;
+
+        // VLOG(0) << "Found match.";
+        // We found a match.  If the caller doesn't care
+        // where the match is, no point going further.
+        if (nsubmatch_ == 0)
+          return true;
+
+        // Record best match so far.
+        // Only need to check end point, because this entire
+        // call is only considering one start position.
+        matched = true;
+        cap_[1] = p;
+        if (submatch_[0].data() == NULL ||
+            (longest_ && p > submatch_[0].end())) {
+          for (int i = 0; i < nsubmatch_; i++)
+            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
+        }
+
+        // If going for first match, we're done.
+        if (!longest_)
+          return true;
+
+        // If we used the entire text, no longer match is possible.
+        if (p == text_.end())
+          return true;
+
+        // Otherwise, continue on in hope of a longer match.
+        continue;
+      }
+    }
+  }
+  return matched;
+}
+
+// Searches text (a piece of context) for a match of prog_.
+// Returns whether a match was found; on success the first nsubmatch
+// entries of submatch describe the match and its capture groups.
+// Allocates the scratch buffers, so it may be called only once.
+bool BitState::Search(const StringPiece& text, const StringPiece& context,
+                      bool anchored, bool longest,
+                      StringPiece* submatch, int nsubmatch) {
+  // Remember what is being searched.  A context with no data means
+  // that the text is its own context.
+  text_ = text;
+  context_ = context;
+  if (context_.begin() == NULL)
+    context_ = text;
+
+  // A program anchored at one end cannot match unless text reaches
+  // that end of the context.
+  if (prog_->anchor_start() && context_.begin() != text.begin())
+    return false;
+  if (prog_->anchor_end() && context_.end() != text.end())
+    return false;
+
+  submatch_ = submatch;
+  nsubmatch_ = nsubmatch;
+  endmatch_ = prog_->anchor_end();
+  longest_ = longest || prog_->anchor_end();
+  anchored_ = anchored || prog_->anchor_start();
+  int cleared = 0;
+  while (cleared < nsubmatch_)
+    submatch_[cleared++] = NULL;
+
+  // Scratch space: one visited bit per (instruction, text offset) pair,
+  // rounded up to whole words and zero-filled.
+  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
+  visited_ = new uint32[nvisited_]();
+  // VLOG(0) << "nvisited_ = " << nvisited_;
+
+  // Two registers per submatch, and never fewer than two, because
+  // cap_[0] and cap_[1] are always written.
+  ncap_ = (nsubmatch < 1) ? 2 : 2*nsubmatch;
+  cap_ = new const char*[ncap_]();
+
+  maxjob_ = 256;
+  job_ = new Job[maxjob_];
+
+  // Anchored search must start at text.begin().
+  if (anchored_) {
+    cap_[0] = text.begin();
+    return TrySearch(prog_->start(), text.begin());
+  }
+
+  // Unanchored search: try every start position, including the empty
+  // string at text.end() (hence <= rather than <).  This is not
+  // quadratic, because visited_ is not cleared between attempts and so
+  // no state is ever explored twice.
+  const char* start = text.begin();
+  while (start <= text.end()) {
+    cap_[0] = start;
+    if (TrySearch(prog_->start(), start))  // Match must be leftmost; done.
+      return true;
+    start++;
+  }
+  return false;
+}
+
+// Bit-state search: runs a BitState over text and reports whether it
+// matched, filling in match[0..nmatch-1] on success.
+bool Prog::SearchBitState(const StringPiece& text,
+                          const StringPiece& context,
+                          Anchor anchor,
+                          MatchKind kind,
+                          StringPiece* match,
+                          int nmatch) {
+  // A full match is implemented as an anchored longest match whose end
+  // must coincide with text.end(), so match[0] has to exist; borrow a
+  // local slot when the caller supplied none.
+  StringPiece scratch;
+  const bool full = (kind == kFullMatch);
+  if (full) {
+    anchor = kAnchored;
+    if (nmatch < 1) {
+      match = &scratch;
+      nmatch = 1;
+    }
+  }
+
+  // Run the search.
+  BitState searcher(this);
+  const bool found = searcher.Search(text, context,
+                                     anchor == kAnchored,
+                                     kind != kFirstMatch,
+                                     match, nmatch);
+  if (!found)
+    return false;
+  return !(full && match[0].end() != text.end());
+}
+
+}  // namespace re2
--- a/re2/re2/compile.cc
+++ b/re2/re2/compile.cc
--- a/re2/re2/dfa.cc
+++ b/re2/re2/dfa.cc
--- a/re2/re2/filtered_re2.cc
+++ b/re2/re2/filtered_re2.cc
@ -0,0 +1,100 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string>
+#include "util/util.h"
+#include "re2/filtered_re2.h"
+#include "re2/prefilter.h"
+#include "re2/prefilter_tree.h"
+
+namespace re2 {
+
+// Starts out uncompiled, with no regexps and a newly allocated
+// PrefilterTree that this object owns (the destructor deletes it).
+FilteredRE2::FilteredRE2()
+    : compiled_(false),
+      prefilter_tree_(new PrefilterTree()) {
+}
+
+// Deletes every RE2 stored in re2_vec_, then the prefilter tree.
+FilteredRE2::~FilteredRE2() {
+  for (vector<RE2*>::iterator it = re2_vec_.begin();
+       it != re2_vec_.end(); ++it) {
+    delete *it;
+  }
+  delete prefilter_tree_;
+}
+
+// Compiles pattern with options.  On success the new RE2 is appended to
+// re2_vec_ and its index is stored in *id.  On failure the error is
+// logged, the RE2 is discarded and *id is left untouched.
+// Returns the RE2's error code in both cases.
+RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
+                                const RE2::Options& options, int* id) {
+  RE2* re = new RE2(pattern, options);
+  RE2::ErrorCode code = re->error_code();
+
+  if (!re->ok()) {
+    // Log the pattern text: streaming the RE2 pointer itself would
+    // only print an address.
+    LOG(ERROR) << "Couldn't compile regular expression, skipping: "
+               << string(pattern.data(), pattern.size())
+               << " due to error " << re->error();
+    delete re;
+  } else {
+    *id = re2_vec_.size();
+    re2_vec_.push_back(re);
+  }
+
+  return code;
+}
+
+// Builds the prefilter tree from every regexp added so far and stores
+// the strings the caller has to look for in *atoms.  Only logs and
+// returns (leaving *atoms untouched) when the object was already
+// compiled or holds no regexps.
+void FilteredRE2::Compile(vector<string>* atoms) {
+  const bool nothing_to_do = compiled_ || re2_vec_.size() == 0;
+  if (nothing_to_do) {
+    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
+    return;
+  }
+
+  for (vector<RE2*>::iterator it = re2_vec_.begin();
+       it != re2_vec_.end(); ++it) {
+    prefilter_tree_->Add(Prefilter::FromRE2(*it));
+  }
+
+  atoms->clear();
+  prefilter_tree_->Compile(atoms);
+  compiled_ = true;
+}
+
+// Tries every regexp in insertion order, with no prefiltering, and
+// returns the index of the first one that partially matches text, or -1
+// when none does.
+int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
+  int index = 0;
+  for (vector<RE2*>::const_iterator it = re2_vec_.begin();
+       it != re2_vec_.end(); ++it, ++index) {
+    if (RE2::PartialMatch(text, **it))
+      return index;
+  }
+  return -1;
+}
+
+// Returns the index of the first regexp, among those the prefilter
+// tree lets through for the given matched atoms, that partially matches
+// text.  Returns -1 when none matches or when Compile has not been
+// called (which is also logged).
+int FilteredRE2::FirstMatch(const StringPiece& text,
+                            const vector<int>& atoms) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "FirstMatch called before Compile";
+    return -1;
+  }
+
+  vector<int> candidates;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
+  for (vector<int>::const_iterator it = candidates.begin();
+       it != candidates.end(); ++it) {
+    if (RE2::PartialMatch(text, *re2_vec_[*it]))
+      return *it;
+  }
+  return -1;
+}
+
+// Clears *matching_regexps, then appends the index of every regexp that
+// passes the prefilter for the given matched atoms and partially
+// matches text.  Returns true if at least one index was appended.
+bool FilteredRE2::AllMatches(
+    const StringPiece& text,
+    const vector<int>& atoms,
+    vector<int>* matching_regexps) const {
+  matching_regexps->clear();
+
+  vector<int> candidates;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
+  for (vector<int>::const_iterator it = candidates.begin();
+       it != candidates.end(); ++it) {
+    if (RE2::PartialMatch(text, *re2_vec_[*it]))
+      matching_regexps->push_back(*it);
+  }
+  return !matching_regexps->empty();
+}
+
+// Forwards to PrefilterTree::RegexpsGivenStrings: given the atoms that
+// were matched, fills *passed_regexps via the prefilter tree.
+void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
+                                      vector<int>* passed_regexps) {
+  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
+}
+
+
+// Forwards to PrefilterTree::PrintPrefilter for the regexp with the
+// given id.
+void FilteredRE2::PrintPrefilter(int regexpid) {
+  prefilter_tree_->PrintPrefilter(regexpid);
+}
+
+}  // namespace re2
--- a/re2/re2/filtered_re2.h
+++ b/re2/re2/filtered_re2.h
@ -0,0 +1,101 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
+// It provides a prefilter mechanism that helps in cutting down the
+// number of regexps that need to be actually searched.
+//
+// By design, it does not include a string matching engine. This is to
+// allow the user of the class to use their favorite string match
+// engine. The overall flow is: Add all the regexps using Add, then
+// Compile the FilteredRE2. The compile returns strings that need to
+// be matched. Note that all returned strings are lowercase. For
+// applying regexps to a search text, the caller does the string
+// matching using the strings returned. When doing the string match,
+// note that the caller has to do that on lower cased version of the
+// search text. Then call FirstMatch or AllMatches with a vector of
+// indices of strings that were found in the text to get the actual
+// regexp matches.
+
+#ifndef RE2_FILTERED_RE2_H_
+#define RE2_FILTERED_RE2_H_
+
+#include <vector>
+#include "re2/re2.h"
+
+namespace re2 {
+using std::vector;
+
+class PrefilterTree;
+
+// Holds a set of RE2 objects together with a PrefilterTree built from
+// them.  The object owns both: its destructor deletes every RE2 and
+// the tree.
+class FilteredRE2 {
+ public:
+  FilteredRE2();
+  ~FilteredRE2();
+
+  // Uses RE2 constructor to create a RE2 object (re). Returns
+  // re->error_code(). If error_code is other than NoError, then re is
+  // deleted and not added to re2_vec_ (and *id is not written).
+  // On success *id receives the index of the new regexp.
+  RE2::ErrorCode Add(const StringPiece& pattern,
+                     const RE2::Options& options,
+                     int *id);
+
+  // Prepares the regexps added by Add for filtering.  Returns a set
+  // of strings that the caller should check for in candidate texts.
+  // The returned strings are lowercased. When doing string matching,
+  // the search text should be lowercased first to find matching
+  // strings from the set of strings returned by Compile.  Call after
+  // all Add calls are done.
+  void Compile(vector<string>* strings_to_match);
+
+  // Returns the index of the first matching regexp.
+  // Returns -1 on no match. Can be called prior to Compile.
+  // Does not do any filtering: simply tries to Match the
+  // regexps in a loop.
+  int SlowFirstMatch(const StringPiece& text) const;
+
+  // Returns the index of the first matching regexp.
+  // Returns -1 on no match. Compile has to be called before
+  // calling this.
+  int FirstMatch(const StringPiece& text,
+                 const vector<int>& atoms) const;
+
+  // Returns the indices of all matching regexps, after first clearing
+  // matching_regexps.  The return value is true if at least one
+  // regexp matched.
+  bool AllMatches(const StringPiece& text,
+                  const vector<int>& atoms,
+                  vector<int>* matching_regexps) const;
+
+  // The number of regexps added.
+  int NumRegexps() const { return re2_vec_.size(); }
+
+ private:
+
+  // Get the individual RE2 objects. Useful for testing.
+  // regexpid is used as an index into re2_vec_ without a bounds check.
+  RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
+
+  // Print prefilter.
+  void PrintPrefilter(int regexpid);
+
+  // Useful for testing and debugging.
+  void RegexpsGivenStrings(const vector<int>& matched_atoms,
+                           vector<int>* passed_regexps);
+
+  // All the regexps in the FilteredRE2.
+  vector<RE2*> re2_vec_;
+
+  // Has the FilteredRE2 been compiled using Compile()
+  bool compiled_;
+
+  // An AND-OR tree of string atoms used for filtering regexps.
+  PrefilterTree* prefilter_tree_;
+
+  // Copy constructor and assignment are declared private so that
+  // outside code cannot copy a FilteredRE2.
+  //DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
+  FilteredRE2(const FilteredRE2&);
+  void operator=(const FilteredRE2&);
+};
+
+}  // namespace re2
+
+#endif  // RE2_FILTERED_RE2_H_
--- a/re2/re2/make_perl_groups.pl
+++ b/re2/re2/make_perl_groups.pl
@ -0,0 +1,110 @@
+#!/usr/bin/perl
+# Copyright 2008 The RE2 Authors.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Generate table entries giving character ranges
+# for POSIX/Perl character classes.  Rather than
+# figure out what the definition is, it is easier to ask
+# Perl about each letter from 0-128 and write down
+# its answer.
+
+@posixclasses = (
+	"[:alnum:]",
+	"[:alpha:]",
+	"[:ascii:]",
+	"[:blank:]",
+	"[:cntrl:]",
+	"[:digit:]",
+	"[:graph:]",
+	"[:lower:]",
+	"[:print:]",
+	"[:punct:]",
+	"[:space:]",
+	"[:upper:]",
+	"[:word:]",
+	"[:xdigit:]",
+);
+
+@perlclasses = (
+	"\\d",
+	"\\s",
+	"\\w",
+);
+
+# Returns a list of [first, last] array refs: the maximal runs of
+# consecutive code points in 0..128 whose character matches the
+# bracket expression "[$class]".
+sub ComputeClass($) {
+  my @ranges;
+  my ($class) = @_;
+  my $regexp = "[$class]";
+  my $start = -1;   # first code point of the run in progress, or -1
+  for (my $i=0; $i<=129; $i++) {
+    # After 128 has been tested, jump the index to 256.  That value
+    # fails the "<= 128" test below, so the last pass always takes the
+    # else branch and flushes a run that is still open (ending at 255).
+    if ($i == 129) { $i = 256; }
+    if ($i <= 128 && chr($i) =~ $regexp) {
+      if ($start < 0) {
+        $start = $i;
+      }
+    } else {
+      if ($start >= 0) {
+        push @ranges, [$start, $i-1];
+      }
+      $start = -1;
+    }
+  }
+  return @ranges;
+}
+
+# Prints the C++ URange16 table "code$cname" for the given ranges and
+# returns two UGroup initializer strings that refer to it: one for the
+# class itself (sign +1) and one for its negation (sign -1).
+#   $cname  - suffix that makes the generated array name unique
+#   $name   - class name as written in a regexp, e.g. "\\d" or "[:alpha:]"
+#   @ranges - list of [first, last] array refs
+sub PrintClass($$@) {
+  my ($cname, $name, @ranges) = @_;
+  print "static URange16 code${cname}[] = {  /* $name */\n";
+  foreach my $range (@ranges) {
+    printf "\t{ 0x%x, 0x%x },\n", $range->[0], $range->[1];
+  }
+  print "};\n";
+  my $n = @ranges;
+  # Double the backslashes so the name survives inside a C++ string literal.
+  my $escname = $name;
+  $escname =~ s/\\/\\\\/g;
+  # Declared with "my" so the name does not leak into package scope.
+  my $negname = $escname;
+  if ($negname =~ /:/) {
+    # POSIX class: [:alpha:] is negated as [:^alpha:] (first colon only).
+    $negname =~ s/:/:^/;
+  } else {
+    # Perl class: \d is negated as \D.
+    $negname =~ y/a-z/A-Z/;
+  }
+  return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
+}
+
+my $gen = 0;
+
+# Emits the range table for every class in @classes, followed by the
+# UGroup array "${cname}_groups" listing each class and its negation,
+# and the matching "num_${cname}_groups" element count.
+sub PrintClasses($@) {
+  my ($cname, @classes) = @_;
+  my @entries;
+  for my $class (@classes) {
+    # $gen is a file-wide counter, so table names stay unique across calls.
+    push @entries, PrintClass(++$gen, $class, ComputeClass($class));
+  }
+  print "UGroup ${cname}_groups[] = {\n";
+  print "\t$_,\n" for @entries;
+  print "};\n";
+  my $count = scalar(@entries);
+  print "int num_${cname}_groups = $count;\n";
+}
+
+print <<EOF;
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+EOF
+
+PrintClasses("perl", @perlclasses);
+PrintClasses("posix", @posixclasses);
+
+print <<EOF;
+
+}  // namespace re2
+EOF
--- a/re2/re2/make_unicode_casefold.py
+++ b/re2/re2/make_unicode_casefold.py
@ -0,0 +1,146 @@
+#!/usr/bin/python
+# coding=utf-8
+#
+# Copyright 2008 The RE2 Authors.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# See unicode_casefold.h for description of case folding tables.
+
+"""Generate C++ table for Unicode case folding."""
+
+import unicode, sys
+
+# Text printed by main() before the generated tables: the "generated file"
+# banner, the include, and the opening of namespace re2.
+_header = """
+// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
+// make_unicode_casefold.py >unicode_casefold.cc
+
+#include "re2/unicode_casefold.h"
+
+namespace re2 {
+
+"""
+
+# Text printed by main() after the generated tables: closes namespace re2.
+_trailer = """
+
+} // namespace re2
+
+"""
+
+def _Delta(a, b):
+  """Compute the delta for b - a.  Even/odd and odd/even
+     are handled specially, as described above."""
+  if a+1 == b:
+    if a%2 == 0:
+      return 'EvenOdd'
+    else:
+      return 'OddEven'
+  if a == b+1:
+    if a%2 == 0:
+      return 'OddEven'
+    else:
+      return 'EvenOdd'
+  return b - a
+
+def _AddDelta(a, delta):
+  """Return a + delta, handling EvenOdd and OddEven specially."""
+  if type(delta) == int:
+    return a+delta
+  if delta == 'EvenOdd':
+    if a%2 == 0:
+      return a+1
+    else:
+      return a-1
+  if delta == 'OddEven':
+    if a%2 == 1:
+      return a+1
+    else:
+      return a-1
+  print >>sys.stderr, "Bad Delta: ", delta
+  raise "Bad Delta"
+
+def _MakeRanges(pairs):
+  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
+     into [(65, 90, +32)]."""
+  ranges = []
+  last = -100
+
+  def evenodd(last, a, b, r):
+    if a != last+1 or b != _AddDelta(a, r[2]):
+      return False
+    r[1] = a
+    return True
+
+  def evenoddpair(last, a, b, r):
+    if a != last+2:
+      return False
+    delta = r[2]
+    d = delta
+    if type(delta) is not str:
+      return False
+    if delta.endswith('Skip'):
+      d = delta[:-4]
+    else:
+      delta = d + 'Skip'
+    if b != _AddDelta(a, d):
+      return False
+    r[1] = a
+    r[2] = delta
+    return True
+
+  for a, b in pairs:
+    if ranges and evenodd(last, a, b, ranges[-1]):
+      pass
+    elif ranges and evenoddpair(last, a, b, ranges[-1]):
+      pass
+    else:
+      ranges.append([a, a, _Delta(a, b)])
+    last = a
+  return ranges
+
+# The maximum size of a case-folding group.
+# Case folding is implemented in parse.cc by a recursive process
+# with a recursion depth equal to the size of the largest
+# case-folding group, so it is important that this bound be small.
+# The current tables have no group bigger than 4.
+# If there are ever groups bigger than 10 or so, it will be
+# time to rework the code in parse.cc.
+MaxCasefoldGroup = 4
+
+def main():
+  lowergroups, casegroups = unicode.CaseGroups()
+  foldpairs = []
+  seen = {}
+  for c in casegroups:
+    if len(c) > MaxCasefoldGroup:
+      raise unicode.Error("casefold group too long: %s" % (c,))
+    for i in range(len(c)):
+      if c[i-1] in seen:
+        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
+      seen[c[i-1]] = True
+      foldpairs.append([c[i-1], c[i]])
+
+  lowerpairs = []
+  for lower, group in lowergroups.iteritems():
+    for g in group:
+      if g != lower:
+        lowerpairs.append([g, lower])
+
+  def printpairs(name, foldpairs):
+    foldpairs.sort()
+    foldranges = _MakeRanges(foldpairs)
+    print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
+    print "CaseFold unicode_%s[] = {" % (name,)
+    for lo, hi, delta in foldranges:
+      print "\t{ %d, %d, %s }," % (lo, hi, delta)
+    print "};"
+    print "int num_unicode_%s = %d;" % (name, len(foldranges),)
+    print ""
+
+  print _header
+  printpairs("casefold", foldpairs)
+  printpairs("tolower", lowerpairs)
+  print _trailer
+
+if __name__ == '__main__':
+  main()
--- a/re2/re2/make_unicode_groups.py
+++ b/re2/re2/make_unicode_groups.py
@ -0,0 +1,111 @@
+#!/usr/bin/python
+# Copyright 2008 The RE2 Authors.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Generate C++ tables for Unicode Script and Category groups."""
+
+import sys
+import unicode
+
+# Text printed by main() before the generated tables: the "generated file"
+# banner, the include, and the opening of namespace re2.
+_header = """
+// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
+// make_unicode_groups.py >unicode_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+"""
+
+# Text printed by main() after the generated tables: closes namespace re2.
+_trailer = """
+
+}  // namespace re2
+
+"""
+
+# Running totals of 16-bit and 32-bit ranges emitted so far; updated by
+# PrintGroup and reported by main in a summary comment.
+n16 = 0
+n32 = 0
+
+def MakeRanges(codes):
+  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
+  ranges = []
+  last = -100
+  for c in codes:
+    if c == last+1:
+      ranges[-1][1] = c
+    else:
+      ranges.append([c, c])
+    last = c
+  return ranges
+
+def PrintRanges(type, name, ranges):
+  """Print the ranges as an array of type named name."""
+  print "static %s %s[] = {" % (type, name,)
+  for lo, hi in ranges:
+    print "\t{ %d, %d }," % (lo, hi)
+  print "};"
+
+# def PrintCodes(type, name, codes):
+#   """Print the codes as an array of type named name."""
+#   print "static %s %s[] = {" % (type, name,)
+#   for c in codes:
+#     print "\t%d," % (c,)
+#   print "};"
+
+def PrintGroup(name, codes):
+  """Print the data structures for the group of codes.
+  Return a UGroup literal for the group."""
+
+  # See unicode_groups.h for a description of the data structure.
+
+  # Split codes into 16-bit ranges and 32-bit ranges.
+  range16 = MakeRanges([c for c in codes if c < 65536])
+  range32 = MakeRanges([c for c in codes if c >= 65536])
+
+  # Pull singleton ranges out of range16.
+  # code16 = [lo for lo, hi in range16 if lo == hi]
+  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]
+
+  global n16
+  global n32
+  n16 += len(range16)
+  n32 += len(range32)
+
+  ugroup = "{ \"%s\", +1" % (name,)
+  # if len(code16) > 0:
+  #   PrintCodes("uint16", name+"_code16", code16)
+  #   ugroup += ", %s_code16, %d" % (name, len(code16))
+  # else:
+  #   ugroup += ", 0, 0"
+  if len(range16) > 0:
+    PrintRanges("URange16", name+"_range16", range16)
+    ugroup += ", %s_range16, %d" % (name, len(range16))
+  else:
+    ugroup += ", 0, 0"
+  if len(range32) > 0:
+    PrintRanges("URange32", name+"_range32", range32)
+    ugroup += ", %s_range32, %d" % (name, len(range32))
+  else:
+    ugroup += ", 0, 0"
+  ugroup += " }"
+  return ugroup
+
+def main():
+  print _header
+  ugroups = []
+  for name, codes in unicode.Categories().iteritems():
+    ugroups.append(PrintGroup(name, codes))
+  for name, codes in unicode.Scripts().iteritems():
+    ugroups.append(PrintGroup(name, codes))
+  print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
+  print "UGroup unicode_groups[] = {";
+  ugroups.sort()
+  for ug in ugroups:
+    print "\t%s," % (ug,)
+  print "};"
+  print "int num_unicode_groups = %d;" % (len(ugroups),)
+  print _trailer
+
+if __name__ == '__main__':
+  main()
--- a/re2/re2/mimics_pcre.cc
+++ b/re2/re2/mimics_pcre.cc
@ -0,0 +1,185 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Determine whether this library should match PCRE exactly
+// for a particular Regexp.  (If so, the testing framework can
+// check that it does.)
+//
+// This library matches PCRE except in these cases:
+//   * the regexp contains a repetition of an empty string,
+//     like (a*)* or (a*)+.  In this case, PCRE will treat
+//     the repetition sequence as ending with an empty string,
+//     while this library does not.
+//   * Perl and PCRE differ on whether \v matches \n.
+//     For historical reasons, this library implements the Perl behavior.
+//   * Perl and PCRE allow $ in one-line mode to match either the very
+//     end of the text or just before a \n at the end of the text.
+//     This library requires it to match only the end of the text.
+//   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
+//     match the end of the text if the last character is a \n.
+//     This library does allow it.
+//
+// Regexp::MimicsPCRE checks for any of these conditions.
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Returns whether re might match an empty string.
+static bool CanBeEmptyString(Regexp *re);
+
+// Walker class to compute whether library handles a regexp
+// exactly as PCRE would.  See comment at top for conditions.
+//
+// The walk value is a bool: true means "mimics PCRE so far".
+
+class PCREWalker : public Regexp::Walker<bool> {
+ public:
+  PCREWalker() {}
+
+  // Combines the verdicts of re's children (child_args) with a check of
+  // re itself; defined below.
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
+                 int nchild_args);
+
+  // Hands back the incoming argument unchanged, after logging.
+  bool ShortVisit(Regexp* re, bool a) {
+    // Should never be called: we use Walk not WalkExponential.
+    LOG(DFATAL) << "PCREWalker::ShortVisit called";
+    return a;
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(PCREWalker);
+};
+
+// Runs once re's children have been visited; child_args[i] records
+// whether this library mimics PCRE for the i'th subexpression.
+// Returns whether it mimics PCRE for re as a whole.
+bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                           bool* child_args, int nchild_args) {
+  // One non-conforming subexpression taints the whole regexp.
+  for (int i = 0; i < nchild_args; i++) {
+    if (!child_args[i])
+      return false;
+  }
+
+  // The children are fine; now examine this node on its own.
+  bool mimics = true;
+  switch (re->op()) {
+    // Repetition of something that can match the empty string.
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      mimics = !CanBeEmptyString(re->sub()[0]);
+      break;
+    case kRegexpRepeat:
+      // Only unbounded repeats (max == -1) are a problem.
+      mimics = !(re->max() == -1 && CanBeEmptyString(re->sub()[0]));
+      break;
+
+    // \v
+    case kRegexpLiteral:
+      mimics = re->rune() != '\v';
+      break;
+
+    // $ in single-line mode.
+    case kRegexpEndText:
+    case kRegexpEmptyMatch:
+      mimics = (re->parse_flags() & Regexp::WasDollar) == 0;
+      break;
+
+    // ^ in multi-line mode.  Unconditional, because in single-line
+    // mode ^ becomes kRegexpBeginText instead.
+    case kRegexpBeginLine:
+      mimics = false;
+      break;
+
+    default:
+      break;
+  }
+  return mimics;
+}
+
+// Reports whether this regexp behaves exactly as it would under PCRE,
+// by running a PCREWalker over it with an initial verdict of true.
+bool Regexp::MimicsPCRE() {
+  PCREWalker walker;
+  const bool mimics = walker.Walk(this, true);
+  return mimics;
+}
+
+
+// Walker that decides whether a Regexp can match the empty string.
+// Overestimating is acceptable.  For instance \b\B can never match
+// (the two assertions are mutually exclusive), yet this walker is not
+// clever enough to notice and reports that it can.  A spurious "yes"
+// only shrinks the set of regexps we sanity check against PCRE;
+// it does not break anything.
+
+class EmptyStringWalker : public Regexp::Walker<bool> {
+ public:
+  EmptyStringWalker() {}
+
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                 bool* child_args, int nchild_args);
+
+  bool ShortVisit(Regexp* re, bool fallback) {
+    // Not expected to run: callers use Walk, not WalkExponential.
+    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+    return fallback;
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
+};
+
+// Runs once re's children have been visited.  child_args[i] is the
+// PostVisit result for the i'th child, i.e. whether that child can match
+// an empty string.  Returns the same answer for re itself.
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                                  bool* child_args, int nchild_args) {
+  switch (re->op()) {
+    // These never match the empty string.
+    case kRegexpNoMatch:
+    case kRegexpLiteral:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpCharClass:
+    case kRegexpLiteralString:
+      return false;
+
+    // These match the empty string: unconditionally, or whenever they
+    // match at all (the zero-width assertions), or as one of their
+    // options (star and quest).
+    case kRegexpEmptyMatch:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpNoWordBoundary:
+    case kRegexpWordBoundary:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+    case kRegexpStar:
+    case kRegexpQuest:
+    case kRegexpHaveMatch:
+      return true;
+
+    // A concatenation is empty only when every piece is.
+    case kRegexpConcat: {
+      bool all_empty = true;
+      for (int i = 0; all_empty && i < nchild_args; i++)
+        all_empty = child_args[i];
+      return all_empty;
+    }
+
+    // An alternation is empty when at least one branch is.
+    case kRegexpAlternate: {
+      bool any_empty = false;
+      for (int i = 0; !any_empty && i < nchild_args; i++)
+        any_empty = child_args[i];
+      return any_empty;
+    }
+
+    // Plus and capture simply inherit from their single child.
+    case kRegexpPlus:
+    case kRegexpCapture:
+      return child_args[0];
+
+    // A counted repeat is empty if its child is, or if zero copies
+    // are allowed.
+    case kRegexpRepeat:
+      if (child_args[0])
+        return true;
+      return re->min() == 0;
+  }
+  return false;
+}
+
+// Reports whether re might match an empty string, by running an
+// EmptyStringWalker over it.
+static bool CanBeEmptyString(Regexp* re) {
+  EmptyStringWalker walker;
+  const bool can_be_empty = walker.Walk(re, true);
+  return can_be_empty;
+}
+
+}  // namespace re2
--- a/re2/re2/nfa.cc
+++ b/re2/re2/nfa.cc
@ -0,0 +1,709 @@
+// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc.
+//
+// Prog::SearchNFA, an NFA search.
+// This is an actual NFA like the theorists talk about,
+// not the pseudo-NFA found in backtracking regexp implementations.
+//
+// IMPLEMENTATION
+//
+// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
+// which is a variant of the one described in Thompson's 1968 CACM paper.
+// See http://swtch.com/~rsc/regexp/ for various history.  The main feature
+// over the DFA implementation is that it tracks submatch boundaries.
+//
+// When the choice of submatch boundaries is ambiguous, this particular
+// implementation makes the same choices that traditional backtracking
+// implementations (in particular, Perl and PCRE) do.
+// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
+// time in the length of the input.
+//
+// Like Thompson's original machine and like the DFA implementation, this
+// implementation notices a match only once it is one byte past it.
+
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "util/sparse_array.h"
+#include "util/sparse_set.h"
+
+namespace re2 {
+
+// Holds the state for one NFA search over a compiled Prog.
+class NFA {
+ public:
+  NFA(Prog* prog);
+  ~NFA();
+
+  // Searches for a matching string.
+  //   * If anchored is true, only considers matches starting at offset.
+  //     Otherwise finds leftmost match at or after offset.
+  //   * If longest is true, returns the longest match starting
+  //     at the chosen start point.  Otherwise returns the so-called
+  //     left-biased match, the one traditional backtracking engines
+  //     (like Perl and PCRE) find.
+  // Records submatch boundaries in submatch[1..nsubmatch-1].
+  // Submatch[0] is the entire match.  When there is a choice in
+  // which text matches each subexpression, the submatch boundaries
+  // are chosen to match what a backtracking implementation would choose.
+  bool Search(const StringPiece& text, const StringPiece& context,
+              bool anchored, bool longest,
+              StringPiece* submatch, int nsubmatch);
+
+  // Compile-time switch for the fprintf(stderr, ...) tracing in this file.
+  static const int Debug = 0;
+
+ private:
+  // One thread of execution: an instruction id plus the capture
+  // positions recorded on the way there.
+  struct Thread {
+    union {
+      int id;
+      Thread* next;  // when on free list
+    };
+    const char** capture;
+  };
+
+  // State for explicit stack in AddToThreadq.
+  struct AddState {
+    int id;           // Inst to process
+    int j;
+    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
+
+    AddState()
+      : id(0), j(-1), cap_j(NULL) {}
+    explicit AddState(int id)
+      : id(id), j(-1), cap_j(NULL) {}
+    AddState(int id, const char* cap_j, int j)
+      : id(id), j(j), cap_j(cap_j) {}
+  };
+
+  // Threadq is a list of threads.  The list is sorted by the order
+  // in which Perl would explore that particular state -- the earlier
+  // choices appear earlier in the list.
+  typedef SparseArray<Thread*> Threadq;
+
+  // Thread allocation goes through a free list (free_threads_).
+  inline Thread* AllocThread();
+  inline void FreeThread(Thread*);
+
+  // Add instruction id (or the instructions reachable from it by
+  // following unlabeled arrows) to the workqueue q with associated
+  // capture info.
+  void AddToThreadq(Threadq* q, int id, int flag,
+                    const char* p, const char** capture);
+
+  // Run runq on byte c, appending new states to nextq.
+  // Updates matched_ and match_ as new, better matches are found.
+  // p is the position of the byte c in the input string (Search passes
+  // its own position minus one), used when processing capturing parens.
+  // flag is the bitwise or of Bol, Eol, etc., specifying whether
+  // ^, $ and \b match the current input point (after c).
+  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
+
+  // Returns text version of capture information, for debugging.
+  string FormatCapture(const char** capture);
+
+  // Copies ncapture_ capture pointers from src to dst.
+  inline void CopyCapture(const char** dst, const char** src);
+
+  // Computes whether all matches must begin with the same first
+  // byte, and if so, returns that byte.  If not, returns -1.
+  int ComputeFirstByte();
+
+  Prog* prog_;          // underlying program
+  int start_;           // start instruction in program
+  int ncapture_;        // number of submatches to track
+  bool longest_;        // whether searching for longest match
+  bool endmatch_;       // whether match must end at text.end()
+  const char* btext_;   // beginning of text being matched (for FormatCapture)
+  const char* etext_;   // end of text being matched (for endmatch_)
+  Threadq q0_, q1_;     // pre-allocated for Search.
+  const char** match_;  // best match so far
+  bool matched_;        // any match so far?
+  AddState* astack_;    // pre-allocated for AddToThreadq
+  int nastack_;         // number of entries in astack_
+  int first_byte_;      // required first byte for match, or -1 if none
+
+  Thread* free_threads_;  // free list
+
+  DISALLOW_EVIL_CONSTRUCTORS(NFA);
+};
+
+// Prepares an NFA for prog: sizes both thread queues and the AddToThreadq
+// stack from the program size, and precomputes the required first byte.
+// Per-search state is filled in later by Search.
+NFA::NFA(Prog* prog)
+    : prog_(prog),
+      start_(prog->start()),
+      ncapture_(0),
+      longest_(false),
+      endmatch_(false),
+      btext_(NULL),
+      etext_(NULL),
+      match_(NULL),
+      matched_(false),
+      astack_(NULL),
+      nastack_(2*prog->size()),
+      first_byte_(-1),
+      free_threads_(NULL) {
+  q0_.resize(prog_->size());
+  q1_.resize(prog_->size());
+  astack_ = new AddState[nastack_];
+  // Needs prog_ and start_, which are set by now.
+  first_byte_ = ComputeFirstByte();
+}
+
+// Releases the match buffer, the AddToThreadq stack and every Thread
+// (with its capture array) that is sitting on the free list.
+NFA::~NFA() {
+  delete[] match_;
+  delete[] astack_;
+  while (free_threads_ != NULL) {
+    Thread* t = free_threads_;
+    free_threads_ = t->next;
+    delete[] t->capture;
+    delete t;
+  }
+}
+
+// Puts t at the head of the free list for reuse.  A NULL t is ignored.
+void NFA::FreeThread(Thread *t) {
+  if (t != NULL) {
+    t->next = free_threads_;
+    free_threads_ = t;
+  }
+}
+
+// Returns a Thread to use: the head of the free list when there is one,
+// otherwise a newly allocated Thread with room for ncapture_ captures.
+NFA::Thread* NFA::AllocThread() {
+  if (free_threads_ != NULL) {
+    Thread* recycled = free_threads_;
+    free_threads_ = recycled->next;
+    return recycled;
+  }
+  Thread* fresh = new Thread;
+  fresh->capture = new const char*[ncapture_];
+  return fresh;
+}
+
+// Copies the capture pointers from src to dst, two at a time
+// (one begin/end pair per step) until ncapture_ slots are covered.
+void NFA::CopyCapture(const char** dst, const char** src) {
+  const char** const stop = src + ncapture_;
+  while (src < stop) {
+    *dst++ = *src++;
+    *dst++ = *src++;
+  }
+}
+
+// Follows all empty arrows from instruction id0 and enqueues all the
+// states reached on q.
+// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+// The pointer p is the current input position, and capture is the
+// current set of match boundaries; it is modified while exploring and
+// each modified entry is restored once its branch has been explored.
+void NFA::AddToThreadq(Threadq* q, int id0, int flag,
+                       const char* p, const char** capture) {
+  // Instruction 0 is treated as "nothing to add".
+  if (id0 == 0)
+    return;
+
+  // Astack_ is pre-allocated to avoid resize operations.
+  // It has room for 2*prog_->size() entries, which is enough:
+  // Each inst in prog can be processed at most once,
+  // pushing at most two entries on stk.
+
+  int nstk = 0;
+  AddState* stk = astack_;
+  stk[nstk++] = AddState(id0);
+
+  while (nstk > 0) {
+    DCHECK_LE(nstk, nastack_);
+    const AddState& a = stk[--nstk];
+    // Entries with j >= 0 carry a capture value to (re)install first.
+    if (a.j >= 0)
+      capture[a.j] = a.cap_j;
+
+    // id 0 marks a restore-only entry: nothing further to process.
+    int id = a.id;
+    if (id == 0)
+      continue;
+    // Already queued: this instruction was reached earlier in this pass.
+    if (q->has_index(id)) {
+      if (Debug)
+        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
+      continue;
+    }
+
+    // Create entry in q no matter what.  We might fill it in below,
+    // or we might not.  Even if not, it is necessary to have it,
+    // so that we don't revisit id later in this traversal.
+    q->set_new(id, NULL);
+
+    Thread** tp = &q->find(id)->second;
+    int j;
+    Thread* t;
+    Prog::Inst* ip = prog_->inst(id);
+    switch (ip->opcode()) {
+    default:
+      LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
+      break;
+
+    case kInstFail:
+      break;
+
+    case kInstAltMatch:
+      // Save state; will pick up at next byte.
+      t = AllocThread();
+      t->id = id;
+      CopyCapture(t->capture, capture);
+      *tp = t;
+      // fall through
+
+    case kInstAlt:
+      // Explore alternatives.
+      // The stack is last-in first-out, so out() is explored before out1().
+      stk[nstk++] = AddState(ip->out1());
+      stk[nstk++] = AddState(ip->out());
+      break;
+
+    case kInstNop:
+      // Continue on.
+      stk[nstk++] = AddState(ip->out());
+      break;
+
+    case kInstCapture:
+      // Capture indices at or beyond ncapture_ are not being tracked.
+      if ((j=ip->cap()) < ncapture_) {
+        // Push a dummy whose only job is to restore capture[j]
+        // once we finish exploring this possibility.
+        stk[nstk++] = AddState(0, capture[j], j);
+
+        // Record capture.
+        capture[j] = p;
+      }
+      stk[nstk++] = AddState(ip->out());
+      break;
+
+    case kInstMatch:
+    case kInstByteRange:
+      // Save state; will pick up at next byte.
+      t = AllocThread();
+      t->id = id;
+      CopyCapture(t->capture, capture);
+      *tp = t;
+      if (Debug)
+        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
+      break;
+
+    case kInstEmptyWidth:
+      // Continue on if we have all the right flag bits.
+      if (ip->empty() & ~flag)
+        break;
+      stk[nstk++] = AddState(ip->out());
+      break;
+    }
+  }
+}
+
+// Run runq on byte c, appending new states to nextq.
+// Updates match_ and matched_ as new, better matches are found.
+// p is position of the byte c in the input string,
+// used when processing capturing parens.
+// flag is the bitwise or of Bol, Eol, etc., specifying whether
+// ^, $ and \b match the current input point (after c).
+// Frees all the threads on runq.
+// If there is a shortcut to the end, returns the instruction id at which
+// that shortcut starts; otherwise returns 0.
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
+  nextq->clear();
+
+  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+    Thread* t = i->second;
+    // Queue entries without a thread are placeholders; skip them.
+    if (t == NULL)
+      continue;
+
+    if (longest_) {
+      // Can skip any threads started after our current best match.
+      if (matched_ && match_[0] < t->capture[0]) {
+        FreeThread(t);
+        continue;
+      }
+    }
+
+    int id = t->id;
+    Prog::Inst* ip = prog_->inst(id);
+
+    switch (ip->opcode()) {
+      default:
+        // Should only see the values handled below.
+        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
+        break;
+
+      case kInstByteRange:
+        // The thread consumed c; it continues at the position after c.
+        if (ip->Matches(c))
+          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
+        break;
+
+      case kInstAltMatch:
+        // Only the first thread on the queue may take the shortcut.
+        if (i != runq->begin())
+          break;
+        // The match is ours if we want it.
+        if (ip->greedy(prog_) || longest_) {
+          CopyCapture((const char**)match_, t->capture);
+          FreeThread(t);
+          for (++i; i != runq->end(); ++i)
+            FreeThread(i->second);
+          runq->clear();
+          matched_ = true;
+          if (ip->greedy(prog_))
+            return ip->out1();
+          return ip->out();
+        }
+        break;
+
+      case kInstMatch: {
+        if (endmatch_ && p != etext_)
+          break;
+
+        const char* old = t->capture[1];  // previous end pointer
+        t->capture[1] = p;
+        if (longest_) {
+          // Leftmost-longest mode: save this match only if
+          // it is either farther to the left or at the same
+          // point but longer than an existing match.
+          if (!matched_ || t->capture[0] < match_[0] ||
+              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
+            CopyCapture((const char**)match_, t->capture);
+        } else {
+          // Leftmost-biased mode: this match is by definition
+          // better than what we've already found (see next line).
+          CopyCapture((const char**)match_, t->capture);
+
+          // Cut off the threads that can only find matches
+          // worse than the one we just found: don't run the
+          // rest of the current Threadq.
+          // Put the saved end pointer back in slot 1, where it came from.
+          t->capture[1] = old;
+          FreeThread(t);
+          for (++i; i != runq->end(); ++i)
+            FreeThread(i->second);
+          runq->clear();
+          matched_ = true;
+          return 0;
+        }
+        // Put the saved end pointer back in slot 1, where it came from.
+        t->capture[1] = old;
+        matched_ = true;
+        break;
+      }
+    }
+    FreeThread(t);
+  }
+  runq->clear();
+  return 0;
+}
+
+// Renders capture as a run of "(begin,end)" pairs for debugging output.
+// Positions are offsets from btext_; an unset position prints as "?",
+// and the end is not examined when the begin is unset.
+string NFA::FormatCapture(const char** capture) {
+  string out;
+
+  for (int i = 0; i < ncapture_; i+=2) {
+    const char* begin = capture[i];
+    if (begin == NULL) {
+      StringAppendF(&out, "(?,?)");
+      continue;
+    }
+    const char* end = capture[i+1];
+    if (end == NULL) {
+      StringAppendF(&out, "(%d,?)", (int)(begin - btext_));
+      continue;
+    }
+    StringAppendF(&out, "(%d,%d)",
+                  (int)(begin - btext_),
+                  (int)(end - btext_));
+  }
+  return out;
+}
+
+// Reports whether the bytes referenced by needle lie entirely inside the
+// bytes referenced by haystack.
+static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
+  if (needle.begin() < haystack.begin())
+    return false;
+  return needle.end() <= haystack.end();
+}
+
+// Runs the NFA over text, which must lie inside const_context (a context
+// with a NULL begin() means "same as text").  See the declaration in the
+// class for the meaning of anchored, longest, submatch and nsubmatch.
+// Returns true and fills submatch[0..nsubmatch-1] when a match is found.
+//
+// Changes from the original:
+//   * The end-of-text shortcut only records a capture position when its
+//     index is below ncapture_; before, a regexp with more capture groups
+//     than requested submatches could write past the end of match_.
+//   * The previous match_ array is released before a new one is
+//     allocated, so calling Search twice on one NFA does not leak.
+//   * match_ is cleared once instead of twice.
+bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
+            bool anchored, bool longest,
+            StringPiece* submatch, int nsubmatch) {
+  // Start instruction 0 means there is nothing to run.
+  if (start_ == 0)
+    return false;
+
+  StringPiece context = const_context;
+  if (context.begin() == NULL)
+    context = text;
+
+  if (!StringPieceContains(context, text)) {
+    LOG(FATAL) << "Bad args: context does not contain text "
+                << reinterpret_cast<const void*>(context.begin())
+                << "+" << context.size() << " "
+                << reinterpret_cast<const void*>(text.begin())
+                << "+" << text.size();
+    return false;
+  }
+
+  // An anchored program can only match when text touches the
+  // corresponding edge of context.
+  if (prog_->anchor_start() && context.begin() != text.begin())
+    return false;
+  if (prog_->anchor_end() && context.end() != text.end())
+    return false;
+  anchored |= prog_->anchor_start();
+  if (prog_->anchor_end()) {
+    longest = true;
+    endmatch_ = true;
+    etext_ = text.end();
+  }
+
+  if (nsubmatch < 0) {
+    LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
+    return false;
+  }
+
+  // Save search parameters.
+  ncapture_ = 2*nsubmatch;
+  longest_ = longest;
+
+  if (nsubmatch == 0) {
+    // We need to maintain match[0], both to distinguish the
+    // longest match (if longest is true) and also to tell
+    // whether we've seen any matches at all.
+    ncapture_ = 2;
+  }
+
+  // Release any array left over from an earlier Search on this object
+  // (match_ is NULL after construction, which delete[] accepts).
+  delete[] match_;
+  match_ = new const char*[ncapture_];
+  matched_ = false;
+  memset(match_, 0, ncapture_*sizeof match_[0]);
+
+  // For debugging prints.
+  btext_ = context.begin();
+
+  if (Debug) {
+    fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
+            text.as_string().c_str(), context.as_string().c_str(), anchored,
+            longest);
+  }
+
+  // Set up search.
+  Threadq* runq = &q0_;
+  Threadq* nextq = &q1_;
+  runq->clear();
+  nextq->clear();
+  const char* bp = context.begin();
+  int c = -1;
+  int wasword = 0;
+
+  // If text starts inside context, the byte before it decides whether
+  // the search begins at a word boundary.
+  if (text.begin() > context.begin()) {
+    c = text.begin()[-1] & 0xFF;
+    wasword = Prog::IsWordChar(c);
+  }
+
+  // Loop over the text, stepping the machine.
+  for (const char* p = text.begin();; p++) {
+    // Check for empty-width specials.
+    int flag = 0;
+
+    // ^ and \A
+    if (p == context.begin())
+      flag |= kEmptyBeginText | kEmptyBeginLine;
+    else if (p <= context.end() && p[-1] == '\n')
+      flag |= kEmptyBeginLine;
+
+    // $ and \z
+    if (p == context.end())
+      flag |= kEmptyEndText | kEmptyEndLine;
+    else if (p < context.end() && p[0] == '\n')
+      flag |= kEmptyEndLine;
+
+    // \b and \B
+    int isword = 0;
+    if (p < context.end())
+      isword = Prog::IsWordChar(p[0] & 0xFF);
+
+    if (isword != wasword)
+      flag |= kEmptyWordBoundary;
+    else
+      flag |= kEmptyNonWordBoundary;
+
+    if (Debug) {
+      fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
+      for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+        Thread* t = i->second;
+        if (t == NULL)
+          continue;
+        fprintf(stderr, " %d%s", t->id,
+                FormatCapture((const char**)t->capture).c_str());
+      }
+      fprintf(stderr, "\n");
+    }
+
+    // Process previous character (waited until now to avoid
+    // repeating the flag computation above).
+    // This is a no-op the first time around the loop, because
+    // runq is empty.
+    int id = Step(runq, nextq, c, flag, p-1);
+    DCHECK_EQ(runq->size(), 0);
+    swap(nextq, runq);
+    nextq->clear();
+    if (id != 0) {
+      // We're done: full match ahead.
+      // Follow the instructions from id to the match, treating every
+      // position on the way as the end of the text.
+      p = text.end();
+      for (;;) {
+        Prog::Inst* ip = prog_->inst(id);
+        switch (ip->opcode()) {
+          default:
+            LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
+            break;
+
+          case kInstCapture:
+            // Only record captures the caller asked for; match_ has
+            // exactly ncapture_ slots.
+            if (ip->cap() < ncapture_)
+              match_[ip->cap()] = p;
+            id = ip->out();
+            continue;
+
+          case kInstNop:
+            id = ip->out();
+            continue;
+
+          case kInstMatch:
+            match_[1] = p;
+            matched_ = true;
+            break;
+
+          case kInstEmptyWidth:
+            if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
+              LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
+              break;
+            }
+            id = ip->out();
+            continue;
+        }
+        break;
+      }
+      break;
+    }
+
+    // The loop runs one position past text.end() so that Step gets to
+    // process the final byte.
+    if (p > text.end())
+      break;
+
+    // Start a new thread if there have not been any matches.
+    // (No point in starting a new thread if there have been
+    // matches, since it would be to the right of the match
+    // we already found.)
+    if (!matched_ && (!anchored || p == text.begin())) {
+      // If there's a required first byte for an unanchored search
+      // and we're not in the middle of any possible matches,
+      // use memchr to search for the byte quickly.
+      if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
+          p < text.end() && (p[0] & 0xFF) != first_byte_) {
+        p = reinterpret_cast<const char*>(memchr(p, first_byte_,
+                                                 text.end() - p));
+        if (p == NULL) {
+          p = text.end();
+          isword = 0;
+        } else {
+          isword = Prog::IsWordChar(p[0] & 0xFF);
+        }
+        // p moved, so the flags computed above no longer apply.
+        flag = Prog::EmptyFlags(context, p);
+      }
+
+      // Steal match storage (cleared but unused as of yet)
+      // temporarily to hold match boundaries for new thread.
+      match_[0] = p;
+      AddToThreadq(runq, start_, flag, p, match_);
+      match_[0] = NULL;
+    }
+
+    // If all the threads have died, stop early.
+    if (runq->size() == 0) {
+      if (Debug)
+        fprintf(stderr, "dead\n");
+      break;
+    }
+
+    // At the end of the text a 0 byte is fed to the machine.
+    if (p == text.end())
+      c = 0;
+    else
+      c = *p & 0xFF;
+    wasword = isword;
+
+    // Will run step(runq, nextq, c, ...) on next iteration.  See above.
+  }
+
+  // Return any threads still queued to the free list.
+  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
+    FreeThread(i->second);
+
+  if (matched_) {
+    for (int i = 0; i < nsubmatch; i++)
+      submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
+    if (Debug)
+      fprintf(stderr, "match (%d,%d)\n",
+              static_cast<int>(match_[0] - btext_),
+              static_cast<int>(match_[1] - btext_));
+    return true;
+  }
+  VLOG(1) << "No matches found";
+  return false;
+}
+
+// Determines whether every successful match must start with one
+// particular byte.  Returns that byte if so, and -1 otherwise.
+int NFA::ComputeFirstByte() {
+  if (start_ == 0)
+    return -1;
+
+  int first = -1;  // the common first byte; -1 while none has been seen
+
+  // Worklist of instructions reachable from start_ without consuming
+  // input.  It grows while it is being walked.
+  SparseSet pending(prog_->size());
+  pending.insert(start_);
+  for (SparseSet::iterator it = pending.begin(); it != pending.end(); ++it) {
+    Prog::Inst* ip = prog_->inst(*it);
+    switch (ip->opcode()) {
+      default:
+        LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
+        break;
+
+      case kInstFail:
+        break;
+
+      case kInstMatch:
+        // A match is reachable without input, so the empty string
+        // matches and there is no first byte.
+        return -1;
+
+      case kInstByteRange: {
+        // The range has to be exactly one byte wide ...
+        if (ip->lo() != ip->hi())
+          return -1;
+        // ... and must not be a case-folded lower-case letter.
+        if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
+          return -1;
+        // It must also agree with any byte recorded earlier.
+        if (first != -1 && first != ip->lo())
+          return -1;
+        first = ip->lo();
+        break;
+      }
+
+      case kInstNop:
+      case kInstCapture:
+      case kInstEmptyWidth:
+        // Keep going.  The ip->empty() flags of kInstEmptyWidth are
+        // deliberately ignored, i.e. every empty-width condition is
+        // assumed to hold, to stay as conservative as possible.
+        if (ip->out())
+          pending.insert(ip->out());
+        break;
+
+      case kInstAlt:
+      case kInstAltMatch:
+        // Both branches have to be examined.
+        if (ip->out())
+          pending.insert(ip->out());
+        if (ip->out1())
+          pending.insert(ip->out1());
+        break;
+    }
+  }
+  return first;
+}
+
+// Runs an NFA search of this program over text within context.
+// A kFullMatch search is forced to be anchored and must end exactly at
+// text.end(); if the caller supplied no match slots for it, a local one
+// is used so that the end of the match can still be checked.
+bool
+Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
+                Anchor anchor, MatchKind kind,
+                StringPiece* match, int nmatch) {
+  if (NFA::Debug)
+    Dump();
+
+  NFA nfa(this);
+  StringPiece sp;
+  const bool full = (kind == kFullMatch);
+  if (full) {
+    anchor = kAnchored;
+    if (nmatch == 0) {
+      match = &sp;
+      nmatch = 1;
+    }
+  }
+
+  const bool anchored = (anchor == kAnchored);
+  const bool longest = (kind != kFirstMatch);
+  if (!nfa.Search(text, context, anchored, longest, match, nmatch))
+    return false;
+  return !full || match[0].end() == text.end();
+}
+
+}  // namespace re2
+
--- a/re2/re2/onepass.cc
+++ b/re2/re2/onepass.cc
@ -0,0 +1,614 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc.
+//
+// Prog::SearchOnePass is an efficient implementation of
+// regular expression search with submatch tracking for
+// what I call "one-pass regular expressions".  (An alternate
+// name might be "backtracking-free regular expressions".)
+//
+// One-pass regular expressions have the property that
+// at each input byte during an anchored match, there may be
+// multiple alternatives but only one can proceed for any
+// given input byte.
+//
+// For example, the regexp /x*yx*/ is one-pass: you read
+// x's until a y, then you read the y, then you keep reading x's.
+// At no point do you have to guess what to do or back up
+// and try a different guess.
+//
+// On the other hand, /x*x/ is not one-pass: when you're
+// looking at an input "x", it's not clear whether you should
+// use it to extend the x* or as the final x.
+//
+// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
+// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
+//
+// A simple intuition for identifying one-pass regular expressions
+// is that it's always immediately obvious when a repetition ends.
+// It must also be immediately obvious which branch of an | to take:
+//
+// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
+//
+// The NFA-based search in nfa.cc does some bookkeeping to
+// avoid the need for backtracking and its associated exponential blowup.
+// But if we have a one-pass regular expression, there is no
+// possibility of backtracking, so there is no need for the
+// extra bookkeeping.  Hence, this code.
+//
+// On a one-pass regular expression, the NFA code in nfa.cc
+// runs at about 1/20 of the backtracking-based PCRE speed.
+// In contrast, the code in this file runs at about the same
+// speed as PCRE.
+//
+// One-pass regular expressions get used a lot when RE is
+// used for parsing simple strings, so it pays off to
+// notice them and handle them efficiently.
+//
+// See also Anne Brüggemann-Klein and Derick Wood,
+// "One-unambiguous regular languages", Information and Computation 142(2).
+
+#include <string.h>
+#include <map>
+#include "util/util.h"
+#include "util/arena.h"
+#include "util/sparse_set.h"
+#include "re2/prog.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+static const int Debug = 0;
+
+// The key insight behind this implementation is that the
+// non-determinism in an NFA for a one-pass regular expression
+// is contained.  To explain what that means, first a
+// refresher about what regular expression programs look like
+// and how the usual NFA execution runs.
+//
+// In a regular expression program, only the kInstByteRange
+// instruction processes an input byte c and moves on to the
+// next byte in the string (it does so if c is in the given range).
+// The kInstByteRange instructions correspond to literal characters
+// and character classes in the regular expression.
+//
+// The kInstAlt instructions are used as wiring to connect the
+// kInstByteRange instructions together in interesting ways when
+// implementing | + and *.
+// The kInstAlt instruction forks execution, like a goto that
+// jumps to ip->out() and ip->out1() in parallel.  Each of the
+// resulting computation paths is called a thread.
+//
+// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
+// are interesting in their own right but like kInstAlt they don't
+// advance the input pointer.  Only kInstByteRange does.
+//
+// The automaton execution in nfa.cc runs all the possible
+// threads of execution in lock-step over the input.  To process
+// a particular byte, each thread gets run until it either dies
+// or finds a kInstByteRange instruction matching the byte.
+// If the latter happens, the thread stops just past the
+// kInstByteRange instruction (at ip->out()) and waits for
+// the other threads to finish processing the input byte.
+// Then, once all the threads have processed that input byte,
+// the whole process repeats.  The kInstAlt state instruction
+// might create new threads during input processing, but no
+// matter what, all the threads stop after a kInstByteRange
+// and wait for the other threads to "catch up".
+// Running in lock step like this ensures that the NFA reads
+// the input string only once.
+//
+// Each thread maintains its own set of capture registers
+// (the string positions at which it executed the kInstCapture
+// instructions corresponding to capturing parentheses in the
+// regular expression).  Repeated copying of the capture registers
+// is the main performance bottleneck in the NFA implementation.
+//
+// A regular expression program is "one-pass" if, no matter what
+// the input string, there is only one thread that makes it
+// past a kInstByteRange instruction at each input byte.  This means
+// that there is in some sense only one active thread throughout
+// the execution.  Other threads might be created during the
+// processing of an input byte, but they are ephemeral: only one
+// thread is left to start processing the next input byte.
+// This is what I meant above when I said the non-determinism
+// was "contained".
+//
+// To execute a one-pass regular expression program, we can build
+// a DFA (no non-determinism) that has at most as many states as
+// the NFA (compare this to the possibly exponential number of states
+// in the general case).  Each state records, for each possible
+// input byte, the next state along with the conditions required
+// before entering that state -- empty-width flags that must be true
+// and capture operations that must be performed.  It also records
+// the set of conditions required to finish a match at that
+// point in the input rather than process the next byte.
+
+// A state in the one-pass NFA (aka DFA) - just an array of actions.
+struct OneState;
+
+// A state in the one-pass NFA - just an array of actions indexed
+// by the bytemap_[] of the next input byte.  (The bytemap
+// maps next input bytes into equivalence classes, to reduce
+// the memory footprint.)
+//
+// action[] is declared with one element but is really variable-length:
+// IsOnePass sizes each state as
+//   sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32)
+// so that action[0..bytemap_range_-1] are all valid.  States are
+// therefore laid out back to back in a raw byte buffer and must be
+// addressed through IndexToNode rather than by array indexing.
+struct OneState {
+  uint32 matchcond;   // conditions to match right now.
+  uint32 action[1];   // per-byte-class action; see the encoding described below.
+};
+
+// The uint32 conditions in the action are a combination of
+// condition and capture bits and the next state.  The bottom 16 bits
+// are the condition and capture bits, and the top 16 are the index of
+// the next state.
+//
+// Bits 0-5 are the empty-width flags from prog.h.
+// Bit 6 is kMatchWins, which means the match takes
+// priority over moving to next in a first-match search.
+// The remaining bits mark capture registers that should
+// be set to the current input position.  The capture bits
+// start at index 2, since the search loop can take care of
+// cap[0], cap[1] (the overall match position).
+// That means we can handle up to 4 capturing parens ($1 through $4),
+// plus $0 for the overall match.
+// No input position can satisfy both kEmptyWordBoundary
+// and kEmptyNonWordBoundary, so we can use that as a sentinel
+// instead of needing an extra bit.
+
+// Layout of a 32-bit action word, low to high:
+//   bits 0..5    empty-width flags
+//   bit  6       kMatchWins
+//   bits 7..14   capture bits for cap[2]..cap[9]
+//   bits 16..31  index of the next state
+static const int    kIndexShift    = 16;  // number of bits below index
+static const int    kEmptyShift   = 6;  // number of empty flags in prog.h
+static const int    kRealCapShift = kEmptyShift + 1;  // = 7: first capture bit, just above kMatchWins
+static const int    kRealMaxCap   = (kIndexShift - kRealCapShift) / 2 * 2;  // = 8: capture bits available, rounded down to an even count
+
+// Parameters used to skip over cap[0], cap[1].
+static const int    kCapShift     = kRealCapShift - 2;  // = 5: so that (1 << kCapShift << i) is the bit for cap[i], i >= 2
+static const int    kMaxCap       = kRealMaxCap + 2;  // = 10: total capture pointers, including cap[0] and cap[1]
+
+static const uint32 kMatchWins    = 1 << kEmptyShift;  // bit 6
+static const uint32 kCapMask      = ((1 << kRealMaxCap) - 1) << kRealCapShift;  // bits 7..14
+
+// Sentinel meaning "no transition / no match possible": both word-boundary
+// flags set at once, which no input position can satisfy.
+static const uint32 kImpossible   = kEmptyWordBoundary | kEmptyNonWordBoundary;
+
+// Compile-time sanity checks that the bit-layout constants in this
+// file agree with the declarations in prog.h.  The function exists only
+// as a place to put the assertions; nothing ever calls it.
+void OnePass_Checks() {
+  // kMaxCap is a count of capture pointers, while
+  // kMaxOnePassCapture is a count of (begin, end) pairs.
+  COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
+                 kMaxCap_disagrees_with_kMaxOnePassCapture);
+  // The empty-width flags must occupy exactly the low kEmptyShift bits.
+  COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
+                 kEmptyShift_disagrees_with_kEmptyAllFlags);
+}
+
+// Reports whether every empty-width flag requested in cond holds at
+// position p within context.  Bits of cond outside kEmptyAllFlags
+// (kMatchWins, capture bits, next-state index) are ignored.
+static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
+  const uint32 holds = Prog::EmptyFlags(context, p);
+  const uint32 unmet = cond & kEmptyAllFlags & ~holds;
+  return unmet == 0;
+}
+
+// For each capture register i in [2, ncap) whose bit is set in cond,
+// records the current input position p in cap[i].  cap[0] and cap[1]
+// have no bits in cond; the search loop maintains them itself.
+static void ApplyCaptures(uint32 cond, const char* p,
+                          const char** cap, int ncap) {
+  for (int reg = 2; reg < ncap; reg++) {
+    const uint32 bit = 1 << (kCapShift + reg);
+    if (cond & bit)
+      cap[reg] = p;
+  }
+}
+
+// Returns the address of state number nodeindex inside the raw byte
+// buffer nodes, where every state occupies statesize bytes.
+// Equivalent to (OneState*)(nodes + statesize*nodeindex); the
+// const_cast strips the volatile qualifier the search loop puts on nodes.
+static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
+                                    int nodeindex) {
+  volatile uint8* addr = nodes + statesize*nodeindex;
+  return reinterpret_cast<OneState*>(const_cast<uint8*>(addr));
+}
+
+// Runs the one-pass automaton (onepass_start_ / onepass_nodes_, as
+// built by IsOnePass) over text and reports whether it matched.
+//
+//   text, const_context - input and its surrounding context; a context
+//       with a NULL begin() means "same as text".
+//   anchor, kind - only anchored searches or full matches are supported.
+//   match, nmatch - on success, match[0..nmatch) receive the overall
+//       match and submatches.  nmatch must not exceed kMaxCap/2
+//       (Prog::kMaxOnePassCapture), because the capture registers
+//       live in fixed-size arrays on the stack.
+//
+// Returns false if there is no match or if the arguments are unsupported.
+bool Prog::SearchOnePass(const StringPiece& text,
+                         const StringPiece& const_context,
+                         Anchor anchor, MatchKind kind,
+                         StringPiece* match, int nmatch) {
+  if (anchor != kAnchored && kind != kFullMatch) {
+    LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
+    return false;
+  }
+
+  // Make sure we have at least cap[1],
+  // because we use it to tell if we matched.
+  int ncap = 2*nmatch;
+  if (ncap < 2)
+    ncap = 2;
+
+  // cap[] and matchcap[] below hold only kMaxCap entries; refuse
+  // requests that would write past the end of them.
+  if (ncap > kMaxCap) {
+    LOG(DFATAL) << "Too many submatches for SearchOnePass: " << nmatch;
+    return false;
+  }
+
+  // cap[] tracks the registers of the single live thread.
+  const char* cap[kMaxCap];
+  for (int i = 0; i < ncap; i++)
+    cap[i] = NULL;
+
+  // matchcap[] is a snapshot of the registers at the best match so far.
+  const char* matchcap[kMaxCap];
+  for (int i = 0; i < ncap; i++)
+    matchcap[i] = NULL;
+
+  StringPiece context = const_context;
+  if (context.begin() == NULL)
+    context = text;
+  if (anchor_start() && context.begin() != text.begin())
+    return false;
+  if (anchor_end() && context.end() != text.end())
+    return false;
+  if (anchor_end())
+    kind = kFullMatch;
+
+  // State and act are marked volatile to
+  // keep the compiler from re-ordering the
+  // memory accesses walking over the NFA.
+  // This is worth about 5%.
+  volatile OneState* state = onepass_start_;
+  volatile uint8* nodes = onepass_nodes_;
+  volatile uint32 statesize = onepass_statesize_;
+  uint8* bytemap = bytemap_;
+  const char* bp = text.begin();
+  const char* ep = text.end();
+  const char* p;
+  bool matched = false;
+  matchcap[0] = bp;
+  cap[0] = bp;
+  uint32 nextmatchcond = state->matchcond;
+  for (p = bp; p < ep; p++) {
+    int c = bytemap[*p & 0xFF];
+    uint32 matchcond = nextmatchcond;
+    uint32 cond = state->action[c];
+
+    // Determine whether we can reach act->next.
+    // If so, advance state and nextmatchcond.
+    if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
+      uint32 nextindex = cond >> kIndexShift;
+      state = IndexToNode(nodes, statesize, nextindex);
+      nextmatchcond = state->matchcond;
+    } else {
+      state = NULL;
+      nextmatchcond = kImpossible;
+    }
+
+    // This code section is carefully tuned.
+    // The goto sequence is about 10% faster than the
+    // obvious rewrite as a large if statement in the
+    // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
+
+    // Saving the match capture registers is expensive.
+    // Is this intermediate match worth thinking about?
+
+    // Not if we want a full match.
+    if (kind == kFullMatch)
+      goto skipmatch;
+
+    // Not if it's impossible.
+    if (matchcond == kImpossible)
+      goto skipmatch;
+
+    // Not if the possible match is beaten by the certain
+    // match at the next byte.  When this test is useless
+    // (e.g., HTTPPartialMatchRE2) it slows the loop by
+    // about 10%, but when it avoids work (e.g., DotMatchRE2),
+    // it cuts the loop execution by about 45%.
+    if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
+      goto skipmatch;
+
+    // Finally, the match conditions must be satisfied.
+    if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
+      for (int i = 2; i < 2*nmatch; i++)
+        matchcap[i] = cap[i];
+      if (nmatch > 1 && (matchcond & kCapMask))
+        ApplyCaptures(matchcond, p, matchcap, ncap);
+      matchcap[1] = p;
+      matched = true;
+
+      // If we're in longest match mode, we have to keep
+      // going and see if we find a longer match.
+      // In first match mode, we can stop if the match
+      // takes priority over the next state for this input byte.
+      // That bit is per-input byte and thus in cond, not matchcond.
+      if (kind == kFirstMatch && (cond & kMatchWins))
+        goto done;
+    }
+
+  skipmatch:
+    // The single thread died on this byte: nothing more can match.
+    if (state == NULL)
+      goto done;
+    if ((cond & kCapMask) && nmatch > 1)
+      ApplyCaptures(cond, p, cap, ncap);
+  }
+
+  // Look for match at end of input.
+  // (state is non-NULL here: a NULL state always exits via "done".)
+  {
+    uint32 matchcond = state->matchcond;
+    if (matchcond != kImpossible &&
+        ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
+      if (nmatch > 1 && (matchcond & kCapMask))
+        ApplyCaptures(matchcond, p, cap, ncap);
+      for (int i = 2; i < ncap; i++)
+        matchcap[i] = cap[i];
+      matchcap[1] = p;
+      matched = true;
+    }
+  }
+
+done:
+  if (!matched)
+    return false;
+  // Convert the (begin, end) pointer pairs into StringPieces.
+  for (int i = 0; i < nmatch; i++)
+    match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
+  return true;
+}
+
+
+// Analysis to determine whether a given regexp program is one-pass.
+
+// Adds instruction id to the work queue q unless it is already there.
+// Returns false only when id was already queued.  Instruction id 0 is
+// never queued, but the call still reports success (pretends to add it).
+typedef SparseSet Instq;
+static bool AddQ(Instq *q, int id) {
+  if (id != 0) {
+    if (q->contains(id))
+      return false;
+    q->insert(id);
+  }
+  return true;
+}
+
+// Entry on the manual exploration stack used by Prog::IsOnePass:
+// an instruction still to be visited, together with the condition
+// bits accumulated on the input-free path that led to it.
+struct InstCond {
+  int id;       // instruction id, as accepted by Prog::inst()
+  uint32 cond;  // empty-width / capture / kMatchWins bits gathered so far
+};
+
+// Returns whether this is a one-pass program; that is,
+// returns whether it is safe to use SearchOnePass on this program.
+// These conditions must be true for any instruction ip:
+//
+//   (1) for any other Inst nip, there is at most one input-free
+//       path from ip to nip.
+//   (2) there is at most one kInstByteRange instruction reachable from
+//       ip that matches any particular byte c.
+//   (3) there is at most one input-free path from ip to a kInstMatch
+//       instruction.
+//
+// This is actually just a conservative approximation: it might
+// return false when the answer is true, when kInstEmptyWidth
+// instructions are involved.
+// Constructs and saves corresponding one-pass NFA on success.
+// The answer is computed once and cached via did_onepass_.
+bool Prog::IsOnePass() {
+  if (did_onepass_)
+    return onepass_start_ != NULL;
+  did_onepass_ = true;
+
+  if (start() == 0)  // no match
+    return false;
+
+  // Steal memory for the one-pass NFA from the overall DFA budget.
+  // Willing to use at most 1/4 of the DFA budget (heuristic).
+  // Limit max node count to 65000 as a conservative estimate to
+  // avoid overflowing 16-bit node index in encoding.
+  int maxnodes = 2 + byte_inst_count_;
+  int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
+  if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
+    return false;
+
+  // Flood the graph starting at the start state, and check
+  // that in each reachable state, each possible byte leads
+  // to a unique next state.
+  int size = this->size();
+  InstCond *stack = new InstCond[size];
+
+  int* nodebyid = new int[size];  // indexed by ip
+  memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);  // every entry = -1
+
+  uint8* nodes = new uint8[maxnodes*statesize];
+  uint8* nodep = nodes;
+
+  // tovisit: instructions that start a one-pass state (grows as we go).
+  // workq: instructions seen while flooding from the current state.
+  Instq tovisit(size), workq(size);
+  AddQ(&tovisit, start());
+  nodebyid[start()] = 0;
+  nodep += statesize;
+  int nalloc = 1;
+  for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
+    int id = *it;
+    int nodeindex = nodebyid[id];
+    OneState* node = IndexToNode(nodes, statesize, nodeindex);
+
+    // Flood graph using manual stack, filling in actions as found.
+    // Default is none.
+    for (int b = 0; b < bytemap_range_; b++)
+      node->action[b] = kImpossible;
+    node->matchcond = kImpossible;
+
+    workq.clear();
+    bool matched = false;
+    int nstack = 0;
+    stack[nstack].id = id;
+    stack[nstack++].cond = 0;
+    while (nstack > 0) {
+      int id = stack[--nstack].id;
+      Prog::Inst* ip = inst(id);
+      uint32 cond = stack[nstack].cond;
+      switch (ip->opcode()) {
+        case kInstAltMatch:
+          // TODO(rsc): Ignoring kInstAltMatch optimization.
+          // Should implement it in this engine, but it's subtle.
+          // Fall through.
+        case kInstAlt:
+          // If already on work queue, (1) is violated: bail out.
+          if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
+            goto fail;
+          // Push out1 first so that out is explored first.
+          stack[nstack].id = ip->out1();
+          stack[nstack++].cond = cond;
+          stack[nstack].id = ip->out();
+          stack[nstack++].cond = cond;
+          break;
+
+        case kInstByteRange: {
+          // Find (or allocate) the one-pass state for the successor.
+          int nextindex = nodebyid[ip->out()];
+          if (nextindex == -1) {
+            if (nalloc >= maxnodes) {
+              if (Debug)
+                LOG(ERROR)
+                  << StringPrintf("Not OnePass: hit node limit %d > %d",
+                                  nalloc, maxnodes);
+              goto fail;
+            }
+            nextindex = nalloc;
+            nodep += statesize;
+            nodebyid[ip->out()] = nextindex;
+            nalloc++;
+            AddQ(&tovisit, ip->out());
+          }
+          // A match found earlier in this flood takes priority
+          // over consuming this byte.
+          if (matched)
+            cond |= kMatchWins;
+          for (int c = ip->lo(); c <= ip->hi(); c++) {
+            int b = bytemap_[c];
+            c = unbytemap_[b];  // last c in byte class
+            uint32 act = node->action[b];
+            uint32 newact = (nextindex << kIndexShift) | cond;
+            if ((act & kImpossible) == kImpossible) {
+              node->action[b] = newact;
+            } else if (act != newact) {
+              // (2) is violated: two different actions for one byte class.
+              if (Debug) {
+                LOG(ERROR)
+                  << StringPrintf("Not OnePass: conflict on byte "
+                                  "%#x at state %d",
+                                  c, *it);
+              }
+              goto fail;
+            }
+          }
+          if (ip->foldcase()) {
+            // Also cover the upper-case counterparts of any
+            // lower-case letters in [lo, hi].
+            Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
+            Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
+            for (int c = lo; c <= hi; c++) {
+              int b = bytemap_[c];
+              c = unbytemap_[b];  // last c in class
+              uint32 act = node->action[b];
+              uint32 newact = (nextindex << kIndexShift) | cond;
+              if ((act & kImpossible) == kImpossible) {
+                node->action[b] = newact;
+              } else if (act != newact) {
+                if (Debug) {
+                  LOG(ERROR)
+                    << StringPrintf("Not OnePass: conflict on byte "
+                                    "%#x at state %d",
+                                    c, *it);
+                }
+                goto fail;
+              }
+            }
+          }
+          break;
+        }
+
+        case kInstCapture:
+          // Captures beyond kMaxCap have no bit in cond and are dropped.
+          if (ip->cap() < kMaxCap)
+            cond |= (1 << kCapShift) << ip->cap();
+          goto QueueEmpty;
+
+        case kInstEmptyWidth:
+          cond |= ip->empty();
+          goto QueueEmpty;
+
+        case kInstNop:
+        QueueEmpty:
+          // kInstCapture and kInstNop always proceed to ip->out().
+          // kInstEmptyWidth only sometimes proceeds to ip->out(),
+          // but as a conservative approximation we assume it always does.
+          // We could be a little more precise by looking at what c
+          // is, but that seems like overkill.
+
+          // If already on work queue, (1) is violated: bail out.
+          if (!AddQ(&workq, ip->out())) {
+            if (Debug) {
+              LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
+                                         " %d -> %d\n",
+                                         *it, ip->out());
+            }
+            goto fail;
+          }
+          stack[nstack].id = ip->out();
+          stack[nstack++].cond = cond;
+          break;
+
+        case kInstMatch:
+          if (matched) {
+            // (3) is violated
+            if (Debug) {
+              LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
+                                         " from %d\n", *it);
+            }
+            goto fail;
+          }
+          matched = true;
+          node->matchcond = cond;
+          break;
+
+        case kInstFail:
+          break;
+      }
+    }
+  }
+
+  if (Debug) {  // For debugging, dump one-pass NFA to LOG(ERROR).
+    string dump = "prog dump:\n" + Dump() + "node dump\n";
+    map<int, int> idmap;
+    for (int i = 0; i < size; i++)
+      if (nodebyid[i] != -1)
+        idmap[nodebyid[i]] = i;
+
+    StringAppendF(&dump, "byte ranges:\n");
+    int i = 0;
+    for (int b = 0; b < bytemap_range_; b++) {
+      int lo = i;
+      // Bound the scan by the 256 possible byte values so that the
+      // class containing 0xFF does not run off the end of bytemap_.
+      while (i < 256 && bytemap_[i] == b)
+        i++;
+      StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
+    }
+
+    for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
+      int id = *it;
+      int nodeindex = nodebyid[id];
+      if (nodeindex == -1)
+        continue;
+      OneState* node = IndexToNode(nodes, statesize, nodeindex);
+      StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
+                    nodeindex, id, node->matchcond);
+      for (int i = 0; i < bytemap_range_; i++) {
+        if ((node->action[i] & kImpossible) == kImpossible)
+          continue;
+        StringAppendF(&dump, "  %d cond %#x -> %d id=%d\n",
+                      i, node->action[i] & 0xFFFF,
+                      node->action[i] >> kIndexShift,
+                      idmap[node->action[i] >> kIndexShift]);
+      }
+    }
+    LOG(ERROR) << dump;
+  }
+
+  // Overallocated earlier; cut down to actual size.
+  nodep = new uint8[nalloc*statesize];
+  memmove(nodep, nodes, nalloc*statesize);
+  delete[] nodes;
+  nodes = nodep;
+
+  onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
+  onepass_nodes_ = nodes;
+  onepass_statesize_ = statesize;
+  dfa_mem_ -= nalloc*statesize;
+
+  delete[] stack;
+  delete[] nodebyid;
+  return true;
+
+fail:
+  // Not one-pass (or out of budget): release all scratch memory.
+  delete[] stack;
+  delete[] nodebyid;
+  delete[] nodes;
+  return false;
+}
+
+}  // namespace re2
--- a/re2/re2/parse.cc
+++ b/re2/re2/parse.cc
--- a/re2/re2/perl_groups.cc
+++ b/re2/re2/perl_groups.cc
@ -0,0 +1,119 @@
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+static URange16 code1[] = {  /* \d */
+	{ 0x30, 0x39 },
+};
+static URange16 code2[] = {  /* \s */
+	{ 0x9, 0xa },
+	{ 0xc, 0xd },
+	{ 0x20, 0x20 },
+};
+static URange16 code3[] = {  /* \w */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x5a },
+	{ 0x5f, 0x5f },
+	{ 0x61, 0x7a },
+};
+UGroup perl_groups[] = {
+	{ "\\d", +1, code1, 1 },
+	{ "\\D", -1, code1, 1 },
+	{ "\\s", +1, code2, 3 },
+	{ "\\S", -1, code2, 3 },
+	{ "\\w", +1, code3, 4 },
+	{ "\\W", -1, code3, 4 },
+};
+int num_perl_groups = 6;
+static URange16 code4[] = {  /* [:alnum:] */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x5a },
+	{ 0x61, 0x7a },
+};
+static URange16 code5[] = {  /* [:alpha:] */
+	{ 0x41, 0x5a },
+	{ 0x61, 0x7a },
+};
+static URange16 code6[] = {  /* [:ascii:] */
+	{ 0x0, 0x7f },
+};
+static URange16 code7[] = {  /* [:blank:] */
+	{ 0x9, 0x9 },
+	{ 0x20, 0x20 },
+};
+static URange16 code8[] = {  /* [:cntrl:] */
+	{ 0x0, 0x1f },
+	{ 0x7f, 0x7f },
+};
+static URange16 code9[] = {  /* [:digit:] */
+	{ 0x30, 0x39 },
+};
+static URange16 code10[] = {  /* [:graph:] */
+	{ 0x21, 0x7e },
+};
+static URange16 code11[] = {  /* [:lower:] */
+	{ 0x61, 0x7a },
+};
+static URange16 code12[] = {  /* [:print:] */
+	{ 0x20, 0x7e },
+};
+static URange16 code13[] = {  /* [:punct:] */
+	{ 0x21, 0x2f },
+	{ 0x3a, 0x40 },
+	{ 0x5b, 0x60 },
+	{ 0x7b, 0x7e },
+};
+static URange16 code14[] = {  /* [:space:] */
+	{ 0x9, 0xd },
+	{ 0x20, 0x20 },
+};
+static URange16 code15[] = {  /* [:upper:] */
+	{ 0x41, 0x5a },
+};
+static URange16 code16[] = {  /* [:word:] */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x5a },
+	{ 0x5f, 0x5f },
+	{ 0x61, 0x7a },
+};
+static URange16 code17[] = {  /* [:xdigit:] */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x46 },
+	{ 0x61, 0x66 },
+};
+UGroup posix_groups[] = {
+	{ "[:alnum:]", +1, code4, 3 },
+	{ "[:^alnum:]", -1, code4, 3 },
+	{ "[:alpha:]", +1, code5, 2 },
+	{ "[:^alpha:]", -1, code5, 2 },
+	{ "[:ascii:]", +1, code6, 1 },
+	{ "[:^ascii:]", -1, code6, 1 },
+	{ "[:blank:]", +1, code7, 2 },
+	{ "[:^blank:]", -1, code7, 2 },
+	{ "[:cntrl:]", +1, code8, 2 },
+	{ "[:^cntrl:]", -1, code8, 2 },
+	{ "[:digit:]", +1, code9, 1 },
+	{ "[:^digit:]", -1, code9, 1 },
+	{ "[:graph:]", +1, code10, 1 },
+	{ "[:^graph:]", -1, code10, 1 },
+	{ "[:lower:]", +1, code11, 1 },
+	{ "[:^lower:]", -1, code11, 1 },
+	{ "[:print:]", +1, code12, 1 },
+	{ "[:^print:]", -1, code12, 1 },
+	{ "[:punct:]", +1, code13, 4 },
+	{ "[:^punct:]", -1, code13, 4 },
+	{ "[:space:]", +1, code14, 2 },
+	{ "[:^space:]", -1, code14, 2 },
+	{ "[:upper:]", +1, code15, 1 },
+	{ "[:^upper:]", -1, code15, 1 },
+	{ "[:word:]", +1, code16, 4 },
+	{ "[:^word:]", -1, code16, 4 },
+	{ "[:xdigit:]", +1, code17, 3 },
+	{ "[:^xdigit:]", -1, code17, 3 },
+};
+int num_posix_groups = 28;
+
+}  // namespace re2
--- a/re2/re2/prefilter.cc
+++ b/re2/re2/prefilter.cc
@ -0,0 +1,671 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/util.h"
+#include "re2/prefilter.h"
+#include "re2/re2.h"
+#include "re2/unicode_casefold.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+static const int Trace = false;
+
+typedef set<string>::iterator SSIter;
+typedef set<string>::const_iterator ConstSSIter;
+
+static int alloc_id = 100000;  // Used for debugging.
+// Constructs a Prefilter node of kind op.  Only AND and OR nodes get a
+// child vector; every other kind leaves subs_ NULL.  Each node also
+// receives a sequential alloc_id_ for debugging output.
+Prefilter::Prefilter(Op op) {
+  op_ = op;
+  const bool has_children = (op_ == AND || op_ == OR);
+  subs_ = has_children ? new vector<Prefilter*> : NULL;
+
+  alloc_id_ = alloc_id++;
+  VLOG(10) << "alloc_id: " << alloc_id_;
+}
+
+// Destroys a Prefilter together with every child it still owns.
+Prefilter::~Prefilter() {
+  VLOG(10) << "Deleted: " << alloc_id_;
+  if (subs_ == NULL)
+    return;
+
+  for (vector<Prefilter*>::iterator it = subs_->begin();
+       it != subs_->end(); ++it)
+    delete *it;
+  delete subs_;
+  subs_ = NULL;
+}
+
+// Collapses degenerate AND/OR nodes and returns the node to use in
+// place of this one:
+//   - an AND/OR with no children becomes ALL/NONE in place;
+//   - an AND/OR with a single child is deleted and replaced by that
+//     child (itself simplified);
+//   - anything else is returned unchanged.
+// Callers must use the returned pointer, since "this" may have been deleted.
+Prefilter* Prefilter::Simplify() {
+  const bool is_and = (op_ == AND);
+  if (!is_and && op_ != OR)
+    return this;
+
+  if (subs_->empty()) {
+    // AND of nothing is true; OR of nothing is false.
+    op_ = is_and ? ALL : NONE;
+    return this;
+  }
+
+  if (subs_->size() > 1)
+    return this;
+
+  // Exactly one subnode: throw away the wrapper.  Detach the child
+  // first so that the destructor does not delete it.
+  Prefilter* only = (*subs_)[0];
+  subs_->clear();
+  delete this;
+  return only->Simplify();
+}
+
+// Builds the combination "a op b", where op is AND or OR.
+// Ownership of both arguments passes to this function: each one either
+// becomes part of the returned Prefilter or is deleted.
+// Goes to some lengths to keep the resulting tree flat and small.
+Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
+  // Rewrite degenerate operands first.
+  a = a->Simplify();
+  b = b->Simplify();
+
+  // Canonicalize so that a->op <= b->op.
+  if (a->op() > b->op()) {
+    Prefilter* lower = b;
+    b = a;
+    a = lower;
+  }
+
+  // Trivial cases.  ALL and NONE are the smallest opcodes, so after
+  // the canonicalization above only a needs to be examined.
+  //    ALL AND b  = b
+  //    NONE OR b  = b
+  //    ALL OR b   = ALL
+  //    NONE AND b = NONE
+  if (a->op() == ALL || a->op() == NONE) {
+    const bool a_is_identity =
+        (a->op() == ALL) ? (op == AND) : (op == OR);
+    Prefilter* keep = a_is_identity ? b : a;
+    Prefilter* drop = a_is_identity ? a : b;
+    delete drop;
+    return keep;
+  }
+
+  // Both operands already have the requested op: splice b's children
+  // onto a and discard the emptied b.
+  if (a->op() == op && b->op() == op) {
+    a->subs()->insert(a->subs()->end(),
+                      b->subs()->begin(), b->subs()->end());
+    b->subs()->clear();
+    delete b;
+    return a;
+  }
+
+  // Exactly one operand has the requested op: add the other one to it.
+  if (a->op() == op) {
+    a->subs()->push_back(b);
+    return a;
+  }
+  if (b->op() == op) {
+    b->subs()->push_back(a);
+    return b;
+  }
+
+  // Neither does: wrap both in a fresh node.
+  Prefilter* combined = new Prefilter(op);
+  combined->subs()->push_back(a);
+  combined->subs()->push_back(b);
+  return combined;
+}
+
+// Returns the conjunction of a and b; forwards to AndOr with op AND.
+Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
+  return AndOr(AND, a, b);
+}
+
+// Returns the disjunction of a and b; forwards to AndOr with op OR.
+Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
+  return AndOr(OR, a, b);
+}
+
+// Removes from *ss every string that contains another, earlier-sorted
+// member of the set as a substring.
+static void SimplifyStringSet(set<string> *ss) {
+  // Redundant strings add nothing.  For example, if we know "ab" is a
+  // required string, then it doesn't help at all to know that "abc" is
+  // also a required string, so "abc" is deleted: when a string search
+  // is used to filter regexps, matching "ab" already makes this regexp
+  // a candidate, so further matching "abc" is redundant.
+
+  for (SSIter needle = ss->begin(); needle != ss->end(); ++needle) {
+    SSIter hay = needle;
+    ++hay;
+    while (hay != ss->end()) {
+      if (hay->find(*needle) != string::npos) {
+        // Post-increment steps past the element before it is erased,
+        // so hay stays valid.
+        ss->erase(hay++);
+      } else {
+        ++hay;
+      }
+    }
+  }
+}
+
+// Builds a Prefilter that is the OR of one atom per string in *ss,
+// after removing redundant strings from the set (which modifies *ss).
+// Returns NULL when the set is empty; otherwise the caller owns the result.
+Prefilter* Prefilter::OrStrings(set<string>* ss) {
+  SimplifyStringSet(ss);
+  if (ss->empty())
+    return NULL;
+
+  // Start from NONE, which AndOr treats as the identity for OR.
+  Prefilter* result = new Prefilter(NONE);
+  for (SSIter it = ss->begin(); it != ss->end(); ++it) {
+    Prefilter* atom = FromString(*it);
+    result = Or(result, atom);
+  }
+  return result;
+}
+
+// Returns the lower-case form of r.  Runes below Runeself are handled
+// directly ('A'..'Z' only); the rest go through the unicode_tolower
+// case-folding table and are returned unchanged when no entry applies.
+static Rune ToLowerRune(Rune r) {
+  if (r >= Runeself) {
+    CaseFold *fold = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
+    const bool applies = (fold != NULL && r >= fold->lo);
+    return applies ? ApplyFold(fold, r) : r;
+  }
+
+  const bool is_upper = ('A' <= r && r <= 'Z');
+  if (is_upper)
+    r += 'a' - 'A';
+  return r;
+}
+
+// Returns a newly allocated ATOM Prefilter whose atom is str.
+// The caller owns the result.
+Prefilter* Prefilter::FromString(const string& str) {
+  Prefilter* atom = new Prefilter(Prefilter::ATOM);
+  atom->atom_ = str;
+  return atom;
+}
+
+// Information about a regexp used during computation of Prefilter.
+// Can be thought of as information about the set of strings matching
+// the given regular expression.
+//
+// An Info is in one of two modes: "exact", where exact_ lists the
+// matching strings, or "inexact", where match_ holds a Prefilter that
+// every match must satisfy.  See is_exact_ below.
+class Prefilter::Info {
+ public:
+  // Creates an inexact Info with no match Prefilter.
+  Info();
+  // Deletes match_ if it is still owned (see TakeMatch).
+  ~Info();
+
+  // More constructors.  They delete their Info* arguments.
+  // NOTE(review): names suggest one factory per regexp operator
+  // (alternation, concatenation, repetition, literal, ...); the
+  // definitions are mostly outside this view -- confirm there.
+  static Info* Alt(Info* a, Info* b);
+  static Info* Concat(Info* a, Info* b);
+  static Info* And(Info* a, Info* b);
+  static Info* Star(Info* a);
+  static Info* Plus(Info* a);
+  static Info* Quest(Info* a);
+  static Info* EmptyString();
+  static Info* NoMatch();
+  static Info* AnyChar();
+  static Info* CClass(CharClass* cc);
+  static Info* Literal(Rune r);
+  static Info* AnyMatch();
+
+  // Format Info as a string.
+  string ToString();
+
+  // Caller takes ownership of the Prefilter.
+  // An exact Info is first converted to an OR of its strings.
+  Prefilter* TakeMatch();
+
+  // Mutable access to the exact string set.
+  set<string>& exact() { return exact_; }
+
+  // Whether exact_ (rather than match_) currently describes the matches.
+  bool is_exact() const { return is_exact_; }
+
+  class Walker;
+
+ private:
+  // The matching strings; meaningful only while is_exact_ is true.
+  set<string> exact_;
+
+  // When is_exact_ is true, the strings that match
+  // are placed in exact_. When it is no longer an exact
+  // set of strings that match this RE, then is_exact_
+  // is false and the match_ contains the required match
+  // criteria.
+  bool is_exact_;
+
+  // Accumulated Prefilter query that any
+  // match for this regexp is guaranteed to match.
+  // Owned by this Info until TakeMatch hands it to the caller.
+  Prefilter* match_;
+};
+
+
+// A fresh Info is inexact and has no accumulated match yet.
+Prefilter::Info::Info() : is_exact_(false), match_(NULL) {}
+
+// Releases whatever prefilter is still held in match_ (if any).
+Prefilter::Info::~Info() { delete match_; }
+
+// Transfers the accumulated prefilter to the caller and leaves match_ NULL.
+// An exact Info is first converted: its string set becomes an OR of atoms
+// (via OrStrings) and the Info stops being exact.
+Prefilter* Prefilter::Info::TakeMatch() {
+  if (is_exact_) {
+    match_ = Prefilter::OrStrings(&exact_);
+    is_exact_ = false;
+  }
+
+  Prefilter* released = match_;
+  match_ = NULL;
+  return released;
+}
+
+// Formats this Info for debugging: the comma-separated exact strings when
+// exact, otherwise the debug string of match_ (empty when there is none).
+string Prefilter::Info::ToString() {
+  // Some callers iterate over child Infos that may be NULL without
+  // checking first, so a NULL receiver is tolerated here.
+  // NOTE(review): testing `this` against NULL relies on undefined
+  // behaviour; removing it requires auditing the callers first.
+  if (this == NULL)
+    return "";
+
+  if (!is_exact_) {
+    if (match_)
+      return match_->DebugString();
+    return "";
+  }
+
+  string joined;
+  bool first = true;
+  for (set<string>::iterator it = exact_.begin(); it != exact_.end(); ++it) {
+    if (!first)
+      joined += ",";
+    first = false;
+    joined += *it;
+  }
+  return joined;
+}
+
+// Inserts every string of src into *dst.
+static void CopyIn(const set<string>& src, set<string>* dst) {
+  dst->insert(src.begin(), src.end());
+}
+
+// Adds to *dst the concatenation prefix+suffix for every prefix in a
+// and every suffix in b (the cross-product of the two sets).
+static void CrossProduct(const set<string>& a,
+                         const set<string>& b,
+                         set<string>* dst) {
+  for (ConstSSIter prefix = a.begin(); prefix != a.end(); ++prefix) {
+    for (ConstSSIter suffix = b.begin(); suffix != b.end(); ++suffix) {
+      dst->insert(*prefix + *suffix);
+    }
+  }
+}
+
+// Concatenates a and b, both of which must be exact sets, into a new
+// exact Info holding their cross-product. a and b are deleted.
+// A NULL a means "nothing accumulated yet": b is returned untouched.
+Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
+  if (a == NULL)
+    return b;
+  DCHECK(a->is_exact_);
+  DCHECK(b && b->is_exact_);
+
+  Info* product = new Info();
+  CrossProduct(a->exact_, b->exact_, &product->exact_);
+  product->is_exact_ = true;
+
+  delete a;
+  delete b;
+  return product;
+}
+
+// Builds an inexact Info for the concatenation ab by AND-ing the
+// prefilters of a and b. Used when a or b is not exact, or when the
+// exact cross product is likely to be too big. If either argument is
+// NULL the other one is returned as is; otherwise both are deleted.
+Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
+  if (a == NULL)
+    return b;
+  if (b == NULL)
+    return a;
+
+  Info* conjunction = new Info();
+  conjunction->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
+  conjunction->is_exact_ = false;
+
+  delete a;
+  delete b;
+  return conjunction;
+}
+
+// Builds the Info for the alternation a|b. Both a and b are deleted.
+Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
+  Info* either = new Info();
+
+  const bool both_exact = a->is_exact_ && b->is_exact_;
+  if (!both_exact) {
+    // At least one side is inexact. TakeMatch() turns an exact side
+    // into a prefilter, and the result is the OR of the two prefilters.
+    either->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
+    either->is_exact_ = false;
+  } else {
+    // Union of two exact sets is still exact.
+    CopyIn(a->exact_, &either->exact_);
+    CopyIn(b->exact_, &either->exact_);
+    either->is_exact_ = true;
+  }
+
+  delete a;
+  delete b;
+  return either;
+}
+
+// Builds the Info for a?: an inexact Info whose prefilter is ALL.
+// The argument is deleted without being inspected.
+Prefilter::Info* Prefilter::Info::Quest(Info *a) {
+  Info* optional = new Info();
+  optional->match_ = new Prefilter(ALL);
+  optional->is_exact_ = false;
+
+  delete a;
+  return optional;
+}
+
+// Builds the Info for a*. This is handled exactly like a?, so the
+// work is delegated to Quest(), which also deletes a.
+Prefilter::Info* Prefilter::Info::Star(Info *a) {
+  Info* starred = Quest(a);
+  return starred;
+}
+
+// Builds the Info for a+. The result takes over a's prefilter and is
+// never exact, even if a was. a is deleted.
+Prefilter::Info* Prefilter::Info::Plus(Info *a) {
+  Info* repeated = new Info();
+  repeated->match_ = a->TakeMatch();
+  repeated->is_exact_ = false;
+  delete a;
+  return repeated;
+}
+
+// Encodes r with runetochar() and returns the resulting bytes as a string.
+static string RuneToString(Rune r) {
+  char encoded[UTFmax];
+  const int length = runetochar(encoded, &r);
+  return string(encoded, length);
+}
+
+// Builds the exact Info for a single literal rune. The rune is stored
+// in lower-cased form.
+Prefilter::Info* Prefilter::Info::Literal(Rune r) {
+  Info* literal = new Info();
+  literal->is_exact_ = true;
+  literal->exact_.insert(RuneToString(ToLowerRune(r)));
+  return literal;
+}
+
+// Builds the Info for dot (any character): inexact, with an ALL prefilter.
+Prefilter::Info* Prefilter::Info::AnyChar() {
+  Prefilter::Info* dot = new Prefilter::Info();
+  dot->match_ = new Prefilter(ALL);
+  return dot;
+}
+
+// Builds the Info for a regexp that can never match: inexact, with a
+// NONE prefilter.
+Prefilter::Info* Prefilter::Info::NoMatch() {
+  Prefilter::Info* nothing = new Prefilter::Info();
+  nothing->match_ = new Prefilter(NONE);
+  return nothing;
+}
+
+// Builds the Info that claims nothing about the matched strings
+// (an ALL prefilter). Because it makes no assertions whatsoever, it is
+// a valid answer for any regular expression.
+Prefilter::Info* Prefilter::Info::AnyMatch() {
+  Prefilter::Info* anything = new Prefilter::Info();
+  anything->match_ = new Prefilter(ALL);
+  return anything;
+}
+
+// Builds the exact Info whose only matching string is the empty string.
+Prefilter::Info* Prefilter::Info::EmptyString() {
+  Prefilter::Info* empty = new Prefilter::Info();
+  empty->exact_.insert("");
+  empty->is_exact_ = true;
+  return empty;
+}
+
+// Builds the Info for a character class. Classes with more than 10
+// members are over-approximated by AnyChar(); smaller ones become an
+// exact set holding the lower-cased form of every member rune.
+typedef CharClass::iterator CCIter;
+Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
+  if (Trace) {
+    VLOG(0) << "CharClassInfo:";
+    for (CCIter range = cc->begin(); range != cc->end(); ++range)
+      VLOG(0) << "  " << range->lo << "-" << range->hi;
+  }
+
+  // If the class is too large, it's okay to overestimate.
+  if (cc->size() > 10)
+    return AnyChar();
+
+  Prefilter::Info* members = new Prefilter::Info();
+  for (CCIter range = cc->begin(); range != cc->end(); ++range) {
+    for (Rune r = range->lo; r <= range->hi; r++) {
+      members->exact_.insert(RuneToString(ToLowerRune(r)));
+    }
+  }
+  members->is_exact_ = true;
+
+  if (Trace) {
+    VLOG(0) << " = " << members->ToString();
+  }
+
+  return members;
+}
+
+class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {  // Computes an Info per regexp node.
+ public:
+  Walker() {}
+
+  virtual Info* PostVisit(  // Combines the children's Infos into the Info for re.
+      Regexp* re, Info* parent_arg,
+      Info* pre_arg,
+      Info** child_args, int nchild_args);
+
+  virtual Info* ShortVisit(  // Returns the make-no-claims Info (AnyMatch).
+      Regexp* re,
+      Info* parent_arg);
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(Walker);
+};
+
+// Walks re with Info::Walker and returns the Info computed for the
+// whole expression. Returns NULL (after discarding any partial result)
+// when the walk reports that it stopped early.
+Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
+  if (Trace) {
+    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
+  }
+
+  Prefilter::Info::Walker walker;
+  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);
+  if (!walker.stopped_early())
+    return result;
+
+  delete result;
+  return NULL;
+}
+
+// Answers with AnyMatch(), which makes no claims about the matching
+// strings. Neither argument is used.
+Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
+    Regexp* re, Prefilter::Info* parent_arg) {
+  Prefilter::Info* no_claims = AnyMatch();
+  return no_claims;
+}
+
+// Constructs the Prefilter::Info for re from the Infos of its children
+// (child_args), which are consumed. Assumes re is simplified.
+Prefilter::Info* Prefilter::Info::Walker::PostVisit(
+    Regexp* re, Prefilter::Info* parent_arg,
+    Prefilter::Info* pre_arg, Prefilter::Info** child_args,
+    int nchild_args) {
+  Prefilter::Info *info;
+  switch (re->op()) {
+    default:
+    case kRegexpRepeat:  // unexpected here: logged, then treated as empty string
+      LOG(DFATAL) << "Bad regexp op " << re->op();
+      info = EmptyString();
+      break;
+
+    case kRegexpNoMatch:
+      info = NoMatch();
+      break;
+
+    // These ops match the empty string:
+    case kRegexpEmptyMatch:      // anywhere
+    case kRegexpBeginLine:       // at beginning of line
+    case kRegexpEndLine:         // at end of line
+    case kRegexpBeginText:       // at beginning of text
+    case kRegexpEndText:         // at end of text
+    case kRegexpWordBoundary:    // at word boundary
+    case kRegexpNoWordBoundary:  // not at word boundary
+      info = EmptyString();
+      break;
+
+    case kRegexpLiteral:
+      info = Literal(re->rune());
+      break;
+
+    case kRegexpLiteralString:
+      if (re->nrunes() == 0) {
+        info = NoMatch();
+        break;
+      }
+      info = Literal(re->runes()[0]);
+      for (int i = 1; i < re->nrunes(); i++)  // fold remaining runes in, left to right
+        info = Concat(info, Literal(re->runes()[i]));
+      break;
+
+    case kRegexpConcat: {
+      // Accumulate in info.
+      // exact is the concatenation of the most recent run of exact children.
+      info = NULL;
+      Info* exact = NULL;
+      for (int i = 0; i < nchild_args; i++) {
+        Info* ci = child_args[i];  // child info
+        if (!ci->is_exact() ||
+            (exact && ci->exact().size() * exact->exact().size() > 16)) {
+          // Exact run is over (child inexact, or cross product would exceed 16).
+          info = And(info, exact);
+          exact = NULL;
+          // Add this child's info.
+          info = And(info, ci);
+        } else {
+          // Append to exact run.
+          exact = Concat(exact, ci);
+        }
+      }
+      info = And(info, exact);  // flush the trailing exact run, if any
+    }
+      break;
+
+    case kRegexpAlternate:
+      info = child_args[0];
+      for (int i = 1; i < nchild_args; i++)
+        info = Alt(info, child_args[i]);
+      VLOG(10) << "Alt: " << info->ToString();
+      break;
+
+    case kRegexpStar:
+      info = Star(child_args[0]);
+      break;
+
+    case kRegexpQuest:
+      info = Quest(child_args[0]);
+      break;
+
+    case kRegexpPlus:
+      info = Plus(child_args[0]);
+      break;
+
+    case kRegexpAnyChar:
+      // Claim nothing, except that it's not empty.
+      info = AnyChar();
+      break;
+
+    case kRegexpCharClass:
+      info = CClass(re->cc());
+      break;
+
+    case kRegexpCapture:
+      // These don't affect the set of matching strings; pass the child's Info up.
+      info = child_args[0];
+      break;
+  }
+
+  if (Trace) {
+    VLOG(0) << "BuildInfo " << re->ToString()
+            << ": " << info->ToString();
+  }
+
+  return info;
+}
+
+
+// Builds the prefilter for re.
+//
+// The regexp is simplified first, an Info is computed for the simplified
+// form, and the Info's prefilter is handed to the caller. Returns NULL
+// when re is NULL, when simplification yields no regexp, or when no Info
+// could be built (BuildInfo returned NULL).
+Prefilter* Prefilter::FromRegexp(Regexp* re) {
+  if (re == NULL)
+    return NULL;
+
+  Regexp* simple = re->Simplify();
+  // Simplify() is defined elsewhere; guard against a NULL result so that
+  // neither BuildInfo() nor Decref() below dereferences a NULL pointer.
+  if (simple == NULL)
+    return NULL;
+
+  Prefilter::Info *info = BuildInfo(simple);
+
+  // The simplified regexp is only needed while building the Info.
+  simple->Decref();
+  if (info == NULL)
+    return NULL;
+
+  Prefilter* m = info->TakeMatch();
+
+  delete info;
+  return m;
+}
+
+// Returns a readable rendering of this prefilter: atoms print as
+// themselves, AND children are separated by spaces, and OR children are
+// wrapped in parentheses and separated by '|'.
+string Prefilter::DebugString() const {
+  // NOTE(review): testing `this` against NULL relies on undefined
+  // behaviour; kept because callers may depend on the "<nil>" output.
+  if (this == NULL)
+    return "<nil>";
+
+  switch (op_) {
+    case ALL:
+      return "";
+    case NONE:
+      return "*no-matches*";
+    case ATOM:
+      return atom_;
+    case AND:
+    case OR: {
+      const bool is_or = (op_ == OR);
+      const char* separator = is_or ? "|" : " ";
+      string rendered = is_or ? "(" : "";
+      for (int k = 0; k < subs_->size(); k++) {
+        if (k > 0)
+          rendered += separator;
+        rendered += (*subs_)[k]->DebugString();
+      }
+      if (is_or)
+        rendered += ")";
+      return rendered;
+    }
+    default:
+      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
+      return StringPrintf("op%d", op_);
+  }
+}
+
+// Builds the prefilter for the regexp held by re2. Returns NULL when
+// re2 is NULL or has no Regexp; otherwise returns FromRegexp()'s result.
+Prefilter* Prefilter::FromRE2(const RE2* re2) {
+  if (re2 == NULL)
+    return NULL;
+
+  Regexp* parsed = re2->Regexp();
+  if (parsed != NULL)
+    return FromRegexp(parsed);
+
+  return NULL;
+}
+
+
+}  // namespace re2
--- a/re2/re2/prefilter.h
+++ b/re2/re2/prefilter.h
@ -0,0 +1,105 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Prefilter is the class used to extract string guards from regexps.
+// Rather than using Prefilter class directly, use FilteredRE2.
+// See filtered_re2.h
+
+#ifndef RE2_PREFILTER_H_
+#define RE2_PREFILTER_H_
+
+#include "util/util.h"
+
+namespace re2 {
+
+class RE2;
+
+class Regexp;
+
+class Prefilter {
+  // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
+ public:
+  enum Op {
+    ALL = 0,  // Everything matches
+    NONE,  // Nothing matches
+    ATOM,  // The string atom() must match
+    AND,   // All in subs() must match
+    OR,   // One of subs() must match
+  };
+
+  explicit Prefilter(Op op);  // Creates a node of the given kind.
+  ~Prefilter();
+
+  Op op() { return op_; }  // Kind of this node.
+  const string& atom() const { return atom_; }  // String held by this node.
+  void set_unique_id(int id) { unique_id_ = id; }
+  int unique_id() const { return unique_id_; }
+
+  // The children of the Prefilter node. CHECK-fails unless op_ is AND or OR.
+  vector<Prefilter*>* subs() {
+    CHECK(op_ == AND || op_ == OR);
+    return subs_;
+  }
+
+  // Set the children vector. Prefilter takes ownership of subs and
+  // subs_ will be deleted when Prefilter is deleted.
+  void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
+
+  // Given a RE2, return a Prefilter. The caller takes ownership of
+  // the Prefilter and should deallocate it. Returns NULL if Prefilter
+  // cannot be formed.
+  static Prefilter* FromRE2(const RE2* re2);
+
+  // Returns a readable debug string of the prefilter.
+  string DebugString() const;
+
+ private:
+  class Info;  // Per-regexp bookkeeping used while building a Prefilter.
+
+  // Combines two prefilters together to create an AND. The passed
+  // Prefilters will be part of the returned Prefilter or deleted.
+  static Prefilter* And(Prefilter* a, Prefilter* b);
+
+  // Combines two prefilters together to create an OR. The passed
+  // Prefilters will be part of the returned Prefilter or deleted.
+  static Prefilter* Or(Prefilter* a, Prefilter* b);
+
+  // Generalized And/Or
+  static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
+
+  static Prefilter* FromRegexp(Regexp* a);  // Prefilter for a regexp; may be NULL.
+
+  static Prefilter* FromString(const string& str);  // New ATOM node for str.
+
+  static Prefilter* OrStrings(set<string>* ss);  // OR of the strings in *ss.
+
+  static Info* BuildInfo(Regexp* re);  // Walks re and computes its Info.
+
+  Prefilter* Simplify();
+
+  // Kind of Prefilter.
+  Op op_;
+
+  // Sub-matches for AND or OR Prefilter.
+  vector<Prefilter*>* subs_;
+
+  // Actual string to match in leaf node.
+  string atom_;
+
+  // If different prefilters have the same string atom, or if they are
+  // structurally the same (e.g., OR of same atom strings) they are
+  // considered the same unique nodes. This is the id for each unique
+  // node. This field is populated with a unique id for every node,
+  // and -1 for duplicate nodes.
+  int unique_id_;
+
+  // Used for debugging, helps in tracking memory leaks.
+  int alloc_id_;
+
+  DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
+};
+
+}  // namespace re2
+
+#endif  // RE2_PREFILTER_H_
--- a/re2/re2/prefilter_tree.cc
+++ b/re2/re2/prefilter_tree.cc
@ -0,0 +1,398 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/util.h"
+#include "util/flags.h"
+#include "re2/prefilter.h"
+#include "re2/prefilter_tree.h"
+#include "re2/re2.h"
+
+DEFINE_int32(filtered_re2_min_atom_len,  // KeepPart keeps an ATOM only if its size is >= this flag
+             3,
+             "Strings less than this length are not stored as atoms");
+
+namespace re2 {
+
+// A new tree starts out uncompiled.
+PrefilterTree::PrefilterTree() : compiled_(false) {}
+
+// Deletes every prefilter stored by Add() and the parent map of every
+// entry.
+PrefilterTree::~PrefilterTree() {
+  for (vector<Prefilter*>::iterator pf = prefilter_vec_.begin();
+       pf != prefilter_vec_.end(); ++pf) {
+    delete *pf;
+  }
+
+  for (vector<Entry>::iterator entry = entries_.begin();
+       entry != entries_.end(); ++entry) {
+    delete entry->parents;
+  }
+}
+
+// Functions used for adding and Compiling prefilters to the
+// PrefilterTree.
+
+// Decides whether prefilter is worth keeping as a guard, pruning it in
+// place where possible. Returns false for NULL, for ALL and NONE nodes,
+// and for atoms shorter than FLAGS_filtered_re2_min_atom_len. An AND
+// node deletes the children that are not kept and is kept while at least
+// one child survives; an OR node is kept only if every child is kept.
+// level is the depth of prefilter below the node passed to Add().
+static bool KeepPart(Prefilter* prefilter, int level) {
+  if (prefilter == NULL)
+    return false;
+
+  switch (prefilter->op()) {
+    default:
+      LOG(DFATAL) << "Unexpected op in KeepPart: "
+                  << prefilter->op();
+      return false;
+
+    case Prefilter::ALL:
+    case Prefilter::NONE:
+      // Neither kind carries an atom to guard on. NONE is a legitimate
+      // prefilter (e.g. for a regexp that cannot match), so drop it
+      // quietly instead of reporting it as an unexpected op.
+      return false;
+
+    case Prefilter::ATOM:
+      return prefilter->atom().size() >=
+          FLAGS_filtered_re2_min_atom_len;
+
+    case Prefilter::AND: {
+      // Compact the surviving children to the front of subs.
+      int j = 0;
+      vector<Prefilter*>* subs = prefilter->subs();
+      for (int i = 0; i < subs->size(); i++)
+        if (KeepPart((*subs)[i], level + 1))
+          (*subs)[j++] = (*subs)[i];
+        else
+          delete (*subs)[i];
+
+      subs->resize(j);
+      return j > 0;
+    }
+
+    case Prefilter::OR:
+      // One unusable alternative makes the whole OR unusable.
+      for (int i = 0; i < prefilter->subs()->size(); i++)
+        if (!KeepPart((*prefilter->subs())[i], level + 1))
+          return false;
+      return true;
+  }
+}
+
+// Registers the prefilter of the next regexp. A prefilter that KeepPart
+// rejects is deleted and recorded as NULL, so prefilter_vec_ still gets
+// one slot per call. Calls made after Compile() are logged and ignored.
+void PrefilterTree::Add(Prefilter *f) {
+  if (compiled_) {
+    LOG(DFATAL) << "Add after Compile.";
+    return;
+  }
+
+  Prefilter* kept = f;
+  if (kept != NULL && !KeepPart(kept, 0)) {
+    delete kept;
+    kept = NULL;
+  }
+  prefilter_vec_.push_back(kept);
+}
+
+// Assigns unique ids to all added prefilter nodes, fills atom_vec with
+// the atoms to search for, and then prunes triggers that fan out to too
+// many parents. A second call is logged and ignored.
+void PrefilterTree::Compile(vector<string>* atom_vec) {
+  if (compiled_) {
+    LOG(DFATAL) << "Compile after Compile.";
+    return;
+  }
+
+  // We do this check to support some legacy uses of
+  // PrefilterTree that call Compile before adding any regexps,
+  // and expect Compile not to have effect.
+  if (prefilter_vec_.empty())
+    return;
+
+  compiled_ = true;
+
+  AssignUniqueIds(atom_vec);
+
+  // Identify nodes that are too common among prefilters and are
+  // triggering too many parents. Then get rid of them if possible.
+  // Note that getting rid of a prefilter node simply means they are
+  // no longer necessary for their parent to trigger; that is, we do
+  // not miss out on any regexps triggering by getting rid of a
+  // prefilter node.
+  for (int e = 0; e < entries_.size(); e++) {
+    IntMap* parents = entries_[e].parents;
+    if (parents->size() <= 8)
+      continue;
+
+    // This one triggers too many things. If all the parents are AND
+    // nodes and have other things guarding them, then get rid of
+    // this trigger. TODO(vsri): Adjust the threshold appropriately,
+    // make it a function of total number of nodes?
+    bool all_parents_guarded = true;
+    for (IntMap::iterator p = parents->begin(); p != parents->end(); ++p) {
+      if (entries_[p->index()].propagate_up_at_count <= 1) {
+        all_parents_guarded = false;
+        break;
+      }
+    }
+    if (!all_parents_guarded)
+      continue;
+
+    // Each parent now needs one child fewer before it triggers.
+    for (IntMap::iterator p = parents->begin(); p != parents->end(); ++p)
+      entries_[p->index()].propagate_up_at_count -= 1;
+
+    parents->clear();  // Forget the parents
+  }
+
+  PrintDebugInfo();
+}
+
+// Looks up the node registered in node_map_ under node's NodeString.
+// Returns NULL when no node has been registered under that string.
+Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
+  map<string, Prefilter*>::iterator found = node_map_.find(NodeString(node));
+  if (found != node_map_.end())
+    return found->second;
+  return NULL;
+}
+
+// Returns the decimal representation of n.
+static string Itoa(int n) {
+  char digits[100];
+  snprintf(digits, sizeof digits, "%d", n);
+  string text(digits);
+  return text;
+}
+
+// Builds the key that identifies node: "<op>:" followed by the atom for
+// an ATOM node, or by the comma-separated unique ids of the children
+// otherwise.
+string PrefilterTree::NodeString(Prefilter* node) const {
+  // Adding the operation disambiguates AND/OR/atom nodes.
+  string key = Itoa(node->op()) + ":";
+  if (node->op() == Prefilter::ATOM) {
+    key += node->atom();
+    return key;
+  }
+
+  vector<Prefilter*>* children = node->subs();
+  for (int k = 0; k < children->size(); k++) {
+    if (k > 0)
+      key += ',';
+    key += Itoa((*children)[k]->unique_id());
+  }
+  return key;
+}
+
+void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {  // Numbers distinct nodes; fills entries_ and atom_vec.
+  atom_vec->clear();
+
+  // Build vector of all filter nodes, sorted topologically
+  // from top to bottom in v.
+  vector<Prefilter*> v;
+
+  // Add the top level nodes of each regexp prefilter.
+  for (int i = 0; i < prefilter_vec_.size(); i++) {
+    Prefilter* f = prefilter_vec_[i];
+    if (f == NULL)
+      unfiltered_.push_back(i);  // no prefilter: regexp i always passes
+
+    // We push NULL also on to v, so that we maintain the
+    // mapping of index==regexpid for level=0 prefilter nodes.
+    v.push_back(f);
+  }
+
+  // Now add all the descendant nodes (v grows while it is scanned).
+  for (int i = 0; i < v.size(); i++) {
+    Prefilter* f = v[i];
+    if (f == NULL)
+      continue;
+    if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
+      const vector<Prefilter*>& subs = *f->subs();
+      for (int j = 0; j < subs.size(); j++)
+        v.push_back(subs[j]);
+    }
+  }
+
+  // Identify unique nodes, walking v backwards so children come before parents.
+  int unique_id = 0;
+  for (int i = v.size() - 1; i >= 0; i--) {
+    Prefilter *node = v[i];
+    if (node == NULL)
+      continue;
+    node->set_unique_id(-1);
+    Prefilter* canonical = CanonicalNode(node);
+    if (canonical == NULL) {
+      // Any further nodes that have the same node string
+      // will find this node as the canonical node.
+      node_map_[NodeString(node)] = node;
+      if (node->op() == Prefilter::ATOM) {
+        atom_vec->push_back(node->atom());
+        atom_index_to_id_.push_back(unique_id);  // atom_vec index -> entry id
+      }
+      node->set_unique_id(unique_id++);
+    } else {
+      node->set_unique_id(canonical->unique_id());  // duplicate shares the canonical id
+    }
+  }
+  entries_.resize(node_map_.size());  // one Entry per unique node
+
+  // Create parent IntMap for the entries.
+  for (int i = v.size()  - 1; i >= 0; i--) {
+    Prefilter* prefilter = v[i];
+    if (prefilter == NULL)
+      continue;
+
+    if (CanonicalNode(prefilter) != prefilter)  // only canonical nodes get an Entry
+      continue;
+
+    Entry* entry = &entries_[prefilter->unique_id()];
+    entry->parents = new IntMap(node_map_.size());
+  }
+
+  // Fill the entries.
+  for (int i = v.size()  - 1; i >= 0; i--) {
+    Prefilter* prefilter = v[i];
+    if (prefilter == NULL)
+      continue;
+
+    if (CanonicalNode(prefilter) != prefilter)
+      continue;
+
+    Entry* entry = &entries_[prefilter->unique_id()];
+
+    switch (prefilter->op()) {
+      default:
+      case Prefilter::ALL:
+        LOG(DFATAL) << "Unexpected op: " << prefilter->op();
+        return;
+
+      case Prefilter::ATOM:
+        entry->propagate_up_at_count = 1;
+        break;
+
+      case Prefilter::OR:
+      case Prefilter::AND: {
+        IntMap uniq_child(node_map_.size());  // distinct child ids of this node
+        for (int j = 0; j < prefilter->subs()->size() ; j++) {
+          Prefilter* child = (*prefilter->subs())[j];
+          Prefilter* canonical = CanonicalNode(child);
+          if (canonical == NULL) {
+            LOG(DFATAL) << "Null canonical node";
+            return;
+          }
+          int child_id = canonical->unique_id();
+          if (!uniq_child.has_index(child_id))
+            uniq_child.set_new(child_id, 1);
+          // To the child, we want to add to parent indices.
+          Entry* child_entry = &entries_[child_id];
+          if (!child_entry->parents->has_index(prefilter->unique_id()))
+            child_entry->parents->set_new(prefilter->unique_id(), 1);
+        }
+        entry->propagate_up_at_count =  // AND waits for all distinct children, OR for one
+            prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
+
+        break;
+      }
+    }
+  }
+
+  // For top level nodes, populate regexp id.
+  for (int i = 0; i < prefilter_vec_.size(); i++) {
+    if (prefilter_vec_[i] == NULL)
+      continue;
+    int id = CanonicalNode(prefilter_vec_[i])->unique_id();
+    DCHECK_LE(0, id);
+    Entry* entry = &entries_[id];
+    entry->regexps.push_back(i);
+  }
+}
+
+// Functions for triggering during search.
+
+// Fills *regexps with the sorted indices of the regexps that still have
+// to be run, given the indices (into the atom vector returned by
+// Compile) of the atoms that matched. Regexps without a prefilter are
+// always included. If Compile() has not taken effect, every regexp is
+// returned. Atom indices outside the known range are reported and
+// skipped instead of being used to index atom_index_to_id_.
+void PrefilterTree::RegexpsGivenStrings(
+    const vector<int>& matched_atoms,
+    vector<int>* regexps) const {
+  regexps->clear();
+  if (!compiled_) {
+    LOG(WARNING) << "Compile() not called";
+    for (int i = 0; i < prefilter_vec_.size(); ++i)
+      regexps->push_back(i);
+  } else {
+    if (!prefilter_vec_.empty()) {
+      IntMap regexps_map(prefilter_vec_.size());
+      vector<int> matched_atom_ids;
+      const int num_atoms = static_cast<int>(atom_index_to_id_.size());
+      for (int j = 0; j < matched_atoms.size(); j++) {
+        const int atom_index = matched_atoms[j];
+        // The indices come from the caller; never trust them blindly.
+        if (atom_index < 0 || atom_index >= num_atoms) {
+          LOG(DFATAL) << "Atom index out of range: " << atom_index;
+          continue;
+        }
+        matched_atom_ids.push_back(atom_index_to_id_[atom_index]);
+        VLOG(10) << "Atom id:" << atom_index_to_id_[atom_index];
+      }
+      PropagateMatch(matched_atom_ids, &regexps_map);
+      for (IntMap::iterator it = regexps_map.begin();
+           it != regexps_map.end();
+           ++it)
+        regexps->push_back(it->index());
+
+      // Regexps with no prefilter can never be ruled out.
+      regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
+    }
+  }
+  sort(regexps->begin(), regexps->end());
+}
+
+// Starting from the entries of the matched atoms, follows the parent
+// links upwards and records in *regexps every regexp id attached to an
+// entry that gets triggered.
+void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
+                                   IntMap* regexps) const {
+  IntMap count(entries_.size());  // children seen so far, per delayed parent
+  IntMap work(entries_.size());   // triggered entries; grows during the scan
+  for (int a = 0; a < atom_ids.size(); a++)
+    work.set(atom_ids[a], 1);
+
+  for (IntMap::iterator cur = work.begin(); cur != work.end(); ++cur) {
+    const Entry& entry = entries_[cur->index()];
+    VLOG(10) << "Processing: " << cur->index();
+
+    // Record regexps triggered.
+    for (int r = 0; r < entry.regexps.size(); r++) {
+      VLOG(10) << "Regexp triggered: " << entry.regexps[r];
+      regexps->set(entry.regexps[r], 1);
+    }
+
+    // Pass trigger up to parents.
+    for (IntMap::iterator pit = entry.parents->begin();
+         pit != entry.parents->end();
+         ++pit) {
+      const int j = pit->index();
+      const Entry& parent = entries_[j];
+      VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
+
+      // Delay until all the children have succeeded.
+      if (parent.propagate_up_at_count > 1) {
+        int seen = 1;
+        if (count.has_index(j)) {
+          seen = count.get_existing(j) + 1;
+          count.set_existing(j, seen);
+        } else {
+          count.set_new(j, seen);
+        }
+        if (seen < parent.propagate_up_at_count)
+          continue;
+      }
+
+      VLOG(10) << "Triggering: " << j;
+      // Trigger the parent.
+      work.set(j, 1);
+    }
+  }
+}
+
+// Debugging help.
+
+// Logs the readable form of the prefilter stored for regexp regexpid.
+void PrefilterTree::PrintPrefilter(int regexpid) {
+  Prefilter* root = prefilter_vec_[regexpid];
+  LOG(INFO) << DebugNodeString(root);
+}
+
+// Dumps, at verbose level 10, the atom and node counts, each entry with
+// its parent ids, and the node-string to unique-id map.
+void PrefilterTree::PrintDebugInfo() {
+  VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
+  VLOG(10) << "#Unique Nodes: " << entries_.size();
+
+  for (int e = 0; e < entries_.size(); ++e) {
+    const Entry& entry = entries_[e];
+    VLOG(10) << "EntryId: " << e
+            << " N: " << entry.parents->size() << " R: " << entry.regexps.size();
+    for (IntMap::iterator p = entry.parents->begin();
+         p != entry.parents->end(); ++p) {
+      VLOG(10) << p->index();
+    }
+  }
+
+  VLOG(10) << "Map:";
+  map<string, Prefilter*>::const_iterator node = node_map_.begin();
+  for (; node != node_map_.end(); ++node) {
+    VLOG(10) << "NodeId: " << node->second->unique_id()
+            << " Str: " << node->first;
+  }
+}
+
+// Recursively renders node for debugging: an atom prints as itself, and
+// any other node prints as AND(...) or OR(...) with each child shown as
+// "<unique id>:<child rendering>".
+string PrefilterTree::DebugNodeString(Prefilter* node) const {
+  if (node->op() == Prefilter::ATOM) {
+    DCHECK(!node->atom().empty());
+    return node->atom();
+  }
+
+  // Adding the operation disambiguates AND and OR nodes.
+  string rendered = node->op() == Prefilter::AND ? "AND" : "OR";
+  rendered += "(";
+  vector<Prefilter*>* children = node->subs();
+  for (int k = 0; k < children->size(); k++) {
+    Prefilter* child = (*children)[k];
+    if (k > 0)
+      rendered += ',';
+    rendered += Itoa(child->unique_id());
+    rendered += ":";
+    rendered += DebugNodeString(child);
+  }
+  rendered += ")";
+  return rendered;
+}
+
+}  // namespace re2
--- a/re2/re2/prefilter_tree.h
+++ b/re2/re2/prefilter_tree.h
@ -0,0 +1,130 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The PrefilterTree class is used to form an AND-OR tree of strings
+// that would trigger each regexp. The 'prefilter' of each regexp is
+// added to PrefilterTree, and then PrefilterTree is used to find all
+// the unique strings across the prefilters. During search, by using
+// matches from a string matching engine, PrefilterTree deduces the
+// set of regexps that are to be triggered. The 'string matching
+// engine' itself is outside of this class, and the caller can use any
+// favorite engine. PrefilterTree provides a set of strings (called
+// atoms) that the user of this class should use to do the string
+// matching.
+//
+#ifndef RE2_PREFILTER_TREE_H_
+#define RE2_PREFILTER_TREE_H_
+
+#include "util/util.h"
+#include "util/sparse_array.h"
+
+namespace re2 {
+
+typedef SparseArray<int> IntMap;
+
+class Prefilter;
+
+class PrefilterTree {
+ public:
+  PrefilterTree();
+  ~PrefilterTree();
+
+  // Adds the prefilter for the next regexp. Note that we assume that
+  // Add is called sequentially for all regexps. All Add calls
+  // must precede Compile.
+  void Add(Prefilter* prefilter);
+
+  // The Compile returns a vector of string in atom_vec.
+  // Call this after all the prefilters are added through Add.
+  // No calls to Add after Compile are allowed.
+  // The caller should use the returned set of strings to do string matching.
+  // Each time a string matches, the corresponding index has to be recorded
+  // and passed to RegexpsGivenStrings below.
+  void Compile(vector<string>* atom_vec);
+
+  // Given the indices of the atoms that matched, returns the indexes
+  // of regexps that should be searched.  The matched_atoms should
+  // contain all the ids of string atoms that were found to match the
+  // content. The caller can use any string match engine to perform
+  // this function. This function is thread safe.
+  void RegexpsGivenStrings(const vector<int>& matched_atoms,
+                           vector<int>* regexps) const;
+
+  // Print debug prefilter. Also prints unique ids associated with
+  // nodes of the prefilter of the regexp.
+  void PrintPrefilter(int regexpid);
+
+
+  // Each unique node has a corresponding Entry that helps in
+  // passing the matching trigger information along the tree.
+  struct Entry {
+   public:
+    // How many children should match before this node triggers the
+    // parent. For an atom and an OR node, this is 1 and for an AND
+    // node, it is the number of unique children.
+    int propagate_up_at_count;
+
+    // When this node is ready to trigger the parent, what are the indices
+    // of the parent nodes to trigger. The reason there may be more than
+    // one is because of sharing. For example (abc | def) and (xyz | def)
+    // are two different nodes, but they share the atom 'def'. So when
+    // 'def' matches, it triggers two parents, corresponding to the two
+    // different OR nodes.
+    IntMap* parents;
+
+    // When this node is ready to trigger the parent, what are the
+    // regexps that are triggered.
+    vector<int> regexps;
+  };
+
+ private:
+  // This function assigns unique ids to various parts of the
+  // prefilter, by looking at whether these nodes are already in the
+  // PrefilterTree.
+  void AssignUniqueIds(vector<string>* atom_vec);
+
+  // Given the matching atoms, find the regexps to be triggered.
+  void PropagateMatch(const vector<int>& atom_ids,
+                      IntMap* regexps) const;
+
+  // Returns the prefilter node that has the same NodeString as this
+  // node. For the canonical node, returns node.
+  Prefilter* CanonicalNode(Prefilter* node);
+
+  // A string that uniquely identifies the node. Assumes that the
+  // children of node have already been assigned unique ids.
+  string NodeString(Prefilter* node) const;
+
+  // Recursively constructs a readable prefilter string.
+  string DebugNodeString(Prefilter* node) const;
+
+  // Used for debugging.
+  void PrintDebugInfo();
+
+  // These are all the nodes formed by Compile. Essentially, there is
+  // one node for each unique atom and each unique AND/OR node.
+  vector<Entry> entries_;
+
+  // Map node string to canonical Prefilter node.
+  map<string, Prefilter*> node_map_;
+
+  // Indices of regexps that always pass through the filter (since we
+  // found no required literals in these regexps).
+  vector<int> unfiltered_;
+
+  // Vector of Prefilter for all regexps; index is the regexp id.
+  vector<Prefilter*> prefilter_vec_;
+
+  // Atom index in returned strings to entry id mapping.
+  vector<int> atom_index_to_id_;
+
+  // Has the prefilter tree been compiled.
+  bool compiled_;
+
+  DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
+};
+
+}  // namespace
+
+#endif  // RE2_PREFILTER_TREE_H_
--- a/re2/re2/prog.cc
+++ b/re2/re2/prog.cc
@ -0,0 +1,341 @@
+// Copyright 2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compiled regular expression representation.
+// Tested by compile_test.cc
+
+#include "util/util.h"
+#include "util/sparse_set.h"
+#include "re2/prog.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Constructors per Inst opcode
+
+// Turns this instruction into an Alt whose two successors are out and out1.
+// DCHECKs that out_opcode_ is still zero.
+void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
+  DCHECK_EQ(out_opcode_, 0);
+  out1_ = out1;
+  set_out_opcode(out, kInstAlt);
+}
+
+// Turns this instruction into a ByteRange over [lo, hi] that continues at
+// out.  Only the low 8 bits of lo and hi are kept.
+// DCHECKs that out_opcode_ is still zero.
+void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  foldcase_ = foldcase;
+  hi_ = hi & 0xFF;
+  lo_ = lo & 0xFF;
+  set_out_opcode(out, kInstByteRange);
+}
+
+// Turns this instruction into a Capture that records into register cap
+// and then continues at out.  DCHECKs that out_opcode_ is still zero.
+void Prog::Inst::InitCapture(int cap, uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  cap_ = cap;
+  set_out_opcode(out, kInstCapture);
+}
+
+// Turns this instruction into an EmptyWidth check for the kEmpty* bits in
+// empty, continuing at out.  DCHECKs that out_opcode_ is still zero.
+void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  empty_ = empty;
+  set_out_opcode(out, kInstEmptyWidth);
+}
+
+// Turns this instruction into a Match tagged with id.
+// DCHECKs that out_opcode_ is still zero.
+void Prog::Inst::InitMatch(int32 id) {
+  DCHECK_EQ(out_opcode_, 0);
+  match_id_ = id;
+  set_opcode(kInstMatch);
+}
+
+// Turns this instruction into a no-op that continues at out.
+// DCHECKs that out_opcode_ is still zero.
+void Prog::Inst::InitNop(uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  // Store the successor together with the opcode.  The previous code called
+  // set_opcode() only, which silently dropped out; the result is unchanged
+  // for callers that pass 0.
+  set_out_opcode(out, kInstNop);
+}
+
+// Turns this instruction into a Fail.  The out field is left as it is,
+// which the DCHECK below expects to be zero.
+void Prog::Inst::InitFail() {
+  DCHECK_EQ(out_opcode_, 0);
+  set_opcode(kInstFail);
+}
+
+// Returns a one-line, human-readable description of this instruction.
+// Opcodes without a dedicated format are printed as "opcode N".
+string Prog::Inst::Dump() {
+  const InstOp op = opcode();
+  switch (op) {
+    case kInstAlt:
+      return StringPrintf("alt -> %d | %d", out(), out1_);
+
+    case kInstAltMatch:
+      return StringPrintf("altmatch -> %d | %d", out(), out1_);
+
+    case kInstByteRange: {
+      const char* fold_suffix = foldcase_ ? "/i" : "";
+      return StringPrintf("byte%s [%02x-%02x] -> %d",
+                          fold_suffix, lo_, hi_, out());
+    }
+
+    case kInstCapture:
+      return StringPrintf("capture %d -> %d", cap_, out());
+
+    case kInstEmptyWidth:
+      return StringPrintf("emptywidth %#x -> %d",
+                          static_cast<int>(empty_), out());
+
+    case kInstMatch:
+      return StringPrintf("match! %d", match_id());
+
+    case kInstNop:
+      return StringPrintf("nop -> %d", out());
+
+    case kInstFail:
+      return StringPrintf("fail");
+
+    default:
+      break;
+  }
+  return StringPrintf("opcode %d", static_cast<int>(op));
+}
+
+// Constructs an empty program: every flag is false, every counter and
+// entry point is zero, and every pointer is NULL.  The arrays inst_,
+// unbytemap_ and onepass_nodes_ are released by the destructor.
+Prog::Prog()
+  : anchor_start_(false),
+    anchor_end_(false),
+    reversed_(false),
+    did_onepass_(false),
+    start_(0),
+    start_unanchored_(0),
+    size_(0),
+    byte_inst_count_(0),
+    bytemap_range_(0),
+    flags_(0),
+    onepass_statesize_(0),
+    inst_(NULL),
+    dfa_first_(NULL),
+    dfa_longest_(NULL),
+    dfa_mem_(0),
+    delete_dfa_(NULL),
+    unbytemap_(NULL),
+    onepass_nodes_(NULL),
+    onepass_start_(NULL) {
+}
+
+// Destroys the cached DFAs through the delete_dfa_ callback (when one was
+// installed) and frees the arrays owned by the program.
+Prog::~Prog() {
+  if (delete_dfa_ != NULL) {
+    if (dfa_first_ != NULL)
+      delete_dfa_(dfa_first_);
+    if (dfa_longest_ != NULL)
+      delete_dfa_(dfa_longest_);
+  }
+  delete[] onepass_nodes_;
+  delete[] inst_;
+  delete[] unbytemap_;
+}
+
+typedef SparseSet Workq;
+
+// Adds instruction id to the work queue; id 0 is never queued.
+static inline void AddToQueue(Workq* q, int id) {
+  if (id == 0)
+    return;
+  q->insert(id);
+}
+
+// Renders the instructions reachable from the ids already in q, one per
+// line as "<id>. <dump>".  Successors are appended to q while it is being
+// walked, so the loop bound is re-read on every iteration.
+// NOTE(review): presumably Workq (SparseSet) ignores ids it already holds,
+// which is what stops loops in the program from repeating; confirm in
+// util/sparse_set.h.
+static string ProgToString(Prog* prog, Workq* q) {
+  string text;
+
+  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
+    const int id = *it;
+    Prog::Inst* inst = prog->inst(id);
+    StringAppendF(&text, "%d. %s\n", id, inst->Dump().c_str());
+    AddToQueue(q, inst->out());
+    const InstOp op = inst->opcode();
+    if (op == kInstAlt || op == kInstAltMatch)
+      AddToQueue(q, inst->out1());
+  }
+  return text;
+}
+
+// Returns a listing of the instructions reachable from start_.  The byte
+// map section is compiled in but disabled.
+string Prog::Dump() {
+  string bytemap_text;
+  if (false) {  // Debugging
+    StringAppendF(&bytemap_text, "byte map:\n");
+    int first = 0;
+    for (int c = 0; c < bytemap_range_; c++) {
+      const int last = unbytemap_[c];
+      StringAppendF(&bytemap_text, "\t%d. [%02x-%02x]\n", c, first, last);
+      first = last + 1;
+    }
+    StringAppendF(&bytemap_text, "\n");
+  }
+
+  Workq reachable(size_);
+  AddToQueue(&reachable, start_);
+  return bytemap_text + ProgToString(this, &reachable);
+}
+
+// Same listing as Dump(), but starting from the unanchored entry point.
+string Prog::DumpUnanchored() {
+  Workq reachable(size_);
+  AddToQueue(&reachable, start_unanchored_);
+  return ProgToString(this, &reachable);
+}
+
+static bool IsMatch(Prog*, Prog::Inst*);
+
+// Peep-hole optimizer.
+// Runs two passes over the instructions reachable from start_:
+//   1. redirect every out/out1 edge past chains of Nop instructions;
+//   2. rewrite qualifying Alt instructions to AltMatch.
+// Both passes use the work queue as a worklist that grows while it is
+// being iterated.
+void Prog::Optimize() {
+  Workq q(size_);
+
+  // Eliminate nops.  Most are taken out during compilation
+  // but a few are hard to avoid.
+  q.clear();
+  AddToQueue(&q, start_);
+  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
+    int id = *i;
+
+    Inst* ip = inst(id);
+    int j = ip->out();
+    Inst* jp;
+    // Follow the out edge through consecutive Nops; stop at id 0 or at the
+    // first instruction that is not a Nop.
+    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+      j = jp->out();
+    }
+    ip->set_out(j);
+    AddToQueue(&q, ip->out());
+
+    // Alt has a second successor that gets the same treatment.
+    if (ip->opcode() == kInstAlt) {
+      j = ip->out1();
+      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+        j = jp->out();
+      }
+      ip->out1_ = j;
+      AddToQueue(&q, ip->out1());
+    }
+  }
+
+  // Insert kInstAltMatch instructions
+  // Look for
+  //   ip: Alt -> j | k
+  //    j: ByteRange [00-FF] -> ip
+  //    k: Match
+  // or the reverse (the above is the greedy one).
+  // Rewrite Alt to AltMatch.
+  q.clear();
+  AddToQueue(&q, start_);
+  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
+    int id = *i;
+    Inst* ip = inst(id);
+    // Queue the successors before the opcode is possibly rewritten below.
+    AddToQueue(&q, ip->out());
+    if (ip->opcode() == kInstAlt)
+      AddToQueue(&q, ip->out1());
+
+    if (ip->opcode() == kInstAlt) {
+      Inst* j = inst(ip->out());
+      Inst* k = inst(ip->out1());
+      // Greedy form: out loops back through an any-byte range, out1 matches.
+      if (j->opcode() == kInstByteRange && j->out() == id &&
+          j->lo() == 0x00 && j->hi() == 0xFF &&
+          IsMatch(this, k)) {
+        ip->set_opcode(kInstAltMatch);
+        continue;
+      }
+      // Reverse form: out matches, out1 loops back through the byte range.
+      if (IsMatch(this, j) &&
+          k->opcode() == kInstByteRange && k->out() == id &&
+          k->lo() == 0x00 && k->hi() == 0xFF) {
+        ip->set_opcode(kInstAltMatch);
+      }
+    }
+  }
+}
+
+// Is ip a guaranteed match at end of text, perhaps after some capturing?
+// Steps over Capture and Nop instructions, then reports whether the
+// instruction reached is a Match.
+static bool IsMatch(Prog* prog, Prog::Inst* ip) {
+  while (ip->opcode() == kInstCapture || ip->opcode() == kInstNop)
+    ip = prog->inst(ip->out());
+
+  switch (ip->opcode()) {
+    case kInstMatch:
+      return true;
+
+    case kInstAlt:
+    case kInstAltMatch:
+    case kInstByteRange:
+    case kInstFail:
+    case kInstEmptyWidth:
+      return false;
+
+    default:
+      LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
+      return false;
+  }
+}
+
+// Computes the kEmpty* flags that hold at position p inside text.
+// Exactly one of kEmptyWordBoundary / kEmptyNonWordBoundary is always set.
+uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
+  const bool at_begin = (p == text.begin());
+  const bool at_end = (p == text.end());
+  int flags = 0;
+
+  // ^ and \A
+  if (at_begin)
+    flags |= kEmptyBeginText | kEmptyBeginLine;
+  else if (p[-1] == '\n')
+    flags |= kEmptyBeginLine;
+
+  // $ and \z
+  if (at_end)
+    flags |= kEmptyEndText | kEmptyEndLine;
+  else if (p < text.end() && p[0] == '\n')
+    flags |= kEmptyEndLine;
+
+  // \b and \B
+  bool boundary;
+  if (at_begin && at_end) {
+    // Empty text: no word boundary here.
+    boundary = false;
+  } else if (at_begin) {
+    boundary = IsWordChar(p[0]);
+  } else if (at_end) {
+    boundary = IsWordChar(p[-1]);
+  } else {
+    boundary = IsWordChar(p[-1]) != IsWordChar(p[0]);
+  }
+  if (boundary)
+    flags |= kEmptyWordBoundary;
+  else
+    flags |= kEmptyNonWordBoundary;
+
+  return flags;
+}
+
+// Records [lo, hi] as a byte range the program distinguishes from its
+// neighbours.  Both bounds must lie in [0, 255].
+void Prog::MarkByteRange(int lo, int hi) {
+  CHECK_GE(lo, 0);
+  CHECK_GE(hi, 0);
+  CHECK_LE(lo, 255);
+  CHECK_LE(hi, 255);
+  // hi is the last byte of the marked range ...
+  byterange_.Set(hi);
+  // ... and lo - 1, when it exists, is the last byte before the range.
+  if (lo >= 1)
+    byterange_.Set(lo - 1);
+}
+
+// Builds bytemap_ (byte -> class), bytemap_range_ (number of classes) and
+// unbytemap_ (class -> last byte in that class) from byterange().
+// May be called more than once; the previous unbytemap_ table is freed.
+void Prog::ComputeByteMap() {
+  // Fill in bytemap with byte classes for prog_.
+  // Ranges of bytes that are treated as indistinguishable
+  // by the regexp program are mapped to a single byte class.
+  // The vector prog_->byterange() marks the end of each
+  // such range.
+  const Bitmap<256>& v = byterange();
+
+  COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
+  uint8 n = 0;
+  uint32 bits = 0;
+  for (int i = 0; i < 256; i++) {
+    // Load the next 32-bit word of the bitmap every 32 bytes.
+    if ((i&31) == 0)
+      bits = v.Word(i >> 5);
+    bytemap_[i] = n;
+    // A set bit ends the current class, so the next byte starts a new one.
+    n += bits & 1;
+    bits >>= 1;
+  }
+  bytemap_range_ = bytemap_[255] + 1;
+
+  // Free any table left over from an earlier call; unbytemap_ starts out
+  // NULL, and delete[] on a null pointer does nothing.
+  delete[] unbytemap_;
+  unbytemap_ = new uint8[bytemap_range_];
+  // Later bytes overwrite earlier ones, leaving the last byte of each class.
+  for (int i = 0; i < 256; i++)
+    unbytemap_[bytemap_[i]] = i;
+
+  if (0) {  // For debugging: use trivial byte map.
+    // The trivial map has 256 classes, so the table above (sized for
+    // bytemap_range_ entries) has to be replaced before it is filled.
+    delete[] unbytemap_;
+    unbytemap_ = new uint8[256];
+    for (int i = 0; i < 256; i++) {
+      bytemap_[i] = i;
+      unbytemap_[i] = i;
+    }
+    bytemap_range_ = 256;
+    LOG(INFO) << "Using trivial bytemap.";
+  }
+}
+
+}  // namespace re2
+
--- a/re2/re2/prog.h
+++ b/re2/re2/prog.h
@ -0,0 +1,376 @@
+// Copyright 2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compiled representation of regular expressions.
+// See regexp.h for the Regexp class, which represents a regular
+// expression symbolically.
+
+#ifndef RE2_PROG_H__
+#define RE2_PROG_H__
+
+#include "util/util.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+// Simple fixed-size bitmap.
+// Holds Bits bits packed into 32-bit words, all cleared on construction.
+// Bit indices are not range-checked.
+template<int Bits>
+class Bitmap {
+ public:
+  Bitmap() { Reset(); }
+
+  // Number of bits in the bitmap.
+  int Size() const { return Bits; }
+
+  // Clears every bit.
+  void Reset() {
+    for (int i = 0; i < Words; i++)
+      w_[i] = 0;
+  }
+
+  // Returns whether bit k is set.
+  bool Get(int k) const {
+    return (w_[k >> WordLog] & Mask(k)) != 0;
+  }
+
+  // Sets bit k.
+  void Set(int k) {
+    w_[k >> WordLog] |= Mask(k);
+  }
+
+  // Clears bit k.
+  void Clear(int k) {
+    w_[k >> WordLog] &= ~Mask(k);
+  }
+
+  // Returns the i'th 32-bit word; bit k lives in word k/32 at position k%32.
+  uint32 Word(int i) const {
+    return w_[i];
+  }
+
+ private:
+  // Single-bit mask for bit k within its word.  Built from an unsigned 1 so
+  // that position 31 does not shift into the sign bit of a signed int.
+  static uint32 Mask(int k) {
+    return static_cast<uint32>(1) << (k & 31);
+  }
+
+  static const int WordLog = 5;
+  static const int Words = (Bits+31)/32;
+  uint32 w_[Words];
+  DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
+};
+
+
+// Opcodes for Inst
+// Inst::opcode() recovers the opcode from the low 3 bits of out_opcode_,
+// so there is room for at most 8 values here.
+enum InstOp {
+  kInstAlt = 0,      // choose between out_ and out1_
+  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
+  kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
+  kInstCapture,      // capturing parenthesis number cap_
+  kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
+  kInstMatch,        // found a match!
+  kInstNop,          // no-op; occasionally unavoidable
+  kInstFail,         // never match; occasionally unavoidable
+};
+
+// Bit flags for empty-width specials
+// Each flag occupies its own bit so that several can be OR'ed together.
+enum EmptyOp {
+  kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
+  kEmptyEndLine          = 1<<1,      // $ - end of line
+  kEmptyBeginText        = 1<<2,      // \A - beginning of text
+  kEmptyEndText          = 1<<3,      // \z - end of text
+  kEmptyWordBoundary     = 1<<4,      // \b - word boundary
+  kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
+  kEmptyAllFlags         = (1<<6)-1,  // mask with all six flags above set
+};
+
+class Regexp;
+
+class DFA;
+struct OneState;
+
+// Compiled form of regexp program.
+class Prog {
+ public:
+  Prog();
+  ~Prog();
+
+  // Single instruction in regexp program.
+  // The opcode and the primary successor are packed into out_opcode_; the
+  // remaining operand lives in a union whose active member depends on the
+  // opcode.
+  class Inst {
+   public:
+    // Starts out with out_opcode_ == 0 and out1_ == 0.
+    Inst() : out_opcode_(0), out1_(0) { }
+
+    // Constructors per opcode
+    void InitAlt(uint32 out, uint32 out1);
+    void InitByteRange(int lo, int hi, int foldcase, uint32 out);
+    void InitCapture(int cap, uint32 out);
+    void InitEmptyWidth(EmptyOp empty, uint32 out);
+    void InitMatch(int id);
+    void InitNop(uint32 out);
+    void InitFail();
+
+    // Getters
+    // The operand getters DCHECK that the opcode matches the operand read.
+    int id(Prog* p) { return this - p->inst_; }
+    InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
+    int out()     { return out_opcode_>>3; }
+    int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
+    int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
+    int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
+    int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
+    int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
+    int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
+    EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
+    // For an AltMatch: true when the out branch is the ByteRange loop.
+    bool greedy(Prog *p) {
+      DCHECK_EQ(opcode(), kInstAltMatch);
+      return p->inst(out())->opcode() == kInstByteRange;
+    }
+
+    // Does this inst (a kInstByteRange) match c?
+    // With foldcase_ set, 'A'-'Z' are lowered before the range test.
+    inline bool Matches(int c) {
+      DCHECK_EQ(opcode(), kInstByteRange);
+      if (foldcase_ && 'A' <= c && c <= 'Z')
+        c += 'a' - 'A';
+      return lo_ <= c && c <= hi_;
+    }
+
+    // Returns string representation for debugging.
+    string Dump();
+
+    // Maximum instruction id.
+    // (Must fit in out_opcode_, and PatchList steals another bit.)
+    static const int kMaxInst = (1<<28) - 1;
+
+   private:
+    // Replaces the opcode bits, keeping the current out.
+    void set_opcode(InstOp opcode) {
+      out_opcode_ = (out()<<3) | opcode;
+    }
+
+    // Replaces out, keeping the current opcode bits.
+    void set_out(int out) {
+      out_opcode_ = (out<<3) | opcode();
+    }
+
+    // Replaces both out and the opcode in one store.
+    void set_out_opcode(int out, InstOp opcode) {
+      out_opcode_ = (out<<3) | opcode;
+    }
+
+    uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
+    union {              // additional instruction arguments:
+      uint32 out1_;      // opcode == kInstAlt
+                         //   alternate next instruction
+
+      int32 cap_;        // opcode == kInstCapture
+                         //   Index of capture register (holds text
+                         //   position recorded by capturing parentheses).
+                         //   For \n (the submatch for the nth parentheses),
+                         //   the left parenthesis captures into register 2*n
+                         //   and the right one captures into register 2*n+1.
+
+      int32 match_id_;   // opcode == kInstMatch
+                         //   Match ID to identify this match (for re2::Set).
+
+      struct {           // opcode == kInstByteRange
+        uint8 lo_;       //   byte range is lo_-hi_ inclusive
+        uint8 hi_;       //
+        uint8 foldcase_; //   convert A-Z to a-z before checking range.
+      };
+
+      EmptyOp empty_;    // opcode == kInstEmptyWidth
+                         //   empty_ is bitwise OR of kEmpty* flags above.
+    };
+
+    friend class Compiler;
+    friend struct PatchList;
+    friend class Prog;
+
+    DISALLOW_EVIL_CONSTRUCTORS(Inst);
+  };
+
+  // Whether to anchor the search.
+  enum Anchor {
+    kUnanchored,  // match anywhere
+    kAnchored,    // match only starting at beginning of text
+  };
+
+  // Kind of match to look for (for anchor != kFullMatch)
+  //
+  // kLongestMatch mode finds the overall longest
+  // match but still makes its submatch choices the way
+  // Perl would, not in the way prescribed by POSIX.
+  // The POSIX rules are much more expensive to implement,
+  // and no one has needed them.
+  //
+  // kFullMatch is not strictly necessary -- we could use
+  // kLongestMatch and then check the length of the match -- but
+  // the matching code can run faster if it knows to consider only
+  // full matches.
+  enum MatchKind {
+    kFirstMatch,     // like Perl, PCRE
+    kLongestMatch,   // like egrep or POSIX
+    kFullMatch,      // match only entire text; implies anchor==kAnchored
+    kManyMatch       // for SearchDFA, records set of matches
+  };
+
+  Inst *inst(int id) { return &inst_[id]; }
+  int start() { return start_; }
+  int start_unanchored() { return start_unanchored_; }
+  void set_start(int start) { start_ = start; }
+  void set_start_unanchored(int start) { start_unanchored_ = start; }
+  int64 size() { return size_; }
+  bool reversed() { return reversed_; }
+  void set_reversed(bool reversed) { reversed_ = reversed; }
+  int64 byte_inst_count() { return byte_inst_count_; }
+  const Bitmap<256>& byterange() { return byterange_; }
+  void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
+  int64 dfa_mem() { return dfa_mem_; }
+  int flags() { return flags_; }
+  void set_flags(int flags) { flags_ = flags; }
+  bool anchor_start() { return anchor_start_; }
+  void set_anchor_start(bool b) { anchor_start_ = b; }
+  bool anchor_end() { return anchor_end_; }
+  void set_anchor_end(bool b) { anchor_end_ = b; }
+  int bytemap_range() { return bytemap_range_; }
+  const uint8* bytemap() { return bytemap_; }
+
+  // Returns string representation of program for debugging.
+  string Dump();
+  string DumpUnanchored();
+
+  // Record that at some point in the prog, the bytes in the range
+  // lo-hi (inclusive) are treated as different from bytes outside the range.
+  // Tracking this lets the DFA collapse commonly-treated byte ranges
+  // when recording state pointers, greatly reducing its memory footprint.
+  void MarkByteRange(int lo, int hi);
+
+  // Returns the set of kEmpty flags that are in effect at
+  // position p within context.
+  static uint32 EmptyFlags(const StringPiece& context, const char* p);
+
+  // Returns whether byte c is a word character: ASCII only.
+  // Used by the implementation of \b and \B.
+  // This is not right for Unicode, but:
+  //   - it's hard to get right in a byte-at-a-time matching world
+  //     (the DFA has only one-byte lookahead).
+  //   - even if the lookahead were possible, the Progs would be huge.
+  // This crude approximation is the same one PCRE uses.
+  static bool IsWordChar(uint8 c) {
+    if (c == '_')
+      return true;
+    const bool is_upper = 'A' <= c && c <= 'Z';
+    const bool is_lower = 'a' <= c && c <= 'z';
+    const bool is_digit = '0' <= c && c <= '9';
+    return is_upper || is_lower || is_digit;
+  }
+
+  // Execution engines.  They all search for the regexp (run the prog)
+  // in text, which is in the larger context (used for ^ $ \b etc).
+  // Anchor and kind control the kind of search.
+  // Returns true if match found, false if not.
+  // If match found, fills match[0..nmatch-1] with submatch info.
+  // match[0] is overall match, match[1] is first set of parens, etc.
+  // If a particular submatch is not matched during the regexp match,
+  // it is set to NULL.
+  //
+  // Matching text == StringPiece(NULL, 0) is treated as any other empty
+  // string, but note that on return, it will not be possible to distinguish
+  // submatches that matched that empty string from submatches that didn't
+  // match anything.  Either way, match[i] == NULL.
+
+  // Search using NFA: can find submatches but kind of slow.
+  bool SearchNFA(const StringPiece& text, const StringPiece& context,
+                 Anchor anchor, MatchKind kind,
+                 StringPiece* match, int nmatch);
+
+  // Search using DFA: much faster than NFA but only finds
+  // end of match and can use a lot more memory.
+  // Returns whether a match was found.
+  // If the DFA runs out of memory, sets *failed to true and returns false.
+  // If matches != NULL and kind == kManyMatch and there is a match,
+  // SearchDFA fills matches with the match IDs of the final matching state.
+  bool SearchDFA(const StringPiece& text, const StringPiece& context,
+                 Anchor anchor, MatchKind kind,
+                 StringPiece* match0, bool* failed,
+                 vector<int>* matches);
+
+  // Build the entire DFA for the given match kind.  FOR TESTING ONLY.
+  // Usually the DFA is built out incrementally, as needed, which
+  // avoids lots of unnecessary work.  This function is useful only
+  // for testing purposes.  Returns number of states.
+  int BuildEntireDFA(MatchKind kind);
+
+  // Compute byte map.
+  void ComputeByteMap();
+
+  // Run peep-hole optimizer on program.
+  void Optimize();
+
+  // One-pass NFA: only correct if IsOnePass() is true,
+  // but much faster than NFA (competitive with PCRE)
+  // for those expressions.
+  bool IsOnePass();
+  bool SearchOnePass(const StringPiece& text, const StringPiece& context,
+                     Anchor anchor, MatchKind kind,
+                     StringPiece* match, int nmatch);
+
+  // Bit-state backtracking.  Fast on small cases but uses memory
+  // proportional to the product of the program size and the text size.
+  bool SearchBitState(const StringPiece& text, const StringPiece& context,
+                      Anchor anchor, MatchKind kind,
+                      StringPiece* match, int nmatch);
+
+  static const int kMaxOnePassCapture = 5;  // $0 through $4
+
+  // Backtracking search: the gold standard against which the other
+  // implementations are checked.  FOR TESTING ONLY.
+  // It allocates a ton of memory to avoid running forever.
+  // It is also recursive, so can't use in production (will overflow stacks).
+  // The name "Unsafe" here is supposed to be a flag that
+  // you should not be using this function.
+  bool UnsafeSearchBacktrack(const StringPiece& text,
+                             const StringPiece& context,
+                             Anchor anchor, MatchKind kind,
+                             StringPiece* match, int nmatch);
+
+  // Computes range for any strings matching regexp. The min and max can in
+  // some cases be arbitrarily precise, so the caller gets to specify the
+  // maximum desired length of string returned.
+  //
+  // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
+  // string s that is an anchored match for this regexp satisfies
+  //   min <= s && s <= max.
+  //
+  // Note that PossibleMatchRange() will only consider the first copy of an
+  // infinitely repeated element (i.e., any regexp element followed by a '*' or
+  // '+' operator). Regexps with "{N}" constructions are not affected, as those
+  // do not compile down to infinite repetitions.
+  //
+  // Returns true on success, false on error.
+  bool PossibleMatchRange(string* min, string* max, int maxlen);
+
+  // Compiles a collection of regexps to Prog.  Each regexp will have
+  // its own Match instruction recording the index in the vector.
+  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
+                          Regexp* re);
+
+ private:
+  friend class Compiler;
+
+  DFA* GetDFA(MatchKind kind);
+
+  bool anchor_start_;       // regexp has explicit start anchor
+  bool anchor_end_;         // regexp has explicit end anchor
+  bool reversed_;           // whether program runs backward over input
+  bool did_onepass_;        // has IsOnePass been called?
+
+  int start_;               // entry point for program
+  int start_unanchored_;    // unanchored entry point for program
+  int size_;                // number of instructions
+  int byte_inst_count_;     // number of kInstByteRange instructions
+  int bytemap_range_;       // bytemap_[x] < bytemap_range_
+  int flags_;               // regexp parse flags
+  int onepass_statesize_;   // byte size of each OneState* node
+
+  Inst* inst_;              // pointer to instruction array
+
+  Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
+  DFA* volatile dfa_first_;     // DFA cached for kFirstMatch
+  DFA* volatile dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
+  int64 dfa_mem_;      // Maximum memory for DFAs.
+  void (*delete_dfa_)(DFA* dfa);
+
+  Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
+                             // commonly-treated byte range.
+  uint8 bytemap_[256];       // map from input bytes to byte classes
+  uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x
+
+  uint8* onepass_nodes_;     // data for OnePass nodes
+  OneState* onepass_start_;  // start node for OnePass program
+
+  DISALLOW_EVIL_CONSTRUCTORS(Prog);
+};
+
+}  // namespace re2
+
+#endif  // RE2_PROG_H__
--- a/re2/re2/re2.cc
+++ b/re2/re2/re2.cc
--- a/re2/re2/re2.h
+++ b/re2/re2/re2.h
@ -0,0 +1,837 @@
+// Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_RE2_H
+#define RE2_RE2_H
+
+// C++ interface to the re2 regular-expression library.
+// RE2 supports Perl-style regular expressions (with extensions like
+// \d, \w, \s, ...).
+//
+// -----------------------------------------------------------------------
+// REGEXP SYNTAX:
+//
+// This module uses the re2 library and hence supports
+// its syntax for regular expressions, which is similar to Perl's with
+// some of the more complicated things thrown away.  In particular,
+// backreferences and generalized assertions are not available, nor is \Z.
+//
+// See https://github.com/google/re2/wiki/Syntax for the syntax
+// supported by RE2, and a comparison with PCRE and PERL regexps.
+//
+// For those not familiar with Perl's regular expressions,
+// here are some examples of the most commonly used extensions:
+//
+//   "hello (\\w+) world"  -- \w matches a "word" character
+//   "version (\\d+)"      -- \d matches a digit
+//   "hello\\s+world"      -- \s matches any whitespace character
+//   "\\b(\\w+)\\b"        -- \b matches the empty string at a word boundary
+//   "(?i)hello"           -- (?i) turns on case-insensitive matching
+//   "/\\*(.*?)\\*/"       -- .*? matches . minimum no. of times possible
+//
+// -----------------------------------------------------------------------
+// MATCHING INTERFACE:
+//
+// The "FullMatch" operation checks that supplied text matches a
+// supplied pattern exactly.
+//
+// Example: successful match
+//    CHECK(RE2::FullMatch("hello", "h.*o"));
+//
+// Example: unsuccessful match (requires full match):
+//    CHECK(!RE2::FullMatch("hello", "e"));
+//
+// -----------------------------------------------------------------------
+// UTF-8 AND THE MATCHING INTERFACE:
+//
+// By default, the pattern and input text are interpreted as UTF-8.
+// The RE2::Latin1 option causes them to be interpreted as Latin-1.
+//
+// Example:
+//    CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
+//    CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
+//
+// -----------------------------------------------------------------------
+// MATCHING WITH SUB-STRING EXTRACTION:
+//
+// You can supply extra pointer arguments to extract matched subpieces.
+//
+// Example: extracts "ruby" into "s" and 1234 into "i"
+//    int i;
+//    string s;
+//    CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
+//
+// Example: fails because string cannot be stored in integer
+//    CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
+//
+// Example: fails because there aren't enough sub-patterns:
+//    CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
+//
+// Example: does not try to extract any extra sub-patterns
+//    CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
+//
+// Example: does not try to extract into NULL
+//    CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
+//
+// Example: integer overflow causes failure
+//    CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
+//
+// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
+// This may get a little faster in the future, but right now is slower
+// than PCRE.  On the other hand, failed matches run *very* fast (faster
+// than PCRE), as do matches without substring extraction.
+//
+// -----------------------------------------------------------------------
+// PARTIAL MATCHES
+//
+// You can use the "PartialMatch" operation when you want the pattern
+// to match any substring of the text.
+//
+// Example: simple search for a string:
+//      CHECK(RE2::PartialMatch("hello", "ell"));
+//
+// Example: find first number in a string
+//      int number;
+//      CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
+//      CHECK_EQ(number, 100);
+//
+// -----------------------------------------------------------------------
+// PRE-COMPILED REGULAR EXPRESSIONS
+//
+// RE2 makes it easy to use any string as a regular expression, without
+// requiring a separate compilation step.
+//
+// If speed is of the essence, you can create a pre-compiled "RE2"
+// object from the pattern and use it multiple times.  If you do so,
+// you can typically parse text faster than with sscanf.
+//
+// Example: precompile pattern for faster matching:
+//    RE2 pattern("h.*o");
+//    while (ReadLine(&str)) {
+//      if (RE2::FullMatch(str, pattern)) ...;
+//    }
+//
+// -----------------------------------------------------------------------
+// SCANNING TEXT INCREMENTALLY
+//
+// The "Consume" operation may be useful if you want to repeatedly
+// match regular expressions at the front of a string and skip over
+// them as they match.  This requires use of the "StringPiece" type,
+// which represents a sub-range of a real string.
+//
+// Example: read lines of the form "var = value" from a string.
+//      string contents = ...;          // Fill string somehow
+//      StringPiece input(contents);    // Wrap a StringPiece around it
+//
+//      string var;
+//      int value;
+//      while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
+//        ...;
+//      }
+//
+// Each successful call to "Consume" will set "var/value", and also
+// advance "input" so it points past the matched text.  Note that if the
+// regular expression matches an empty string, input will advance
+// by 0 bytes.  If the regular expression being used might match
+// an empty string, the loop body must check for this case and either
+// advance the string or break out of the loop.
+//
+// The "FindAndConsume" operation is similar to "Consume" but does not
+// anchor your match at the beginning of the string.  For example, you
+// could extract all words from a string by repeatedly calling
+//     RE2::FindAndConsume(&input, "(\\w+)", &word)
+//
+// -----------------------------------------------------------------------
+// USING VARIABLE NUMBER OF ARGUMENTS
+//
+// The above operations require you to know the number of arguments
+// when you write the code.  This is not always possible or easy (for
+// example, the regular expression may be calculated at run time).
+// You can use the "N" version of the operations when the number of
+// match arguments is determined at run time.
+//
+// Example:
+//   const RE2::Arg* args[10];
+//   int n;
+//   // ... populate args with pointers to RE2::Arg values ...
+//   // ... set n to the number of RE2::Arg objects ...
+//   bool match = RE2::FullMatchN(input, pattern, args, n);
+//
+// The last statement is equivalent to
+//
+//   bool match = RE2::FullMatch(input, pattern,
+//                               *args[0], *args[1], ..., *args[n - 1]);
+//
+// -----------------------------------------------------------------------
+// PARSING HEX/OCTAL/C-RADIX NUMBERS
+//
+// By default, if you pass a pointer to a numeric value, the
+// corresponding text is interpreted as a base-10 number.  You can
+// instead wrap the pointer with a call to one of the operators Hex(),
+// Octal(), or CRadix() to interpret the text in another base.  The
+// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
+// prefixes, but defaults to base-10.
+//
+// Example:
+//   int a, b, c, d;
+//   CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
+//         RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
+// will leave 64 in a, b, c, and d.
+
+
+#include <stdint.h>
+#include <map>
+#include <string>
+#include "re2/stringpiece.h"
+#include "re2/variadic_function.h"
+
+namespace re2 {
+using std::string;
+using std::map;
+class Mutex;
+class Prog;
+class Regexp;
+
+// Interface for regular expression matching.  Also corresponds to a
+// pre-compiled regular expression.  An "RE2" object is safe for
+// concurrent use by multiple threads.
+class RE2 {
+ public:
+  // We convert user-passed pointers into special Arg objects
+  class Arg;
+  class Options;
+
+  // Defined in set.h.
+  class Set;
+
+  // Result of compiling a pattern; see error_code().
+  enum ErrorCode {
+    NoError = 0,
+
+    // Unexpected error
+    ErrorInternal,
+
+    // Parse errors
+    ErrorBadEscape,          // bad escape sequence
+    ErrorBadCharClass,       // bad character class
+    ErrorBadCharRange,       // bad character class range
+    ErrorMissingBracket,     // missing closing ]
+    ErrorMissingParen,       // missing closing )
+    ErrorTrailingBackslash,  // trailing \ at end of regexp
+    ErrorRepeatArgument,     // repeat argument missing, e.g. "*"
+    ErrorRepeatSize,         // bad repetition argument
+    ErrorRepeatOp,           // bad repetition operator
+    ErrorBadPerlOp,          // bad perl operator
+    ErrorBadUTF8,            // invalid UTF-8 in regexp
+    ErrorBadNamedCapture,    // bad named capture group
+    ErrorPatternTooLarge,    // pattern too large (compile failed)
+  };
+
+  // Predefined common options.
+  // If you need more complicated things, instantiate
+  // an Options object, change the settings, and pass it to the
+  // RE2 constructor.
+  static const Options DefaultOptions;
+  static const Options Latin1; // treat input as Latin-1 (default UTF-8)
+  static const Options POSIX;  // POSIX syntax, leftmost-longest match
+  static const Options Quiet;  // do not log about regexp parse errors
+
+  // Need to have the const char* and const string& forms for implicit
+  // conversions when passing string literals to FullMatch and PartialMatch.
+  // Otherwise the StringPiece form would be sufficient.
+#ifndef SWIG
+  RE2(const char* pattern);
+  RE2(const string& pattern);
+#endif
+  RE2(const StringPiece& pattern);
+  RE2(const StringPiece& pattern, const Options& option);
+  ~RE2();
+
+  // Returns whether RE2 was created properly.
+  bool ok() const { return error_code() == NoError; }
+
+  // The string specification for this RE2.  E.g.
+  //   RE2 re("ab*c?d+");
+  //   re.pattern();    // "ab*c?d+"
+  const string& pattern() const { return pattern_; }
+
+  // If RE2 could not be created properly, returns an error string.
+  // Else returns the empty string.
+  const string& error() const { return *error_; }
+
+  // If RE2 could not be created properly, returns an error code.
+  // Else returns RE2::NoError (== 0).
+  ErrorCode error_code() const { return error_code_; }
+
+  // If RE2 could not be created properly, returns the offending
+  // portion of the regexp.
+  const string& error_arg() const { return error_arg_; }
+
+  // Returns the program size, a very approximate measure of a regexp's "cost".
+  // Larger numbers are more expensive than smaller numbers.
+  int ProgramSize() const;
+
+  // Returns the underlying Regexp; not for general use.
+  // Returns entire_regexp_ so that callers don't need
+  // to know about prefix_ and prefix_foldcase_.
+  re2::Regexp* Regexp() const { return entire_regexp_; }
+
+  /***** The useful part: the matching interface *****/
+
+  // Matches "text" against "pattern".  If pointer arguments are
+  // supplied, copies matched sub-patterns into them.
+  //
+  // You can pass in a "const char*" or a "string" for "text".
+  // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
+  //
+  // The provided pointer arguments can be pointers to any scalar numeric
+  // type, or one of:
+  //    string          (matched piece is copied to string)
+  //    StringPiece     (StringPiece is mutated to point to matched piece)
+  //    T               (where "bool T::ParseFrom(const char*, int)" exists)
+  //    (void*)NULL     (the corresponding matched sub-pattern is not copied)
+  //
+  // Returns true iff all of the following conditions are satisfied:
+  //   a. "text" matches "pattern" exactly
+  //   b. The number of matched sub-patterns is >= number of supplied pointers
+  //   c. The "i"th argument has a suitable type for holding the
+  //      string captured as the "i"th sub-pattern.  If you pass in
+  //      NULL for the "i"th argument, or pass fewer arguments than
+  //      number of sub-patterns, "i"th captured sub-pattern is
+  //      ignored.
+  //
+  // CAVEAT: An optional sub-pattern that does not exist in the
+  // matched string is assigned the empty string.  Therefore, the
+  // following will return false (because the empty string is not a
+  // valid number):
+  //    int number;
+  //    RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
+  static bool FullMatchN(const StringPiece& text, const RE2& re,
+                         const Arg* const args[], int argc);
+  static const VariadicFunction2<
+      bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
+
+  // Exactly like FullMatch(), except that "pattern" is allowed to match
+  // a substring of "text".
+  static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
+                            const Arg* const args[], int argc);
+  static const VariadicFunction2<
+      bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
+
+  // Like FullMatch() and PartialMatch(), except that pattern has to
+  // match a prefix of "text", and "input" is advanced past the matched
+  // text.  Note: "input" is modified iff this routine returns true.
+  static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
+                       const Arg* const args[], int argc);
+  static const VariadicFunction2<
+      bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
+
+  // Like Consume(..), but does not anchor the match at the beginning of the
+  // string.  That is, "pattern" need not start its match at the beginning of
+  // "input".  For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
+  // word in "s" and stores it in "word".
+  static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
+                             const Arg* const args[], int argc);
+  static const VariadicFunction2<
+      bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
+
+  // Replace the first match of "pattern" in "str" with "rewrite".
+  // Within "rewrite", backslash-escaped digits (\1 to \9) can be
+  // used to insert text matching corresponding parenthesized group
+  // from the pattern.  \0 in "rewrite" refers to the entire matching
+  // text.  E.g.,
+  //
+  //   string s = "yabba dabba doo";
+  //   CHECK(RE2::Replace(&s, "b+", "d"));
+  //
+  // will leave "s" containing "yada dabba doo"
+  //
+  // Returns true if the pattern matches and a replacement occurs,
+  // false otherwise.
+  static bool Replace(string *str,
+                      const RE2& pattern,
+                      const StringPiece& rewrite);
+
+  // Like Replace(), except replaces successive non-overlapping occurrences
+  // of the pattern in the string with the rewrite. E.g.
+  //
+  //   string s = "yabba dabba doo";
+  //   CHECK(RE2::GlobalReplace(&s, "b+", "d"));
+  //
+  // will leave "s" containing "yada dada doo"
+  // Replacements are not subject to re-matching.
+  //
+  // Because GlobalReplace only replaces non-overlapping matches,
+  // replacing "ana" within "banana" makes only one replacement, not two.
+  //
+  // Returns the number of replacements made.
+  static int GlobalReplace(string *str,
+                           const RE2& pattern,
+                           const StringPiece& rewrite);
+
+  // Like Replace, except that if the pattern matches, "rewrite"
+  // is copied into "out" with substitutions.  The non-matching
+  // portions of "text" are ignored.
+  //
+  // Returns true iff a match occurred and the extraction happened
+  // successfully;  if no match occurs, the string is left unaffected.
+  static bool Extract(const StringPiece &text,
+                      const RE2& pattern,
+                      const StringPiece &rewrite,
+                      string *out);
+
+  // Escapes all potentially meaningful regexp characters in
+  // 'unquoted'.  The returned string, used as a regular expression,
+  // will exactly match the original string.  For example,
+  //           1.5-2.0?
+  // may become:
+  //           1\.5\-2\.0\?
+  static string QuoteMeta(const StringPiece& unquoted);
+
+  // Computes range for any strings matching regexp. The min and max can in
+  // some cases be arbitrarily precise, so the caller gets to specify the
+  // maximum desired length of string returned.
+  //
+  // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
+  // string s that is an anchored match for this regexp satisfies
+  //   min <= s && s <= max.
+  //
+  // Note that PossibleMatchRange() will only consider the first copy of an
+  // infinitely repeated element (i.e., any regexp element followed by a '*' or
+  // '+' operator). Regexps with "{N}" constructions are not affected, as those
+  // do not compile down to infinite repetitions.
+  //
+  // Returns true on success, false on error.
+  bool PossibleMatchRange(string* min, string* max, int maxlen) const;
+
+  // Generic matching interface
+
+  // Type of match.
+  enum Anchor {
+    UNANCHORED,         // No anchoring
+    ANCHOR_START,       // Anchor at start only
+    ANCHOR_BOTH,        // Anchor at start and end
+  };
+
+  // Return the number of capturing subpatterns, or -1 if the
+  // regexp wasn't valid on construction.  The overall match ($0)
+  // does not count: if the regexp is "(a)(b)", returns 2.
+  int NumberOfCapturingGroups() const;
+
+
+  // Return a map from names to capturing indices.
+  // The map records the index of the leftmost group
+  // with the given name.
+  // Only valid until the re is deleted.
+  const map<string, int>& NamedCapturingGroups() const;
+
+  // Return a map from capturing indices to names.
+  // The map has no entries for unnamed groups.
+  // Only valid until the re is deleted.
+  const map<int, string>& CapturingGroupNames() const;
+
+  // General matching routine.
+  // Match against text starting at offset startpos
+  // and stopping the search at offset endpos.
+  // Returns true if match found, false if not.
+  // On a successful match, fills in match[] (up to nmatch entries)
+  // with information about submatches.
+  // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
+  // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
+  // match[3] = NULL, ..., up to match[nmatch-1] = NULL.
+  //
+  // Don't ask for more match information than you will use:
+  // runs much faster with nmatch == 1 than nmatch > 1, and
+  // runs even faster if nmatch == 0.
+  // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
+  // but will be handled correctly.
+  //
+  // Passing text == StringPiece(NULL, 0) will be handled like any other
+  // empty string, but note that on return, it will not be possible to tell
+  // whether submatch i matched the empty string or did not match:
+  // either way, match[i] == NULL.
+  bool Match(const StringPiece& text,
+             int startpos,
+             int endpos,
+             Anchor anchor,
+             StringPiece *match,
+             int nmatch) const;
+
+  // Check that the given rewrite string is suitable for use with this
+  // regular expression.  It checks that:
+  //   * The regular expression has enough parenthesized subexpressions
+  //     to satisfy all of the \N tokens in rewrite
+  //   * The rewrite string doesn't have any syntax errors.  E.g.,
+  //     '\' followed by anything other than a digit or '\'.
+  // A true return value guarantees that Replace() and Extract() won't
+  // fail because of a bad rewrite string.
+  bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
+
+  // Constructor options
+  class Options {
+   public:
+    // The options are (defaults in parentheses):
+    //
+    //   utf8             (true)  text and pattern are UTF-8; otherwise Latin-1
+    //   posix_syntax     (false) restrict regexps to POSIX egrep syntax
+    //   longest_match    (false) search for longest match, not first match
+    //   log_errors       (true)  log syntax and execution errors to ERROR
+    //   max_mem          (see below)  approx. max memory footprint of RE2
+    //   literal          (false) interpret string as literal, not regexp
+    //   never_nl         (false) never match \n, even if it is in regexp
+    //   case_sensitive   (true)  match is case-sensitive (regexp can override
+    //                              with (?i) unless in posix_syntax mode)
+    //
+    // The following options are only consulted when posix_syntax == true.
+    // (When posix_syntax == false these features are always enabled and
+    // cannot be turned off.)
+    //   perl_classes     (false) allow Perl's \d \s \w \D \S \W
+    //   word_boundary    (false) allow Perl's \b \B (word boundary and not)
+    //   one_line         (false) ^ and $ only match beginning and end of text
+    //
+    // The max_mem option controls how much memory can be used
+    // to hold the compiled form of the regexp (the Prog) and
+    // its cached DFA graphs.  Code Search placed limits on the number
+    // of Prog instructions and DFA states: 10,000 for both.
+    // In RE2, those limits would translate to about 240 KB per Prog
+    // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
+    // better job of keeping them small than Code Search did).
+    // Each RE2 has two Progs (one forward, one reverse), and each Prog
+    // can have two DFAs (one first match, one longest match).
+    // That makes 4 DFAs:
+    //
+    //   forward, first-match    - used for UNANCHORED or ANCHOR_START searches
+    //                               if opt.longest_match() == false
+    //   forward, longest-match  - used for all ANCHOR_BOTH searches,
+    //                               and the other two kinds if
+    //                               opt.longest_match() == true
+    //   reverse, first-match    - never used
+    //   reverse, longest-match  - used as second phase for unanchored searches
+    //
+    // The RE2 memory budget is statically divided between the two
+    // Progs and then the DFAs: two thirds to the forward Prog
+    // and one third to the reverse Prog.  The forward Prog gives half
+    // of what it has left over to each of its DFAs.  The reverse Prog
+    // gives it all to its longest-match DFA.
+    //
+    // Once a DFA fills its budget, it flushes its cache and starts over.
+    // If this happens too often, RE2 falls back on the NFA implementation.
+
+    // For now, make the default budget something close to Code Search.
+    // (8<<20 bytes, i.e. 8 MiB.)
+    static const int kDefaultMaxMem = 8<<20;
+
+    enum Encoding {
+      EncodingUTF8 = 1,
+      EncodingLatin1
+    };
+
+    // Default options: UTF-8, Perl-style syntax, first match, errors logged.
+    Options() :
+      encoding_(EncodingUTF8),
+      posix_syntax_(false),
+      longest_match_(false),
+      log_errors_(true),
+      max_mem_(kDefaultMaxMem),
+      literal_(false),
+      never_nl_(false),
+      case_sensitive_(true),
+      perl_classes_(false),
+      word_boundary_(false),
+      one_line_(false) {
+    }
+
+    Encoding encoding() const { return encoding_; }
+    void set_encoding(Encoding encoding) { encoding_ = encoding; }
+
+    // Legacy interface to encoding.
+    // TODO(rsc): Remove once clients have been converted.
+    bool utf8() const { return encoding_ == EncodingUTF8; }
+    void set_utf8(bool b) {
+      if (b) {
+        encoding_ = EncodingUTF8;
+      } else {
+        encoding_ = EncodingLatin1;
+      }
+    }
+
+    bool posix_syntax() const { return posix_syntax_; }
+    void set_posix_syntax(bool b) { posix_syntax_ = b; }
+
+    bool longest_match() const { return longest_match_; }
+    void set_longest_match(bool b) { longest_match_ = b; }
+
+    bool log_errors() const { return log_errors_; }
+    void set_log_errors(bool b) { log_errors_ = b; }
+
+    // The budget is stored as a 64-bit value, so the accessors use
+    // int64_t too; with plain int a budget of 2 GiB or more would be
+    // truncated on the way in and on the way out.
+    int64_t max_mem() const { return max_mem_; }
+    void set_max_mem(int64_t m) { max_mem_ = m; }
+
+    bool literal() const { return literal_; }
+    void set_literal(bool b) { literal_ = b; }
+
+    bool never_nl() const { return never_nl_; }
+    void set_never_nl(bool b) { never_nl_ = b; }
+
+    bool case_sensitive() const { return case_sensitive_; }
+    void set_case_sensitive(bool b) { case_sensitive_ = b; }
+
+    bool perl_classes() const { return perl_classes_; }
+    void set_perl_classes(bool b) { perl_classes_ = b; }
+
+    bool word_boundary() const { return word_boundary_; }
+    void set_word_boundary(bool b) { word_boundary_ = b; }
+
+    bool one_line() const { return one_line_; }
+    void set_one_line(bool b) { one_line_ = b; }
+
+    // Copies every setting from src.  This is the way to duplicate an
+    // Options object: the copy constructor and assignment operator are
+    // declared private below.
+    void Copy(const Options& src) {
+      encoding_ = src.encoding_;
+      posix_syntax_ = src.posix_syntax_;
+      longest_match_ = src.longest_match_;
+      log_errors_ = src.log_errors_;
+      max_mem_ = src.max_mem_;
+      literal_ = src.literal_;
+      never_nl_ = src.never_nl_;
+      case_sensitive_ = src.case_sensitive_;
+      perl_classes_ = src.perl_classes_;
+      word_boundary_ = src.word_boundary_;
+      one_line_ = src.one_line_;
+    }
+
+    // Declared here, implemented outside this header.
+    // NOTE(review): presumably translates these settings into the
+    // parser's flag bits - confirm against the definition.
+    int ParseFlags() const;
+
+   private:
+    // Private constructor for defining constants like RE2::Latin1.
+    // Settings not named in the parameter list get the same values as in
+    // the default constructor.
+    friend class RE2;
+    Options(Encoding encoding,
+            bool posix_syntax,
+            bool longest_match,
+            bool log_errors) :
+      encoding_(encoding),
+      posix_syntax_(posix_syntax),
+      longest_match_(longest_match),
+      log_errors_(log_errors),
+      max_mem_(kDefaultMaxMem),
+      literal_(false),
+      never_nl_(false),
+      case_sensitive_(true),
+      perl_classes_(false),
+      word_boundary_(false),
+      one_line_(false) {
+    }
+
+    Encoding encoding_;
+    bool posix_syntax_;
+    bool longest_match_;
+    bool log_errors_;
+    int64_t max_mem_;
+    bool literal_;
+    bool never_nl_;
+    bool case_sensitive_;
+    bool perl_classes_;
+    bool word_boundary_;
+    bool one_line_;
+
+    //DISALLOW_EVIL_CONSTRUCTORS(Options);
+    Options(const Options&);
+    void operator=(const Options&);
+  };
+
+  // Returns the options set in the constructor.
+  const Options& options() const { return options_; }
+
+  // Argument converters; see below.
+  static inline Arg CRadix(short* x);
+  static inline Arg CRadix(unsigned short* x);
+  static inline Arg CRadix(int* x);
+  static inline Arg CRadix(unsigned int* x);
+  static inline Arg CRadix(long* x);
+  static inline Arg CRadix(unsigned long* x);
+  static inline Arg CRadix(long long* x);
+  static inline Arg CRadix(unsigned long long* x);
+
+  static inline Arg Hex(short* x);
+  static inline Arg Hex(unsigned short* x);
+  static inline Arg Hex(int* x);
+  static inline Arg Hex(unsigned int* x);
+  static inline Arg Hex(long* x);
+  static inline Arg Hex(unsigned long* x);
+  static inline Arg Hex(long long* x);
+  static inline Arg Hex(unsigned long long* x);
+
+  static inline Arg Octal(short* x);
+  static inline Arg Octal(unsigned short* x);
+  static inline Arg Octal(int* x);
+  static inline Arg Octal(unsigned int* x);
+  static inline Arg Octal(long* x);
+  static inline Arg Octal(unsigned long* x);
+  static inline Arg Octal(long long* x);
+  static inline Arg Octal(unsigned long long* x);
+
+ private:
+  // The helpers below are only declared in this header.
+  // NOTE(review): Init() is presumably the set-up code shared by the
+  // constructors - confirm against the definition.
+  void Init(const StringPiece& pattern, const Options& options);
+
+  // NOTE(review): presumably expands "rewrite" into *out using the
+  // veclen submatches in vec - confirm against the definition.
+  bool Rewrite(string *out,
+               const StringPiece &rewrite,
+               const StringPiece* vec,
+               int veclen) const;
+
+  // NOTE(review): presumably the common implementation behind
+  // FullMatchN() and friends - confirm against the definition.
+  bool DoMatch(const StringPiece& text,
+                   Anchor anchor,
+                   int* consumed,
+                   const Arg* const args[],
+                   int n) const;
+
+  re2::Prog* ReverseProg() const;
+
+  mutable Mutex*           mutex_;
+  string                   pattern_;       // string regular expression
+  Options                  options_;       // option flags
+  string        prefix_;           // required prefix (before regexp_)
+  bool          prefix_foldcase_;  // prefix is ASCII case-insensitive
+  re2::Regexp*  entire_regexp_;    // parsed regular expression
+  re2::Regexp*  suffix_regexp_;    // parsed regular expression, prefix removed
+  re2::Prog*    prog_;             // compiled program for regexp
+  mutable re2::Prog* rprog_;       // reverse program for regexp
+  bool                     is_one_pass_;   // can use prog_->SearchOnePass?
+  mutable const string*    error_;         // Error indicator
+                                           // (or points to empty string)
+  mutable ErrorCode        error_code_;    // Error code
+  mutable string           error_arg_;     // Fragment of regexp showing error
+  mutable int              num_captures_;  // Number of capturing groups
+
+  // Map from capture names to indices
+  mutable const map<string, int>* named_groups_;
+
+  // Map from capture indices to names
+  mutable const map<int, string>* group_names_;
+
+  //DISALLOW_EVIL_CONSTRUCTORS(RE2);
+  RE2(const RE2&);
+  void operator=(const RE2&);
+};
+
+/***** Implementation details *****/
+
+// Hex/Octal/Binary?
+
+// Adapter that lets a capture be parsed into any user type T that
+// provides a ParseFrom(const char*, int) method.
+template <class T>
+class _RE2_MatchObject {
+ public:
+  // Hands the n bytes at str to ParseFrom() on the T object behind dest and
+  // returns its result.  A NULL dest is accepted and reported as success
+  // without calling ParseFrom().
+  static inline bool Parse(const char* str, int n, void* dest) {
+    T* const target = reinterpret_cast<T*>(dest);
+    return target == NULL || target->ParseFrom(str, n);
+  }
+};
+
+// Pairs a destination pointer (arg_) with the function that knows how to
+// parse text into it (parser_).  Parse() simply invokes parser_ on arg_.
+class RE2::Arg {
+ public:
+  // Empty constructor so we can declare arrays of RE2::Arg
+  Arg();
+
+  // Constructor specially designed for NULL arguments
+  Arg(void*);
+
+  // Signature shared by all parsers: the n bytes at str are the text to
+  // convert and dest is the destination pointer stored in the Arg.
+  typedef bool (*Parser)(const char* str, int n, void* dest);
+
+// Type-specific parsers
+// For each listed type, MAKE_PARSER defines two constructors: one that
+// binds the default parser "name" for that type, and one that takes a
+// caller-supplied parser.
+#define MAKE_PARSER(type,name) \
+  Arg(type* p) : arg_(p), parser_(name) { } \
+  Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
+
+
+  // Note that char and signed char share parse_char.
+  MAKE_PARSER(char,               parse_char);
+  MAKE_PARSER(signed char,        parse_char);
+  MAKE_PARSER(unsigned char,      parse_uchar);
+  MAKE_PARSER(short,              parse_short);
+  MAKE_PARSER(unsigned short,     parse_ushort);
+  MAKE_PARSER(int,                parse_int);
+  MAKE_PARSER(unsigned int,       parse_uint);
+  MAKE_PARSER(long,               parse_long);
+  MAKE_PARSER(unsigned long,      parse_ulong);
+  MAKE_PARSER(long long,          parse_longlong);
+  MAKE_PARSER(unsigned long long, parse_ulonglong);
+  MAKE_PARSER(float,              parse_float);
+  MAKE_PARSER(double,             parse_double);
+  MAKE_PARSER(string,             parse_string);
+  MAKE_PARSER(StringPiece,        parse_stringpiece);
+
+#undef MAKE_PARSER
+
+  // Generic constructor
+  // (declared only; no definition appears in this class body)
+  template <class T> Arg(T*, Parser parser);
+  // Generic constructor template
+  // Used for any other pointee type T; parsing is delegated to
+  // _RE2_MatchObject<T>::Parse.
+  template <class T> Arg(T* p)
+    : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
+  }
+
+  // Parse the data
+  bool Parse(const char* str, int n) const;
+
+ private:
+  void*         arg_;     // destination handed to parser_ as "dest"
+  Parser        parser_;  // conversion routine selected by the constructor
+
+  static bool parse_null          (const char* str, int n, void* dest);
+  static bool parse_char          (const char* str, int n, void* dest);
+  static bool parse_uchar         (const char* str, int n, void* dest);
+  static bool parse_float         (const char* str, int n, void* dest);
+  static bool parse_double        (const char* str, int n, void* dest);
+  static bool parse_string        (const char* str, int n, void* dest);
+  static bool parse_stringpiece   (const char* str, int n, void* dest);
+
+// For each integer type name, DECLARE_INTEGER_PARSER declares the private
+// parse_<name> and parse_<name>_radix functions and the public
+// parse_<name>_hex, parse_<name>_octal and parse_<name>_cradix functions.
+// The macro switches access specifiers itself and leaves the class in
+// "public:" mode after each use.
+#define DECLARE_INTEGER_PARSER(name)                                        \
+ private:                                                                   \
+  static bool parse_ ## name(const char* str, int n, void* dest);           \
+  static bool parse_ ## name ## _radix(                                     \
+    const char* str, int n, void* dest, int radix);                         \
+ public:                                                                    \
+  static bool parse_ ## name ## _hex(const char* str, int n, void* dest);   \
+  static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
+  static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
+
+  DECLARE_INTEGER_PARSER(short);
+  DECLARE_INTEGER_PARSER(ushort);
+  DECLARE_INTEGER_PARSER(int);
+  DECLARE_INTEGER_PARSER(uint);
+  DECLARE_INTEGER_PARSER(long);
+  DECLARE_INTEGER_PARSER(ulong);
+  DECLARE_INTEGER_PARSER(longlong);
+  DECLARE_INTEGER_PARSER(ulonglong);
+
+#undef DECLARE_INTEGER_PARSER
+};
+
+// Default-constructed Arg: no destination, paired with parse_null.
+inline RE2::Arg::Arg()
+    : arg_(NULL),
+      parser_(parse_null) {
+}
+// Arg built from an untyped pointer: keeps p and pairs it with parse_null.
+inline RE2::Arg::Arg(void* p)
+    : arg_(p),
+      parser_(parse_null) {
+}
+
+// Runs the stored parser on the n bytes at str, writing through the stored
+// destination pointer, and returns the parser's verdict.
+inline bool RE2::Arg::Parse(const char* str, int n) const {
+  const bool parsed = parser_(str, n, arg_);
+  return parsed;
+}
+
+// This part of the parser, appropriate only for ints, deals with bases.
+// For each integer type, MAKE_INTEGER_PARSER defines the inline
+// RE2::Hex(), RE2::Octal() and RE2::CRadix() overloads.  Each one returns
+// an Arg that pairs the pointer with the matching
+// RE2::Arg::parse_<name>_hex / _octal / _cradix function.
+#define MAKE_INTEGER_PARSER(type, name) \
+  inline RE2::Arg RE2::Hex(type* ptr) { \
+    return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
+  inline RE2::Arg RE2::Octal(type* ptr) { \
+    return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
+  inline RE2::Arg RE2::CRadix(type* ptr) { \
+    return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
+
+// "type" is the C++ type; "name" is the suffix used in the parser names.
+MAKE_INTEGER_PARSER(short,              short);
+MAKE_INTEGER_PARSER(unsigned short,     ushort);
+MAKE_INTEGER_PARSER(int,                int);
+MAKE_INTEGER_PARSER(unsigned int,       uint);
+MAKE_INTEGER_PARSER(long,               long);
+MAKE_INTEGER_PARSER(unsigned long,      ulong);
+MAKE_INTEGER_PARSER(long long,          longlong);
+MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
+
+#undef MAKE_INTEGER_PARSER
+
+}  // namespace re2
+
+using re2::RE2;
+
+#endif /* RE2_RE2_H */
--- a/re2/re2/regexp.cc
+++ b/re2/re2/regexp.cc
@ -0,0 +1,920 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression representation.
+// Tested by parse_test.cc
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/stringpiece.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Builds a node for operator "op": reference count 1, no sub-expressions,
+// "simple" flag cleared, and the operator-specific storage zero-filled.
+Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
+    : op_(op),
+      simple_(false),
+      parse_flags_(static_cast<uint16>(parse_flags)),
+      ref_(1),
+      nsub_(0),
+      down_(NULL) {
+  // Wipe the per-operator payload, then clear the single-child slot.
+  memset(the_union_, 0, sizeof(the_union_));
+  subone_ = NULL;
+}
+
+// Destructor.  Assumes already cleaned up children.
+// Private: use Decref() instead of delete to destroy Regexps.
+// Can't call Decref on the sub-Regexps here because
+// that could cause arbitrarily deep recursion, so
+// required Decref() to have handled them for us.
+//
+// Only the operator-specific payload owned by this node is released here:
+//   kRegexpCapture        - name_
+//   kRegexpLiteralString  - the runes_ array
+//   kRegexpCharClass      - cc_ (through its Delete() method) and ccb_
+Regexp::~Regexp() {
+  if (nsub_ > 0)
+    LOG(DFATAL) << "Regexp not destroyed.";
+
+  switch (op_) {
+    default:
+      break;
+    case kRegexpCapture:
+      delete name_;
+      break;
+    case kRegexpLiteralString:
+      delete[] runes_;
+      break;
+    case kRegexpCharClass:
+      // The constructor zero-fills the payload, so cc_ can still be NULL
+      // here.  Calling a member function through a null pointer is
+      // undefined behavior, so Delete() is only invoked on a real object.
+      if (cc_ != NULL)
+        cc_->Delete();
+      delete ccb_;
+      break;
+  }
+}
+
+// Fast path for destruction.  A node without sub-expressions is deleted on
+// the spot and true is returned (the object must not be touched
+// afterwards).  A node that still has sub-expressions is left alone and
+// false is returned.
+bool Regexp::QuickDestroy() {
+  if (nsub_ != 0)
+    return false;
+
+  delete this;
+  return true;
+}
+
+// Overflow storage for reference counts: once a node's ref_ field has
+// reached kMaxRef, its real count is kept in ref_map instead.
+// ref_mutex is held around the ref_map accesses in Ref(), Incref() and
+// Decref().
+static map<Regexp*, int> ref_map;
+static Mutex ref_mutex;
+
+// Returns the current reference count.  Counts below kMaxRef are stored in
+// ref_ itself; otherwise the count is looked up in ref_map under ref_mutex.
+int Regexp::Ref() {
+  if (ref_ >= kMaxRef) {
+    MutexLock l(&ref_mutex);
+    return ref_map[this];
+  }
+
+  return ref_;
+}
+
+// Adds one reference and returns this object for convenience.
+// While the count stays below kMaxRef it lives in ref_; the increment that
+// would reach kMaxRef moves the count into ref_map, and from then on only
+// the map entry is updated.
+Regexp* Regexp::Incref() {
+  if (ref_ < kMaxRef-1) {
+    // Common case: plain in-object counter.
+    ref_++;
+    return this;
+  }
+
+  // The count is (or is about to be) stored in the overflow map.
+  MutexLock l(&ref_mutex);
+  if (ref_ == kMaxRef) {
+    // Already overflowed earlier.
+    ref_map[this]++;
+  } else {
+    // Overflowing right now.
+    ref_map[this] = kMaxRef;
+    ref_ = kMaxRef;
+  }
+  return this;
+}
+
+// Drops one reference; the object is destroyed when the count hits 0.
+// If the count currently lives in ref_map (ref_ == kMaxRef), the map entry
+// is decremented instead, and the count moves back into ref_ as soon as it
+// fits again.
+void Regexp::Decref() {
+  if (ref_ != kMaxRef) {
+    // Common case: plain in-object counter.
+    ref_--;
+    if (ref_ == 0)
+      Destroy();
+    return;
+  }
+
+  // Ref count is stored in overflow map.
+  MutexLock l(&ref_mutex);
+  int remaining = ref_map[this] - 1;
+  if (remaining >= kMaxRef) {
+    ref_map[this] = remaining;
+  } else {
+    ref_ = remaining;
+    ref_map.erase(this);
+  }
+}
+
+// Deletes this object; the reference count has reached 0.
+// The subtree is torn down iteratively: nodes waiting to be deleted are
+// chained through their down_ pointers, which serves as the explicit stack.
+void Regexp::Destroy() {
+  if (QuickDestroy())
+    return;
+
+  // Handle recursive Destroy with explicit stack
+  // to avoid arbitrarily deep recursion on process stack [sigh].
+  down_ = NULL;
+  Regexp* stack = this;
+  while (stack != NULL) {
+    // Pop the top node.
+    Regexp* re = stack;
+    stack = re->down_;
+    if (re->ref_ != 0)
+      LOG(DFATAL) << "Bad reference count " << re->ref_;
+    if (re->nsub_ > 0) {
+      Regexp** subs = re->sub();
+      for (int i = 0; i < re->nsub_; i++) {
+        Regexp* sub = subs[i];
+        if (sub == NULL)
+          continue;
+        // Drop re's reference to the child.  A count stored in the overflow
+        // map has to go through Decref(); otherwise the field is decremented
+        // in place so that no nested Destroy() call is made.
+        if (sub->ref_ == kMaxRef)
+          sub->Decref();
+        else
+          --sub->ref_;
+        // An unreferenced child is deleted right away if it has no
+        // sub-expressions, and pushed on the stack otherwise.
+        if (sub->ref_ == 0 && !sub->QuickDestroy()) {
+          sub->down_ = stack;
+          stack = sub;
+        }
+      }
+      // NOTE(review): presumably sub() points at separately allocated
+      // storage only when there is more than one child - confirm in AllocSub.
+      if (re->nsub_ > 1)
+        delete[] subs;
+      re->nsub_ = 0;  // the destructor logs DFATAL if nsub_ is still > 0
+    }
+    delete re;
+  }
+}
+
+// Appends rune r to this literal string.  The array starts with room for
+// 8 runes and is replaced by one twice as large whenever the current
+// length is a power of two that is at least 8.
+void Regexp::AddRuneToString(Rune r) {
+  DCHECK(op_ == kRegexpLiteralString);
+
+  const bool length_is_power_of_two = (nrunes_ & (nrunes_ - 1)) == 0;
+  if (nrunes_ == 0) {
+    // First rune: allocate the initial array.
+    runes_ = new Rune[8];
+  } else if (nrunes_ >= 8 && length_is_power_of_two) {
+    // Move the existing runes into an array of twice the size.
+    Rune* previous = runes_;
+    runes_ = new Rune[nrunes_ * 2];
+    for (int i = 0; i < nrunes_; i++)
+      runes_[i] = previous[i];
+    delete[] previous;
+  }
+
+  runes_[nrunes_] = r;
+  nrunes_++;
+}
+
+// Creates a new kRegexpHaveMatch node that stores match_id.
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
+  node->match_id_ = match_id;
+  return node;
+}
+
+// Wraps sub in a new kRegexpPlus node.  If sub already is a kRegexpPlus
+// node with the same flags, sub itself is returned and nothing is created.
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+  const bool already_wrapped =
+      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
+  if (already_wrapped)
+    return sub;
+
+  Regexp* node = new Regexp(kRegexpPlus, flags);
+  node->AllocSub(1);
+  Regexp** child = node->sub();
+  child[0] = sub;
+  return node;
+}
+
+// Wraps sub in a new kRegexpStar node.  If sub already is a kRegexpStar
+// node with the same flags, sub itself is returned and nothing is created.
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
+  const bool already_wrapped =
+      sub->op() == kRegexpStar && sub->parse_flags() == flags;
+  if (already_wrapped)
+    return sub;
+
+  Regexp* node = new Regexp(kRegexpStar, flags);
+  node->AllocSub(1);
+  Regexp** child = node->sub();
+  child[0] = sub;
+  return node;
+}
+
+// Wraps sub in a new kRegexpQuest node.  If sub already is a kRegexpQuest
+// node with the same flags, sub itself is returned and nothing is created.
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
+  const bool already_wrapped =
+      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
+  if (already_wrapped)
+    return sub;
+
+  Regexp* node = new Regexp(kRegexpQuest, flags);
+  node->AllocSub(1);
+  Regexp** child = node->sub();
+  child[0] = sub;
+  return node;
+}
+
+// Shared builder behind Concat(), Alternate() and AlternateNoFactor().
+// Combines the nsub expressions in sub[] under a node of kind "op".
+//   * A single sub-expression is returned unchanged.
+//   * For alternations with can_factor set, FactorAlternation() is run on
+//     a private copy of sub[] first, so the caller's array is not edited.
+//   * More than kMaxNsub sub-expressions are split into a two-level tree
+//     whose inner nodes hold at most kMaxNsub children each.
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+                                  ParseFlags flags, bool can_factor) {
+  if (nsub == 1)
+    return sub[0];
+
+  // Non-NULL only while working on a factored copy of the caller's array.
+  Regexp** scratch = NULL;
+  if (can_factor && op == kRegexpAlternate) {
+    scratch = new Regexp*[nsub];
+    memmove(scratch, sub, nsub * sizeof sub[0]);
+    sub = scratch;
+    nsub = FactorAlternation(sub, nsub, flags);
+    if (nsub == 1) {
+      Regexp* only = sub[0];
+      delete[] scratch;
+      return only;
+    }
+  }
+
+  Regexp* result = new Regexp(op, flags);
+  if (nsub > kMaxNsub) {
+    // Too many subexpressions to fit in a single Regexp.
+    // Make a two-level tree.  Two levels gets us to 65535^2.
+    const int nchunks = (nsub+kMaxNsub-1)/kMaxNsub;
+    result->AllocSub(nchunks);
+    Regexp** chunks = result->sub();
+    int offset = 0;
+    for (int i = 0; i < nchunks; i++) {
+      // Every chunk is full except possibly the last one.
+      int count = nsub - offset;
+      if (count > kMaxNsub)
+        count = kMaxNsub;
+      chunks[i] = ConcatOrAlternate(op, sub+offset, count, flags, false);
+      offset += count;
+    }
+  } else {
+    result->AllocSub(nsub);
+    Regexp** children = result->sub();
+    for (int i = 0; i < nsub; i++)
+      children[i] = sub[i];
+  }
+
+  delete[] scratch;
+  return result;
+}
+
+// Builds a kRegexpConcat node over sub[0..nsub); factoring is not requested.
+Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
+  const bool factor = false;
+  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, factor);
+}
+
+// Builds a kRegexpAlternate node over sub[0..nsub), allowing
+// ConcatOrAlternate() to factor the alternatives first.
+Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
+  const bool factor = true;
+  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, factor);
+}
+
+// Like Alternate(), but the alternatives are used as given: factoring is
+// switched off.
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+  const bool factor = false;
+  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, factor);
+}
+
+// Creates a kRegexpCapture node with sub as its only child and cap stored
+// as the capture index.
+Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
+  Regexp* node = new Regexp(kRegexpCapture, flags);
+  node->AllocSub(1);
+  Regexp** child = node->sub();
+  child[0] = sub;
+  node->cap_ = cap;
+  return node;
+}
+
+// Wraps sub in a new kRegexpRepeat node with the given min and max counts.
+Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
+  Regexp* rep = new Regexp(kRegexpRepeat, flags);
+  rep->AllocSub(1);
+  Regexp** slot = rep->sub();
+  slot[0] = sub;
+  rep->min_ = min;
+  rep->max_ = max;
+  return rep;
+}
+
+// Creates a kRegexpLiteral node holding the single rune.
+Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
+  Regexp* lit = new Regexp(kRegexpLiteral, flags);
+  lit->rune_ = rune;
+  return lit;
+}
+
+// Creates a regexp for the rune sequence runes[0..nrunes-1]:
+//   nrunes <= 0  -> kRegexpEmptyMatch
+//   nrunes == 1  -> a single literal (see NewLiteral)
+//   otherwise    -> kRegexpLiteralString holding every rune
+Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
+  if (nrunes > 1) {
+    Regexp* str = new Regexp(kRegexpLiteralString, flags);
+    for (Rune* r = runes; r != runes + nrunes; ++r)
+      str->AddRuneToString(*r);
+    return str;
+  }
+  if (nrunes == 1)
+    return NewLiteral(runes[0], flags);
+  return new Regexp(kRegexpEmptyMatch, flags);
+}
+
+// Creates a kRegexpCharClass node that points at cc.
+Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpCharClass, flags);
+  node->cc_ = cc;
+  return node;
+}
+
+// Exchanges the complete contents of *this and *that.
+void Regexp::Swap(Regexp* that) {
+  // A raw byte-wise exchange is fine here: Regexp is a plain struct
+  // with no vtable.
+  char scratch[sizeof(Regexp)];
+  memmove(scratch, this, sizeof scratch);
+  memmove(this, that, sizeof scratch);
+  memmove(that, scratch, sizeof scratch);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+//
+// Compares only the two nodes themselves: the operator plus whatever
+// per-operator data distinguishes one node from another (rune, flags,
+// repeat bounds, capture index and name, character class contents, ...).
+// Children are not examined; Regexp::Equal walks those.
+// Both a and b must be non-NULL.
+static bool TopEqual(Regexp* a, Regexp* b) {
+  if (a->op() != b->op())
+    return false;
+
+  switch (a->op()) {
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpBeginText:
+      // These operators carry no extra data.
+      return true;
+
+    case kRegexpEndText:
+      // The parse flags remember whether it's \z or (?-m:$),
+      // which matters when testing against PCRE.
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+    case kRegexpLiteral:
+      return a->rune() == b->rune() &&
+             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+    case kRegexpLiteralString:
+      return a->nrunes() == b->nrunes() &&
+             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+             memcmp(a->runes(), b->runes(),
+                    a->nrunes() * sizeof a->runes()[0]) == 0;
+
+    case kRegexpAlternate:
+    case kRegexpConcat:
+      return a->nsub() == b->nsub();
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+    case kRegexpRepeat:
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+             a->min() == b->min() &&
+             a->max() == b->max();
+
+    case kRegexpCapture: {
+      if (a->cap() != b->cap())
+        return false;
+      // name() returns a pointer to the group's name (NULL when the group
+      // is unnamed).  Compare the names themselves rather than the
+      // pointers: two equal names stored in different string objects
+      // must still count as equal.
+      const string* aname = a->name();
+      const string* bname = b->name();
+      if (aname == NULL || bname == NULL)
+        return aname == bname;  // Equal only if both are unnamed.
+      return *aname == *bname;
+    }
+
+    case kRegexpHaveMatch:
+      return a->match_id() == b->match_id();
+
+    case kRegexpCharClass: {
+      // Same rune count, same number of ranges, identical range contents.
+      CharClass* acc = a->cc();
+      CharClass* bcc = b->cc();
+      return acc->size() == bcc->size() &&
+             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+             memcmp(acc->begin(), bcc->begin(),
+                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+    }
+  }
+
+  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+  return false;
+}
+
+// Reports whether a and b have identical structure.
+// Two NULL pointers are equal; NULL never equals a non-NULL regexp.
+// Each pair of nodes is compared with TopEqual; the walk over
+// subexpressions uses an explicit stack instead of recursion.
+bool Regexp::Equal(Regexp* a, Regexp* b) {
+  if (a == NULL || b == NULL)
+    return a == b;
+
+  if (!TopEqual(a, b))
+    return false;
+
+  // Fast path:
+  // return without allocating vector if there are no subregexps.
+  switch (a->op()) {
+    case kRegexpAlternate:
+    case kRegexpConcat:
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+    case kRegexpRepeat:
+    case kRegexpCapture:
+      break;
+
+    default:
+      return true;
+  }
+
+  // Committed to doing real work.
+  // The stack (vector) has pairs of regexps waiting to
+  // be compared.  The regexps are only equal if
+  // all the pairs end up being equal.
+  vector<Regexp*> stk;
+
+  for (;;) {
+    // Invariant: TopEqual(a, b) == true.
+    Regexp* a2;
+    Regexp* b2;
+    switch (a->op()) {
+      default:
+        // No subexpressions: nothing more to check for this pair.
+        break;
+      case kRegexpAlternate:
+      case kRegexpConcat:
+        // TopEqual already verified that the nsub() values match, so
+        // indexing b with a's count is safe.  Children are checked with
+        // TopEqual before being pushed, which keeps the loop invariant.
+        for (int i = 0; i < a->nsub(); i++) {
+          a2 = a->sub()[i];
+          b2 = b->sub()[i];
+          if (!TopEqual(a2, b2))
+            return false;
+          stk.push_back(a2);
+          stk.push_back(b2);
+        }
+        break;
+
+      case kRegexpStar:
+      case kRegexpPlus:
+      case kRegexpQuest:
+      case kRegexpRepeat:
+      case kRegexpCapture:
+        // Exactly one child each.
+        a2 = a->sub()[0];
+        b2 = b->sub()[0];
+        if (!TopEqual(a2, b2))
+          return false;
+        // Really:
+        //   stk.push_back(a2);
+        //   stk.push_back(b2);
+        //   break;
+        // but faster to assign directly and loop.
+        a = a2;
+        b = b2;
+        continue;
+    }
+
+    // Pop the next pending pair; an empty stack means every pair matched.
+    int n = stk.size();
+    if (n == 0)
+      break;
+
+    a = stk[n-2];
+    b = stk[n-1];
+    stk.resize(n-2);
+  }
+
+  return true;
+}
+
+// Keep in sync with enum RegexpStatusCode in regexp.h
+// Indexed by RegexpStatusCode value (see RegexpStatus::CodeText), so the
+// order of the entries must match the order of the enumerators.
+static const string kErrorStrings[] = {
+  "no error",  // kRegexpSuccess
+  "unexpected error",  // kRegexpInternalError
+  "invalid escape sequence",  // kRegexpBadEscape
+  "invalid character class",  // kRegexpBadCharClass
+  "invalid character class range",  // kRegexpBadCharRange
+  "missing ]",  // kRegexpMissingBracket
+  "missing )",  // kRegexpMissingParen
+  "trailing \\",  // kRegexpTrailingBackslash
+  "no argument for repetition operator",  // kRegexpRepeatArgument
+  "invalid repetition size",  // kRegexpRepeatSize
+  "bad repetition operator",  // kRegexpRepeatOp
+  "invalid perl operator",  // kRegexpBadPerlOp
+  "invalid UTF-8",  // kRegexpBadUTF8
+  "invalid named capture group",  // kRegexpBadNamedCapture
+};
+
+// Returns the fixed message for code.  A code outside the range of the
+// kErrorStrings table yields the kRegexpInternalError message.
+const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
+  const bool out_of_range = code < 0 || code >= arraysize(kErrorStrings);
+  if (out_of_range)
+    return kErrorStrings[kRegexpInternalError];
+  return kErrorStrings[code];
+}
+
+string RegexpStatus::Text() const {
+  if (error_arg_.empty())
+    return CodeText(code_);
+  string s;
+  s.append(CodeText(code_));
+  s.append(": ");
+  s.append(error_arg_.data(), error_arg_.size());
+  return s;
+}
+
+void RegexpStatus::Copy(const RegexpStatus& status) {
+  code_ = status.code_;
+  error_arg_ = status.error_arg_;
+}
+
+typedef int Ignored;  // Walker<void> doesn't exist
+
+// Walker subclass that tallies the kRegexpCapture nodes it visits.
+class NumCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+  NumCapturesWalker() : ncapture_(0) {}
+
+  // Number of capture nodes seen so far.
+  int ncapture() { return ncapture_; }
+
+  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    const bool is_capture = (re->op() == kRegexpCapture);
+    if (is_capture)
+      ++ncapture_;
+    return ignored;
+  }
+
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    // Not expected to run: callers use Walk, not WalkExponential.
+    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
+    return ignored;
+  }
+
+ private:
+  int ncapture_;
+  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
+};
+
+// Counts the capture nodes in this regexp by walking it with a
+// NumCapturesWalker.
+int Regexp::NumCaptures() {
+  NumCapturesWalker counter;
+  counter.Walk(this, 0);
+  return counter.ncapture();
+}
+
+// Walker that collects a name -> capture index map for the named
+// capture groups it visits.  The map is only allocated once a named
+// group is found, so TakeMap() returns NULL when there are none.
+class NamedCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+  NamedCapturesWalker() : map_(NULL) {}
+  ~NamedCapturesWalker() { delete map_; }
+
+  // Hands the map (possibly NULL) over to the caller, who then owns it.
+  map<string, int>* TakeMap() {
+    map<string, int>* taken = map_;
+    map_ = NULL;
+    return taken;
+  }
+
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    if (re->op() != kRegexpCapture || re->name() == NULL)
+      return ignored;
+
+    // First named group: create the map lazily.
+    if (map_ == NULL)
+      map_ = new map<string, int>;
+
+    // Only the first (leftmost) occurrence of a name is recorded;
+    // insert() leaves an existing entry untouched.
+    map_->insert(map<string, int>::value_type(*re->name(), re->cap()));
+    return ignored;
+  }
+
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    // Not expected to run: callers use Walk, not WalkExponential.
+    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
+    return ignored;
+  }
+
+ private:
+  map<string, int>* map_;
+  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
+};
+
+// Walks this regexp with a NamedCapturesWalker and returns the map it
+// built (NULL if no named group was seen).
+map<string, int>* Regexp::NamedCaptures() {
+  NamedCapturesWalker collector;
+  collector.Walk(this, 0);
+  return collector.TakeMap();
+}
+
+// Walker that collects a capture index -> name map for the named
+// capture groups it visits.  The map is only allocated once a named
+// group is found, so TakeMap() returns NULL when there are none.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+  CaptureNamesWalker() : map_(NULL) {}
+  ~CaptureNamesWalker() { delete map_; }
+
+  // Hands the map (possibly NULL) over to the caller, who then owns it.
+  map<int, string>* TakeMap() {
+    map<int, string>* taken = map_;
+    map_ = NULL;
+    return taken;
+  }
+
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    if (re->op() != kRegexpCapture || re->name() == NULL)
+      return ignored;
+
+    // First named group: create the map lazily.
+    if (map_ == NULL)
+      map_ = new map<int, string>;
+
+    const int index = re->cap();
+    (*map_)[index] = *re->name();
+    return ignored;
+  }
+
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    // Not expected to run: callers use Walk, not WalkExponential.
+    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+    return ignored;
+  }
+
+ private:
+  map<int, string>* map_;
+  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+};
+
+// Walks this regexp with a CaptureNamesWalker and returns the map it
+// built (NULL if no named group was seen).
+map<int, string>* Regexp::CaptureNames() {
+  CaptureNamesWalker collector;
+  collector.Walk(this, 0);
+  return collector.TakeMap();
+}
+
+// Determines whether regexp matches must be anchored
+// with a fixed string prefix.  If so, returns the prefix and
+// the regexp that remains after the prefix.  The prefix might
+// be ASCII case-insensitive.
+//
+// Outputs are always reset first: *prefix is cleared, *foldcase is set
+// to false and *suffix to NULL.  On success (return value true):
+//   *prefix   - the literal text, in Latin-1 or UTF-8 according to the
+//               literal's parse flags
+//   *foldcase - whether the literal has the FoldCase flag
+//   *suffix   - a newly created regexp for whatever follows the literal
+//               (kRegexpEmptyMatch if nothing follows)
+bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+  // No need for a walker: the regexp must be of the form
+  // 1. some number of ^ anchors
+  // 2. a literal char or string
+  // 3. the rest
+  prefix->clear();
+  *foldcase = false;
+  *suffix = NULL;
+  if (op_ != kRegexpConcat)
+    return false;
+
+  // Some number of anchors, then a literal or concatenation.
+  int i = 0;
+  Regexp** sub = this->sub();
+  while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
+    i++;
+  // Need at least one anchor, and something after the anchors.
+  if (i == 0 || i >= nsub_)
+    return false;
+
+  Regexp* re = sub[i];
+  switch (re->op_) {
+    default:
+      return false;
+
+    case kRegexpLiteralString:
+      // Convert to string in proper encoding.
+      if (re->parse_flags() & Latin1) {
+        // One byte per rune.
+        prefix->resize(re->nrunes_);
+        for (int j = 0; j < re->nrunes_; j++)
+          (*prefix)[j] = re->runes_[j];
+      } else {
+        // Convert to UTF-8 in place.
+        // Assume worst-case space and then trim.
+        prefix->resize(re->nrunes_ * UTFmax);
+        char *p = &(*prefix)[0];
+        for (int j = 0; j < re->nrunes_; j++) {
+          Rune r = re->runes_[j];
+          if (r < Runeself)
+            *p++ = r;
+          else
+            p += runetochar(p, &r);
+        }
+        prefix->resize(p - &(*prefix)[0]);
+      }
+      break;
+
+    case kRegexpLiteral:
+      if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
+        // Fits in a single byte.
+        prefix->append(1, re->rune_);
+      } else {
+        char buf[UTFmax];
+        prefix->append(buf, runetochar(buf, &re->rune_));
+      }
+      break;
+  }
+  *foldcase = (sub[i]->parse_flags() & FoldCase);
+  i++;
+
+  // The rest.
+  if (i < nsub_) {
+    // Take a reference on each remaining piece before handing them to
+    // Concat.
+    for (int j = i; j < nsub_; j++)
+      sub[j]->Incref();
+    re = Concat(sub + i, nsub_ - i, parse_flags());
+  } else {
+    re = new Regexp(kRegexpEmptyMatch, parse_flags());
+  }
+  *suffix = re;
+  return true;
+}
+
+// Character class builder is a balanced binary tree (STL set)
+// containing non-overlapping, non-abutting RuneRanges.
+// The less-than operator used in the tree treats two
+// ranges as equal if they overlap at all, so that
+// lookups for a particular Rune are possible.
+
+// Starts out as the empty class: no runes and no ASCII letters recorded.
+CharClassBuilder::CharClassBuilder() {
+  upper_ = 0;
+  lower_ = 0;
+  nrunes_ = 0;
+}
+
+// Add lo-hi to the class; return whether class got bigger.
+// Returns false without changing the set of ranges when hi < lo or when
+// [lo, hi] already lies inside a single existing range.  Otherwise any
+// abutting or overlapping ranges are merged into one new range and
+// nrunes_ is adjusted to match.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+  if (hi < lo)
+    return false;
+
+  if (lo <= 'z' && hi >= 'A') {
+    // Overlaps some alpha, maybe not all.
+    // Update bitmaps telling which ASCII letters are in the set.
+    // Bit k of upper_ stands for 'A'+k; bit k of lower_ for 'a'+k.
+    Rune lo1 = max<Rune>(lo, 'A');
+    Rune hi1 = min<Rune>(hi, 'Z');
+    if (lo1 <= hi1)
+      upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
+
+    lo1 = max<Rune>(lo, 'a');
+    hi1 = min<Rune>(hi, 'z');
+    if (lo1 <= hi1)
+      lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
+  }
+
+  {  // Check whether lo, hi is already in the class.
+    // find() returns a range overlapping the probe, i.e. the one
+    // containing lo, if there is one.
+    iterator it = ranges_.find(RuneRange(lo, lo));
+    if (it != end() && it->lo <= lo && hi <= it->hi)
+      return false;
+  }
+
+  // Look for a range abutting lo on the left.
+  // If it exists, take it out and increase our range.
+  if (lo > 0) {
+    iterator it = ranges_.find(RuneRange(lo-1, lo-1));
+    if (it != end()) {
+      lo = it->lo;
+      if (it->hi > hi)
+        hi = it->hi;
+      nrunes_ -= it->hi - it->lo + 1;
+      ranges_.erase(it);
+    }
+  }
+
+  // Look for a range abutting hi on the right.
+  // If it exists, take it out and increase our range.
+  if (hi < Runemax) {
+    iterator it = ranges_.find(RuneRange(hi+1, hi+1));
+    if (it != end()) {
+      hi = it->hi;
+      nrunes_ -= it->hi - it->lo + 1;
+      ranges_.erase(it);
+    }
+  }
+
+  // Look for ranges between lo and hi.  Take them out.
+  // This is only safe because the set has no overlapping ranges.
+  // We've already removed any ranges abutting lo and hi, so
+  // any that overlap [lo, hi] must be contained within it.
+  for (;;) {
+    iterator it = ranges_.find(RuneRange(lo, hi));
+    if (it == end())
+      break;
+    nrunes_ -= it->hi - it->lo + 1;
+    ranges_.erase(it);
+  }
+
+  // Finally, add [lo, hi].
+  nrunes_ += hi - lo + 1;
+  ranges_.insert(RuneRange(lo, hi));
+  return true;
+}
+
+// Adds every range of cc to this class, one AddRange call per range.
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+  iterator cur = cc->begin();
+  while (cur != cc->end()) {
+    AddRange(cur->lo, cur->hi);
+    ++cur;
+  }
+}
+
+// Reports whether some range in the class covers r.
+bool CharClassBuilder::Contains(Rune r) {
+  // A one-rune probe compares equal to any range that overlaps it.
+  const RuneRange probe(r, r);
+  return !(ranges_.find(probe) == end());
+}
+
+// True when the class holds exactly the same letters in A-Z as in a-z,
+// judged by the upper_ and lower_ bitmaps.
+bool CharClassBuilder::FoldsASCII() {
+  return (upper_ & AlphaMask) == (lower_ & AlphaMask);
+}
+
+// Returns a newly allocated builder holding the same ranges, letter
+// bitmaps and rune count as this one.
+CharClassBuilder* CharClassBuilder::Copy() {
+  CharClassBuilder* dup = new CharClassBuilder;
+  dup->nrunes_ = nrunes_;
+  dup->upper_ = upper_;
+  dup->lower_ = lower_;
+  iterator cur = begin();
+  while (cur != end()) {
+    dup->ranges_.insert(RuneRange(cur->lo, cur->hi));
+    ++cur;
+  }
+  return dup;
+}
+
+
+
+// Removes every rune greater than r from the class.
+// Nothing to do when r >= Runemax.
+void CharClassBuilder::RemoveAbove(Rune r) {
+  if (r >= Runemax)
+    return;
+
+  // Trim the a-z bitmap: drop every letter when r is below 'a',
+  // otherwise keep only the bits for 'a'..r.
+  if (r < 'z') {
+    if (r < 'a')
+      lower_ = 0;
+    else
+      lower_ &= AlphaMask >> ('z' - r);
+  }
+
+  // Same for the A-Z bitmap.
+  if (r < 'Z') {
+    if (r < 'A')
+      upper_ = 0;
+    else
+      upper_ &= AlphaMask >> ('Z' - r);
+  }
+
+  // Repeatedly take out a range overlapping [r+1, Runemax].  A range
+  // that straddles r is put back truncated to end at r; after that it
+  // no longer overlaps the probe, so the loop terminates.
+  for (;;) {
+
+    iterator it = ranges_.find(RuneRange(r + 1, Runemax));
+    if (it == end())
+      break;
+    RuneRange rr = *it;
+    ranges_.erase(it);
+    nrunes_ -= rr.hi - rr.lo + 1;
+    if (rr.lo <= r) {
+      rr.hi = r;
+      ranges_.insert(rr);
+      nrunes_ += rr.hi - rr.lo + 1;
+    }
+  }
+}
+
+// Replaces the class with its complement over [0, Runemax].
+void CharClassBuilder::Negate() {
+  // The complement is collected separately and copied in afterwards,
+  // because the ranges cannot be edited in place.
+  vector<RuneRange> gaps;
+  gaps.reserve(ranges_.size() + 1);
+
+  iterator cur = begin();
+  if (cur != end()) {
+    // The first gap starts at 0 unless the class itself starts there.
+    int gap_lo = 0;
+    if (cur->lo == 0) {
+      gap_lo = cur->hi + 1;
+      ++cur;
+    }
+    while (cur != end()) {
+      gaps.push_back(RuneRange(gap_lo, cur->lo - 1));
+      gap_lo = cur->hi + 1;
+      ++cur;
+    }
+    if (gap_lo <= Runemax)
+      gaps.push_back(RuneRange(gap_lo, Runemax));
+  } else {
+    // Empty class: the complement is everything.
+    gaps.push_back(RuneRange(0, Runemax));
+  }
+
+  ranges_.clear();
+  for (vector<RuneRange>::iterator g = gaps.begin(); g != gaps.end(); ++g)
+    ranges_.insert(*g);
+
+  nrunes_ = Runemax+1 - nrunes_;
+  upper_ = AlphaMask & ~upper_;
+  lower_ = AlphaMask & ~lower_;
+}
+
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
+// Allocates an empty CharClass with room for maxranges ranges.
+// Header and range array share one uint8 block: the ranges start
+// immediately after the header.
+CharClass* CharClass::New(int maxranges) {
+  uint8* block = new uint8[sizeof(CharClass) + maxranges*sizeof(RuneRange)];
+  CharClass* cc = reinterpret_cast<CharClass*>(block);
+  cc->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof(CharClass));
+  cc->nranges_ = 0;
+  cc->nrunes_ = 0;
+  cc->folds_ascii_ = false;
+  return cc;
+}
+
+// Releases a CharClass created by CharClass::New.  The object lives in a
+// uint8 array, so it is freed as one rather than through a destructor.
+void CharClass::Delete() {
+  // NOTE(review): this guard is evidently meant to make Delete() on a
+  // NULL pointer a no-op, but calling a member function through a null
+  // pointer is undefined behavior and compilers may drop the check.
+  // Callers should test for NULL themselves -- confirm before removing.
+  if (this == NULL)
+    return;
+  uint8 *data = reinterpret_cast<uint8*>(this);
+  delete[] data;
+}
+
+// Returns a newly allocated CharClass holding the complement of this
+// class over [0, Runemax].  folds_ascii_ is carried over unchanged.
+CharClass* CharClass::Negate() {
+  // The complement has at most one more range than the original.
+  CharClass* neg = CharClass::New(nranges_+1);
+  neg->folds_ascii_ = folds_ascii_;
+  neg->nrunes_ = Runemax + 1 - nrunes_;
+
+  int count = 0;
+  int gap_lo = 0;  // Start of the gap before the next range.
+  for (CharClass::iterator cur = begin(); cur != end(); ++cur) {
+    // Emit the gap before this range unless the range starts right at it.
+    if (cur->lo != gap_lo)
+      neg->ranges_[count++] = RuneRange(gap_lo, cur->lo - 1);
+    gap_lo = cur->hi + 1;
+  }
+  if (gap_lo <= Runemax)
+    neg->ranges_[count++] = RuneRange(gap_lo, Runemax);
+  neg->nranges_ = count;
+  return neg;
+}
+
+// Reports whether r falls inside one of the ranges, using a binary
+// search over the sorted range array.
+bool CharClass::Contains(Rune r) {
+  int first = 0;
+  int limit = nranges_;  // Search window is [first, limit).
+  while (first < limit) {
+    const int mid = first + (limit - first)/2;
+    const RuneRange& probe = ranges_[mid];
+    if (probe.hi < r)
+      first = mid + 1;
+    else if (r < probe.lo)
+      limit = mid;
+    else  // probe.lo <= r && r <= probe.hi
+      return true;
+  }
+  return false;
+}
+
+// Builds a newly allocated CharClass holding the builder's ranges (in
+// set order), rune count and FoldsASCII() result.
+CharClass* CharClassBuilder::GetCharClass() {
+  CharClass* cc = CharClass::New(ranges_.size());
+  int n = 0;
+  iterator cur = begin();
+  while (cur != end()) {
+    cc->ranges_[n] = *cur;
+    ++n;
+    ++cur;
+  }
+  cc->nranges_ = n;
+  DCHECK_LE(n, ranges_.size());
+  cc->folds_ascii_ = FoldsASCII();
+  cc->nrunes_ = nrunes_;
+  return cc;
+}
+
+}  // namespace re2
--- a/re2/re2/regexp.h
+++ b/re2/re2/regexp.h
@ -0,0 +1,632 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// --- SPONSORED LINK --------------------------------------------------
+// If you want to use this library for regular expression matching,
+// you should use re2/re2.h, which provides a class RE2 that
+// mimics the PCRE interface provided by PCRE's C++ wrappers.
+// This header describes the low-level interface used to implement RE2
+// and may change in backwards-incompatible ways from time to time.
+// In contrast, RE2's interface will not.
+// ---------------------------------------------------------------------
+
+// Regular expression library: parsing, execution, and manipulation
+// of regular expressions.
+//
+// Any operation that traverses the Regexp structures should be written
+// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
+// regular expressions such as x++++++++++++++++++++... might cause recursive
+// traversals to overflow the stack.
+//
+// It is the caller's responsibility to provide appropriate mutual exclusion
+// around manipulation of the regexps.  RE2 does this.
+//
+// PARSING
+//
+// Regexp::Parse parses regular expressions encoded in UTF-8.
+// The default syntax is POSIX extended regular expressions,
+// with the following changes:
+//
+//   1.  Backreferences (optional in POSIX EREs) are not supported.
+//         (Supporting them precludes the use of DFA-based
+//          matching engines.)
+//
+//   2.  Collating elements and collation classes are not supported.
+//         (No one has needed or wanted them.)
+//
+// The exact syntax accepted can be modified by passing flags to
+// Regexp::Parse.  In particular, many of the basic Perl additions
+// are available.  The flags are documented below (search for LikePerl).
+//
+// If parsed with the flag Regexp::Latin1, both the regular expression
+// and the input to the matching routines are assumed to be encoded in
+// Latin-1, not UTF-8.
+//
+// EXECUTION
+//
+// Once Regexp has parsed a regular expression, it provides methods
+// to search text using that regular expression.  These methods are
+// implemented via calling out to other regular expression libraries.
+// (Let's call them the sublibraries.)
+//
+// To call a sublibrary, Regexp does not simply prepare a
+// string version of the regular expression and hand it to the
+// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
+// corresponding internal representation used by the sublibrary.
+// This has the drawback of needing to know the internal representation
+// used by the sublibrary, but it has two important benefits:
+//
+//   1. The syntax and meaning of regular expressions is guaranteed
+//      to be that used by Regexp's parser, not the syntax expected
+//      by the sublibrary.  Regexp might accept a restricted or
+//      expanded syntax for regular expressions as compared with
+//      the sublibrary.  As long as Regexp can translate from its
+//      internal form into the sublibrary's, clients need not know
+//      exactly which sublibrary they are using.
+//
+//   2. The sublibrary parsers are bypassed.  For whatever reason,
+//      sublibrary regular expression parsers often have security
+//      problems.  For example, plan9grep's regular expression parser
+//      has a buffer overflow in its handling of large character
+//      classes, and PCRE's parser has had buffer overflow problems
+//      in the past.  Security-team requires sandboxing of sublibrary
+//      regular expression parsers.  Avoiding the sublibrary parsers
+//      avoids the sandbox.
+//
+// The execution methods we use now are provided by the compiled form,
+// Prog, described in prog.h
+//
+// MANIPULATION
+//
+// Unlike other regular expression libraries, Regexp makes its parsed
+// form accessible to clients, so that client code can analyze the
+// parsed regular expressions.
+
+#ifndef RE2_REGEXP_H__
+#define RE2_REGEXP_H__
+
+#include "util/util.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Operator of a Regexp node.  Values start at 1 and are consecutive.
+// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
+enum RegexpOp {
+  // Matches no strings.
+  kRegexpNoMatch = 1,
+
+  // Matches empty string.
+  kRegexpEmptyMatch,
+
+  // Matches rune_.
+  kRegexpLiteral,
+
+  // Matches runes_.
+  kRegexpLiteralString,
+
+  // Matches concatenation of sub_[0..nsub-1].
+  kRegexpConcat,
+  // Matches union of sub_[0..nsub-1].
+  kRegexpAlternate,
+
+  // Matches sub_[0] zero or more times.
+  kRegexpStar,
+  // Matches sub_[0] one or more times.
+  kRegexpPlus,
+  // Matches sub_[0] zero or one times.
+  kRegexpQuest,
+
+  // Matches sub_[0] at least min_ times, at most max_ times.
+  // max_ == -1 means no upper limit.
+  kRegexpRepeat,
+
+  // Parenthesized (capturing) subexpression.  Index is cap_.
+  // Optionally, capturing name is name_.
+  kRegexpCapture,
+
+  // Matches any character.
+  kRegexpAnyChar,
+
+  // Matches any byte [sic].
+  kRegexpAnyByte,
+
+  // Matches empty string at beginning of line.
+  kRegexpBeginLine,
+  // Matches empty string at end of line.
+  kRegexpEndLine,
+
+  // Matches word boundary "\b".
+  kRegexpWordBoundary,
+  // Matches not-a-word boundary "\B".
+  kRegexpNoWordBoundary,
+
+  // Matches empty string at beginning of text.
+  kRegexpBeginText,
+  // Matches empty string at end of text.
+  kRegexpEndText,
+
+  // Matches character class given by cc_.
+  kRegexpCharClass,
+
+  // Forces match of entire expression right now,
+  // with match ID match_id_ (used by RE2::Set).
+  kRegexpHaveMatch,
+
+  // Alias for the last operator above; update when adding operators.
+  kMaxRegexpOp = kRegexpHaveMatch,
+};
+
+// Status codes reported through RegexpStatus.
+// Keep in sync with string list in regexp.cc (kErrorStrings[]), which
+// is indexed by these values: add new codes to both, in the same order.
+enum RegexpStatusCode {
+  // No error
+  kRegexpSuccess = 0,
+
+  // Unexpected error
+  kRegexpInternalError,
+
+  // Parse errors
+  kRegexpBadEscape,          // bad escape sequence
+  kRegexpBadCharClass,       // bad character class
+  kRegexpBadCharRange,       // bad character class range
+  kRegexpMissingBracket,     // missing closing ]
+  kRegexpMissingParen,       // missing closing )
+  kRegexpTrailingBackslash,  // at end of regexp
+  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
+  kRegexpRepeatSize,         // bad repetition argument
+  kRegexpRepeatOp,           // bad repetition operator
+  kRegexpBadPerlOp,          // bad perl operator
+  kRegexpBadUTF8,            // invalid UTF-8 in regexp
+  kRegexpBadNamedCapture,    // bad named capture
+};
+
+// Error status for certain operations.
+// Holds a status code plus the piece of the regexp the error refers to.
+class RegexpStatus {
+ public:
+  RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
+  ~RegexpStatus() { delete tmp_; }
+
+  void set_code(enum RegexpStatusCode code) { code_ = code; }
+  void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
+  // Takes ownership of tmp; any previously held string is deleted.
+  void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
+  enum RegexpStatusCode code() const { return code_; }
+  const StringPiece& error_arg() const { return error_arg_; }
+  bool ok() const { return code() == kRegexpSuccess; }
+
+  // Copies state from status.
+  void Copy(const RegexpStatus& status);
+
+  // Returns text equivalent of code, e.g.:
+  //   "invalid character class"
+  static const string& CodeText(enum RegexpStatusCode code);
+
+  // Returns text describing error: the code text, followed by ": " and
+  // the error argument when one is set, i.e.:
+  //   "<code text>: <error_arg>"
+  string Text() const;
+
+ private:
+  enum RegexpStatusCode code_;  // Kind of error
+  StringPiece error_arg_;       // Piece of regexp containing syntax error.
+  string* tmp_;                 // Temporary storage, possibly where error_arg_ is.
+
+  DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
+};
+
+// Walker to implement Simplify.
+class SimplifyWalker;
+
+// Compiled form; see prog.h
+class Prog;
+
+// A range of runes lo..hi; both endpoints are part of the range.
+struct RuneRange {
+  RuneRange() : lo(0), hi(0) { }
+  RuneRange(int l, int h) : lo(l), hi(h) { }
+  Rune lo;  // First rune in the range.
+  Rune hi;  // Last rune in the range.
+};
+
+// Ordering for RuneRanges: a sorts before b only when a lies entirely
+// below b.  Ranges that overlap at all therefore compare as equivalent,
+// which lets a set lookup find the range covering a particular Rune.
+struct RuneRangeLess {
+  bool operator()(const RuneRange& a, const RuneRange& b) const {
+    return b.lo > a.hi;
+  }
+};
+
+class CharClassBuilder;
+
+// Character class: an array of RuneRanges plus summary data.
+// Instances are created through CharClass::New (private; CharClassBuilder
+// is a friend) and released with Delete(); the constructor and
+// destructor are declared but not implemented.
+class CharClass {
+ public:
+  // Frees this object; use instead of delete.
+  void Delete();
+
+  // Iteration over the ranges.
+  typedef RuneRange* iterator;
+  iterator begin() { return ranges_; }
+  iterator end() { return ranges_ + nranges_; }
+
+  // size() is the stored rune count (nrunes_), not the number of ranges.
+  int size() { return nrunes_; }
+  bool empty() { return nrunes_ == 0; }
+  bool full() { return nrunes_ == Runemax+1; }
+  bool FoldsASCII() { return folds_ascii_; }
+
+  // Whether r lies within one of the ranges.
+  bool Contains(Rune r);
+  // Returns a newly allocated complement of this class.
+  CharClass* Negate();
+
+ private:
+  CharClass();  // not implemented
+  ~CharClass();  // not implemented
+  static CharClass* New(int maxranges);
+
+  friend class CharClassBuilder;
+
+  bool folds_ascii_;   // Value reported by FoldsASCII().
+  int nrunes_;         // Rune count reported by size().
+  RuneRange *ranges_;  // Range array; begin() points here.
+  int nranges_;        // Number of entries in ranges_.
+  DISALLOW_EVIL_CONSTRUCTORS(CharClass);
+};
+
+class Regexp {
+ public:
+
+  // Flags for parsing.  Can be ORed together.
+  enum ParseFlags {
+    NoParseFlags = 0,
+    FoldCase     = 1<<0,   // Fold case during matching (case-insensitive).
+    Literal      = 1<<1,   // Treat s as literal string instead of a regexp.
+    ClassNL      = 1<<2,   // Allow char classes like [^a-z] and \D and \s
+                           // and [[:space:]] to match newline.
+    DotNL        = 1<<3,   // Allow . to match newline.
+    MatchNL      = ClassNL | DotNL,
+    OneLine      = 1<<4,   // Treat ^ and $ as only matching at beginning and
+                           // end of text, not around embedded newlines.
+                           // (Perl's default)
+    Latin1       = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
+    NonGreedy    = 1<<6,   // Repetition operators are non-greedy by default.
+    PerlClasses  = 1<<7,   // Allow Perl character classes like \d.
+    PerlB        = 1<<8,   // Allow Perl's \b and \B.
+    PerlX        = 1<<9,   // Perl extensions:
+                           //   non-capturing parens - (?: )
+                           //   non-greedy operators - *? +? ?? {}?
+                           //   flag edits - (?i) (?-i) (?i: )
+                           //     i - FoldCase
+                           //     m - !OneLine
+                           //     s - DotNL
+                           //     U - NonGreedy
+                           //   line ends: \A \z
+                           //   \Q and \E to disable/enable metacharacters
+                           //   (?P<name>expr) for named captures
+                           //   \C to match any single byte
+    UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
+                           //   and \P{Han} for its negation.
+    NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
+                           //   it explicitly.
+
+    // As close to Perl as we can get.
+    LikePerl     = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
+                   UnicodeGroups,
+
+    // Internal use only.
+    WasDollar    = 1<<15,  // on kRegexpEndText: was $ in regexp text
+  };
+
+  // Get.  No set, Regexps are logically immutable once created.
+  RegexpOp op() { return static_cast<RegexpOp>(op_); }
+  int nsub() { return nsub_; }
+  bool simple() { return simple_; }
+  enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
+  int Ref();  // For testing.
+
+  Regexp** sub() {
+    if(nsub_ <= 1)
+      return &subone_;
+    else
+      return submany_;
+  }
+
+  int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
+  int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
+  Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
+  CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
+  int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
+  const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
+  Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
+  int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
+
+  // Increments reference count, returns object as convenience.
+  Regexp* Incref();
+
+  // Decrements reference count and deletes this object if count reaches 0.
+  void Decref();
+
+  // Parses string s to produce regular expression, returned.
+  // Caller must release return value with re->Decref().
+  // On failure, sets *status (if status != NULL) and returns NULL.
+  static Regexp* Parse(const StringPiece& s, ParseFlags flags,
+                       RegexpStatus* status);
+
+  // Returns a _new_ simplified version of the current regexp.
+  // Does not edit the current regexp.
+  // Caller must release return value with re->Decref().
+  // Simplified means that counted repetition has been rewritten
+  // into simpler terms and all Perl/POSIX features have been
+  // removed.  The result will capture exactly the same
+  // subexpressions the original did, unless formatted with ToString.
+  Regexp* Simplify();
+  friend class SimplifyWalker;
+
+  // Parses the regexp src and then simplifies it and sets *dst to the
+  // string representation of the simplified form.  Returns true on success.
+  // Returns false and sets *status (if status != NULL) on parse error.
+  static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+                             string* dst,
+                             RegexpStatus* status);
+
+  // Returns the number of capturing groups in the regexp.
+  int NumCaptures();
+  friend class NumCapturesWalker;
+
+  // Returns a map from names to capturing group indices,
+  // or NULL if the regexp contains no named capture groups.
+  // The caller is responsible for deleting the map.
+  map<string, int>* NamedCaptures();
+
+  // Returns a map from capturing group indices to capturing group
+  // names or NULL if the regexp contains no named capture groups. The
+  // caller is responsible for deleting the map.
+  map<int, string>* CaptureNames();
+
+  // Returns a string representation of the current regexp,
+  // using as few parentheses as possible.
+  string ToString();
+
+  // Convenience functions.  They consume the passed reference,
+  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
+  // They do not consume allocated arrays like subs or runes.
+  static Regexp* Plus(Regexp* sub, ParseFlags flags);
+  static Regexp* Star(Regexp* sub, ParseFlags flags);
+  static Regexp* Quest(Regexp* sub, ParseFlags flags);
+  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
+  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
+  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
+  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
+  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+  static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+  // Like Alternate but does not factor out common prefixes.
+  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
+
+  // Debugging function.  Returns string format for regexp
+  // that makes structure clear.  Does NOT use regexp syntax.
+  string Dump();
+
+  // Helper traversal class, defined fully in walker-inl.h.
+  template<typename T> class Walker;
+
+  // Compile to Prog.  See prog.h
+  // Reverse prog expects to be run over text backward.
+  // Construction and execution of prog will
+  // stay within approximately max_mem bytes of memory.
+  // If max_mem <= 0, a reasonable default is used.
+  Prog* CompileToProg(int64 max_mem);
+  Prog* CompileToReverseProg(int64 max_mem);
+
+  // Whether to expect this library to find exactly the same answer as PCRE
+  // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
+  // obscure cases behave differently.  Technically this is more a property
+  // of the Prog than the Regexp, but the computation is much easier to do
+  // on the Regexp.  See mimics_pcre.cc for the exact conditions.
+  bool MimicsPCRE();
+
+  // Benchmarking function.
+  void NullWalk();
+
+  // Whether every match of this regexp must be anchored and
+  // begin with a non-empty fixed string (perhaps after ASCII
+  // case-folding).  If so, returns the prefix and the sub-regexp that
+  // follows it.
+  bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
+
+ private:
+  // Constructor allocates vectors as appropriate for operator.
+  explicit Regexp(RegexpOp op, ParseFlags parse_flags);
+
+  // Use Decref() instead of delete to release Regexps.
+  // This is private to catch deletes at compile time.
+  ~Regexp();
+  void Destroy();
+  bool QuickDestroy();
+
+  // Helpers for Parse.  Listed here so they can edit Regexps.
+  class ParseState;
+  friend class ParseState;
+  friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
+                             RegexpStatus* status);
+
+  // Helper for testing [sic].
+  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
+  // Computes whether Regexp is already simple.
+  bool ComputeSimple();
+
+  // Constructor that generates a concatenation or alternation,
+  // enforcing the limit on the number of subexpressions for
+  // a particular Regexp.
+  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+                                   ParseFlags flags, bool can_factor);
+
+  // Returns the leading string that re starts with.
+  // The returned Rune* points into a piece of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+  // Removes the first n leading runes from the beginning of re.
+  // Edits re in place.
+  static void RemoveLeadingString(Regexp* re, int n);
+
+  // Returns the leading regexp in re's top-level concatenation.
+  // The returned Regexp* points at re or a sub-expression of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Regexp* LeadingRegexp(Regexp* re);
+
+  // Removes LeadingRegexp(re) from re and returns the remainder.
+  // Might edit re in place.
+  static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+  // Simplifies an alternation of literal strings by factoring out
+  // common prefixes.
+  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+  static int FactorAlternationRecursive(Regexp** sub, int nsub,
+                                        ParseFlags flags, int maxdepth);
+
+  // Is a == b?  Only efficient on regexps that have not been through
+  // Simplify yet - the expansion of a kRegexpRepeat will make this
+  // take a long time.  Do not call on such regexps, hence private.
+  static bool Equal(Regexp* a, Regexp* b);
+
+  // Records that this regexp has n sub-expressions.  n must be
+  // non-negative and must survive a round trip through uint16 (the type
+  // of nsub_); any other value is reported through LOG(FATAL).  The
+  // submany_ array is allocated only when n > 1; for smaller n nothing
+  // is allocated here.
+  void AllocSub(int n) {
+    const bool representable = n >= 0 && static_cast<uint16>(n) == n;
+    if (!representable)
+      LOG(FATAL) << "Cannot AllocSub " << n;
+    if (n >= 2)
+      submany_ = new Regexp*[n];
+    nsub_ = n;
+  }
+
+  // Add Rune to LiteralString
+  void AddRuneToString(Rune r);
+
+  // Swaps this with that, in place.
+  void Swap(Regexp *that);
+
+  // Operator.  See description of operators above.
+  // uint8 instead of RegexpOp to control space usage.
+  uint8 op_;
+
+  // Is this regexp structure already simple
+  // (has it been returned by Simplify)?
+  // uint8 instead of bool to control space usage.
+  uint8 simple_;
+
+  // Flags saved from parsing and used during execution.
+  // (Only FoldCase is used.)
+  // uint16 instead of ParseFlags to control space usage.
+  uint16 parse_flags_;
+
+  // Reference count.  Exists so that SimplifyRegexp can build
+  // regexp structures that are dags rather than trees to avoid
+  // exponential blowup in space requirements.
+  // uint16 to control space usage.
+  // The standard regexp routines will never generate a
+  // ref greater than the maximum repeat count (100),
+  // but even so, Incref and Decref consult an overflow map
+  // when ref_ reaches kMaxRef.
+  uint16 ref_;
+  static const uint16 kMaxRef = 0xffff;
+
+  // Subexpressions.
+  // uint16 to control space usage.
+  // Concat and Alternate handle larger numbers of subexpressions
+  // by building concatenation or alternation trees.
+  // Other routines should call Concat or Alternate instead of
+  // filling in sub() by hand.
+  uint16 nsub_;
+  static const uint16 kMaxNsub = 0xffff;
+  union {
+    Regexp** submany_;  // if nsub_ > 1
+    Regexp* subone_;  // if nsub_ == 1
+  };
+
+  // Extra space for parse and teardown stacks.
+  Regexp* down_;
+
+  // Arguments to operator.  See description of operators above.
+  union {
+    struct {  // Repeat
+      int max_;
+      int min_;
+    };
+    struct {  // Capture
+      int cap_;
+      string* name_;
+    };
+    struct {  // LiteralString
+      int nrunes_;
+      Rune* runes_;
+    };
+    struct {  // CharClass
+      // These two could be in separate union members,
+      // but it wouldn't save any space (there are other two-word structs)
+      // and keeping them separate avoids confusion during parsing.
+      CharClass* cc_;
+      CharClassBuilder* ccb_;
+    };
+    Rune rune_;  // Literal
+    int match_id_;  // HaveMatch
+    void *the_union_[2];  // as big as any other element, for memset
+  };
+
+  DISALLOW_EVIL_CONSTRUCTORS(Regexp);
+};
+
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+// Builder for a character class.  The ranges are kept in a RuneRangeSet
+// (ranges_) and a separate rune counter (nrunes_) backs size(), empty()
+// and full().  Only the inline accessors are defined here; the remaining
+// members are implemented out of line.
+class CharClassBuilder {
+ public:
+  CharClassBuilder();
+
+  // Iteration over the stored RuneRanges.
+  typedef RuneRangeSet::iterator iterator;
+  iterator begin() { return ranges_.begin(); }
+  iterator end() { return ranges_.end(); }
+
+  // size() reports nrunes_, which is not the number of entries in ranges_.
+  int size() { return nrunes_; }
+  // empty() is true when nrunes_ is zero.
+  bool empty() { return nrunes_ == 0; }
+  // full() is true when nrunes_ equals Runemax+1.
+  bool full() { return nrunes_ == Runemax+1; }
+
+  bool Contains(Rune r);
+  bool FoldsASCII();
+  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
+  // NOTE(review): presumably returns a newly allocated builder owned by the
+  // caller -- confirm against the implementation.
+  CharClassBuilder* Copy();
+  void AddCharClass(CharClassBuilder* cc);
+  void Negate();
+  void RemoveAbove(Rune r);
+  CharClass* GetCharClass();
+  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
+  // Mask with the low 26 bits set, one bit per letter of the alphabet.
+  static const uint32 AlphaMask = (1<<26) - 1;
+  uint32 upper_;  // bitmap of A-Z
+  uint32 lower_;  // bitmap of a-z
+  int nrunes_;    // value reported by size(); see empty() and full()
+  RuneRangeSet ranges_;  // the ranges exposed through begin()/end()
+  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
+};
+
+// Bitwise operators on ParseFlags are spelled out so that g++ gives the
+// result the type ParseFlags instead of int.
+// Union of two flag sets.
+inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  const int bits = static_cast<int>(a) | static_cast<int>(b);
+  return static_cast<Regexp::ParseFlags>(bits);
+}
+
+// Flags that are set in exactly one of a and b.
+inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
+  return static_cast<Regexp::ParseFlags>(bits);
+}
+
+// Flags that are set in both a and b.
+inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  const int bits = static_cast<int>(a) & static_cast<int>(b);
+  return static_cast<Regexp::ParseFlags>(bits);
+}
+
+// Bitwise complement of a, computed on its int representation.
+inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
+{
+  const int inverted = ~static_cast<int>(a);
+  return static_cast<Regexp::ParseFlags>(inverted);
+}
+
+
+
+}  // namespace re2
+
+#endif  // RE2_REGEXP_H__
--- a/re2/re2/set.cc
+++ b/re2/re2/set.cc
@ -0,0 +1,113 @@
+// Copyright 2010 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/set.h"
+
+#include "util/util.h"
+#include "re2/stringpiece.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+using namespace re2;
+
+// Creates an empty, not-yet-compiled set that keeps its own copy of
+// options and remembers the requested anchor.
+RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
+    : anchor_(anchor), prog_(NULL), compiled_(false) {
+  options_.Copy(options);
+}
+
+// Releases the reference held on every regexp still stored in re_ and
+// deletes the compiled program (a no-op when prog_ is NULL).
+RE2::Set::~Set() {
+  for (size_t idx = 0, count = re_.size(); idx < count; ++idx)
+    re_[idx]->Decref();
+  delete prog_;
+}
+
+// Parses pattern with the set's options and stores it in re_ with a
+// HaveMatch node carrying the pattern's index concatenated on the end.
+// Returns that index (the number of patterns stored before this call).
+// Returns -1 if the set has already been compiled or if the pattern does
+// not parse; in the latter case *error (when non-NULL) receives the
+// status text and, if options_.log_errors() is set, the error is logged.
+int RE2::Set::Add(const StringPiece& pattern, string* error) {
+  if (compiled_) {
+    LOG(DFATAL) << "RE2::Set::Add after Compile";
+    return -1;
+  }
+
+  const Regexp::ParseFlags pf =
+      static_cast<Regexp::ParseFlags>(options_.ParseFlags());
+
+  RegexpStatus status;
+  re2::Regexp* parsed = Regexp::Parse(pattern, pf, &status);
+  if (parsed == NULL) {
+    if (error != NULL)
+      *error = status.Text();
+    if (options_.log_errors())
+      LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
+    return -1;
+  }
+
+  // The index of the new pattern is the number of patterns stored so far.
+  const int index = re_.size();
+  re2::Regexp* marker = re2::Regexp::HaveMatch(index, pf);
+
+  re2::Regexp* combined;
+  if (parsed->op() != kRegexpConcat) {
+    // Simple case: the two-element concatenation parsed . marker.
+    re2::Regexp* pair[2];
+    pair[0] = parsed;
+    pair[1] = marker;
+    combined = re2::Regexp::Concat(pair, 2, pf);
+  } else {
+    // parsed is itself a concatenation: build a flat concatenation of its
+    // pieces followed by the marker.  Each piece gets its own reference
+    // before the reference on parsed is dropped.
+    const int nsub = parsed->nsub();
+    re2::Regexp** pieces = new re2::Regexp*[nsub + 1];
+    re2::Regexp** old_pieces = parsed->sub();
+    for (int k = 0; k < nsub; k++)
+      pieces[k] = old_pieces[k]->Incref();
+    pieces[nsub] = marker;
+    parsed->Decref();
+    combined = re2::Regexp::Concat(pieces, nsub + 1, pf);
+    delete[] pieces;
+  }
+  re_.push_back(combined);
+  return index;
+}
+
+// Combines all patterns added so far into one alternation, simplifies it
+// and compiles the result into prog_.  May be called only once: a second
+// call logs DFATAL and returns false.  compiled_ is set before any work is
+// done, so it stays true even when this function returns false; in that
+// case prog_ is left NULL.
+// Returns true when a program was produced.
+bool RE2::Set::Compile() {
+  if (compiled_) {
+    LOG(DFATAL) << "RE2::Set::Compile multiple times";
+    return false;
+  }
+  compiled_ = true;
+
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+    options_.ParseFlags());
+  // Indexing an empty vector is undefined, so the address of the first
+  // element is taken only when at least one pattern has been added.
+  re2::Regexp** subs = re_.empty() ? NULL : &re_[0];
+  re2::Regexp* re = re2::Regexp::Alternate(subs,
+                                           static_cast<int>(re_.size()), pf);
+  // The references stored in re_ were handed to Alternate (the Regexp
+  // convenience functions consume the references passed to them), so the
+  // vector is emptied without calling Decref on its elements.
+  re_.clear();
+  re2::Regexp* sre = re->Simplify();
+  re->Decref();
+  re = sre;
+  if (re == NULL) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Error simplifying during Compile.";
+    return false;
+  }
+
+  // NOTE(review): re is not released here; presumably Prog::CompileSet
+  // takes over the reference -- confirm against its implementation.
+  prog_ = Prog::CompileSet(options_, anchor_, re);
+  return prog_ != NULL;
+}
+
+// Runs the compiled program over text and stores the indices of the
+// matching patterns in *v (which is cleared first).
+// Returns true only when the search reports a match and at least one
+// index was recorded.  Returns false, after logging DFATAL, when the set
+// was never compiled, when Compile() failed to produce a program, or when
+// a match is reported without any index.
+bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "RE2::Set::Match without Compile";
+    return false;
+  }
+  // Compile() sets compiled_ before it can fail, so after a failed Compile
+  // compiled_ is true while prog_ is still NULL.  Refuse to search rather
+  // than dereference a NULL program.
+  if (prog_ == NULL) {
+    LOG(DFATAL) << "RE2::Set::Match after failed Compile";
+    return false;
+  }
+  v->clear();
+  // Initialized so the check below never reads an indeterminate value.
+  bool failed = false;
+  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
+                              Prog::kManyMatch, NULL, &failed, v);
+  if (failed)
+    LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
+
+  if (ret == false)
+    return false;
+  if (v->size() == 0) {
+    LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
+    return false;
+  }
+  return true;
+}
--- a/re2/re2/set.h
+++ b/re2/re2/set.h
@ -0,0 +1,55 @@
+// Copyright 2010 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_SET_H
+#define RE2_SET_H
+
+#include <utility>
+#include <vector>
+
+#include "re2/re2.h"
+
+namespace re2 {
+using std::vector;
+
+// An RE2::Set represents a collection of regexps that can
+// be searched for simultaneously.
+// Usage: call Add once per pattern, then Compile once, then Match as
+// often as needed.
+class RE2::Set {
+ public:
+  // Creates an empty set.  options is copied into the set; anchor is
+  // stored and passed on to the compiler by Compile.
+  Set(const RE2::Options& options, RE2::Anchor anchor);
+  ~Set();
+
+  // Add adds regexp pattern to the set, interpreted using the RE2 options.
+  // (The RE2 constructor's default options parameter is RE2::UTF8.)
+  // Add returns the regexp index that will be used to identify
+  // it in the result of Match, or -1 if the regexp cannot be parsed
+  // or if Compile has already been called.
+  // Indices are assigned in sequential order starting from 0.
+  // Error returns do not increment the index.
+  // If a parse error occurs and error != NULL, *error will hold an
+  // error message.
+  int Add(const StringPiece& pattern, string* error);
+
+  // Compile prepares the Set for matching.
+  // Add must not be called again after Compile.
+  // Compile must be called before Match, and may be called only once.
+  // Compile may return false if it runs out of memory.
+  bool Compile();
+
+  // Match returns true if text matches any of the regexps in the set.
+  // If so, it fills v with the indices of the matching regexps.
+  // v must not be NULL; it is cleared before the search.
+  bool Match(const StringPiece& text, vector<int>* v) const;
+
+ private:
+  RE2::Options options_;     // copy of the options given to the constructor
+  RE2::Anchor anchor_;       // anchor given to the constructor
+  vector<re2::Regexp*> re_;  // regexps added so far; emptied by Compile
+  re2::Prog* prog_;          // program built by Compile; NULL until then
+  bool compiled_;            // set once Compile has been called
+  // Copying is disallowed: the copy constructor and assignment operator
+  // are declared private.
+  //DISALLOW_EVIL_CONSTRUCTORS(Set);
+  Set(const Set&);
+  void operator=(const Set&);
+};
+
+}  // namespace re2
+
+#endif  // RE2_SET_H
--- a/re2/re2/simplify.cc
+++ b/re2/re2/simplify.cc
@ -0,0 +1,393 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Rewrite POSIX and other features in re
+// to use simple extended regular expression features.
+// Also sort and simplify character classes.
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Parses src, simplifies the parsed regexp and stores the string form of
+// the simplified regexp in *dst.  Returns true on success.
+// Returns false on a parse error (Parse reports it through status) and
+// also if simplification unexpectedly yields NULL; in the latter case
+// *status (if status != NULL) is set to kRegexpInternalError with src as
+// the error argument.
+bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+                            string* dst,
+                            RegexpStatus* status) {
+  Regexp* parsed = Parse(src, flags, status);
+  if (parsed == NULL)
+    return false;
+
+  // The parsed form is only needed to produce the simplified one.
+  Regexp* simplified = parsed->Simplify();
+  parsed->Decref();
+
+  if (simplified != NULL) {
+    *dst = simplified->ToString();
+    simplified->Decref();
+    return true;
+  }
+
+  // Should not happen, since Simplify never fails.
+  LOG(ERROR) << "Simplify failed on " << src;
+  if (status) {
+    status->set_code(kRegexpInternalError);
+    status->set_error_arg(src);
+  }
+  return false;
+}
+
+// Decides whether this node counts as simple, trusting that the simple_
+// flag of every direct sub-expression is already accurate.
+bool Regexp::ComputeSimple() {
+  switch (op_) {
+    // Operators without sub-expressions (other than character classes)
+    // are always simple.
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpLiteralString:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpEndText:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpHaveMatch:
+      return true;
+
+    case kRegexpConcat:
+    case kRegexpAlternate: {
+      // Simple exactly when every piece is simple: walk forward until the
+      // first piece that is not.
+      Regexp** pieces = sub();
+      int k = 0;
+      while (k < nsub_ && pieces[k]->simple_)
+        k++;
+      return k == nsub_;
+    }
+
+    case kRegexpCharClass:
+      // An empty or a full class is not simple.  The builder, when
+      // present, is consulted instead of the finished class.
+      if (ccb_ == NULL)
+        return !(cc_->empty() || cc_->full());
+      return !(ccb_->empty() || ccb_->full());
+
+    case kRegexpCapture:
+      return sub()[0]->simple_;
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest: {
+      Regexp* inner = sub()[0];
+      if (!inner->simple_)
+        return false;
+      // A repetition applied directly to another repetition, or to an
+      // empty-match or no-match node, is not simple.
+      const int inner_op = inner->op_;
+      const bool collapsible = inner_op == kRegexpStar ||
+                               inner_op == kRegexpPlus ||
+                               inner_op == kRegexpQuest ||
+                               inner_op == kRegexpEmptyMatch ||
+                               inner_op == kRegexpNoMatch;
+      return !collapsible;
+    }
+
+    case kRegexpRepeat:
+      // Counted repetition is never simple.
+      return false;
+  }
+  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
+  return false;
+}
+
+// Walker subclass used by Simplify.
+// The simplify walk is purely post-recursive: given the simplified children,
+// PostVisit creates the simplified result.
+// The child_args are simplified Regexp*s.
+// Every Regexp* returned by the visit functions carries a reference that
+// the receiver must eventually release with Decref().
+class SimplifyWalker : public Regexp::Walker<Regexp*> {
+ public:
+  SimplifyWalker() {}
+  // Stops the descent at nodes already marked simple: sets *stop and
+  // returns a new reference to re.  Otherwise returns NULL.
+  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
+  // Produces the simplified form of re from its simplified children in
+  // child_args.
+  virtual Regexp* PostVisit(Regexp* re,
+                            Regexp* parent_arg,
+                            Regexp* pre_arg,
+                            Regexp** child_args, int nchild_args);
+  // Copies a result by taking another reference to it.
+  virtual Regexp* Copy(Regexp* re);
+  // Not expected to be called; logs DFATAL and returns a new reference
+  // to re.
+  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+  // These functions are declared inside SimplifyWalker so that
+  // they can edit the private fields of the Regexps they construct.
+
+  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
+  // Caller must Decref return value when done with it.
+  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
+
+  // Simplifies the expression re{min,max} in terms of *, +, and ?.
+  // max == -1 means there is no upper bound.
+  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
+  // Caller must Decref return value when done with it.
+  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
+                                Regexp::ParseFlags parse_flags);
+
+  // Simplifies a character class: an empty class becomes a no-match node
+  // and a full class becomes an any-character node; any other class is
+  // returned as a new reference to re itself.
+  // Does not edit re.  Does not consume ref to re.
+  // Caller must Decref return value when done with it.
+  static Regexp* SimplifyCharClass(Regexp* re);
+
+  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
+};
+
+// Returns a simplified form of this regexp without editing it.
+// The result is restricted to traditional Unix egrep features plus the
+// Perl (?:) non-capturing parentheses; no other POSIX or Perl additions
+// remain.  It captures exactly the same subexpressions, with the same
+// indices, as the original.
+// A regexp already marked simple is returned as a new reference to
+// itself; otherwise a SimplifyWalker produces the result.
+// Caller must Decref() return value when done with it.
+
+Regexp* Regexp::Simplify() {
+  if (!simple_) {
+    SimplifyWalker walker;
+    return walker.Walk(this, NULL);
+  }
+  return Incref();
+}
+
+#define Simplify DontCallSimplify  // Avoid accidental recursion
+
+// A copy of a walk result is simply another reference to the same node.
+Regexp* SimplifyWalker::Copy(Regexp* re) {
+  Regexp* shared = re->Incref();
+  return shared;
+}
+
+// Never expected to run: the simplifier uses Walk, not WalkExponential.
+// If it is reached anyway, the problem is logged and re is handed back
+// unchanged with a new reference.
+Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
+  Regexp* unchanged = re->Incref();
+  return unchanged;
+}
+
+// Cuts the walk short at nodes that are already simple: *stop is set and
+// a new reference to re is returned as the result for that subtree.
+// For any other node NULL is returned and *stop is left untouched.
+Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
+  if (!re->simple_)
+    return NULL;
+  *stop = true;
+  return re->Incref();
+}
+
+// Builds the simplified form of re once its children have been
+// simplified.  child_args[i] is the simplified i-th sub-expression and
+// carries a reference that this function either stores in the node it
+// returns or releases.  When nothing changed, re itself is marked simple
+// and returned with a new reference; otherwise a new node is returned.
+// The caller owns one reference to the result.
+Regexp* SimplifyWalker::PostVisit(Regexp* re,
+                                  Regexp* parent_arg,
+                                  Regexp* pre_arg,
+                                  Regexp** child_args,
+                                  int nchild_args) {
+  const RegexpOp op = re->op();
+  switch (op) {
+    // Operators that are simple by definition.
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpLiteralString:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpEndText:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpHaveMatch:
+      re->simple_ = true;
+      return re->Incref();
+
+    case kRegexpConcat:
+    case kRegexpAlternate: {
+      // Simple as long as the pieces are simple.  A first pass looks for
+      // a piece that was replaced, so the common unchanged case needs no
+      // allocation.
+      const int count = re->nsub_;
+      Regexp** old_subs = re->sub();
+      int first_diff = 0;
+      while (first_diff < count &&
+             child_args[first_diff] == old_subs[first_diff])
+        first_diff++;
+
+      if (first_diff == count) {
+        // Every piece is unchanged: release the child references and
+        // reuse re.
+        for (int k = 0; k < count; k++)
+          child_args[k]->Decref();
+        re->simple_ = true;
+        return re->Incref();
+      }
+
+      // At least one piece changed: the new node takes over the child
+      // references.
+      Regexp* fresh = new Regexp(op, re->parse_flags());
+      fresh->AllocSub(count);
+      Regexp** fresh_subs = fresh->sub();
+      for (int k = 0; k < count; k++)
+        fresh_subs[k] = child_args[k];
+      fresh->simple_ = true;
+      return fresh;
+    }
+
+    case kRegexpCapture: {
+      Regexp* child = child_args[0];
+      if (child != re->sub()[0]) {
+        // New capture node with the same index around the new child.
+        Regexp* fresh = new Regexp(kRegexpCapture, re->parse_flags());
+        fresh->AllocSub(1);
+        fresh->sub()[0] = child;
+        fresh->cap_ = re->cap_;
+        fresh->simple_ = true;
+        return fresh;
+      }
+      child->Decref();
+      re->simple_ = true;
+      return re->Incref();
+    }
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest: {
+      Regexp* child = child_args[0];
+      // Repeating the empty string any number of times still gives the
+      // empty string.
+      if (child->op() == kRegexpEmptyMatch)
+        return child;
+
+      // Unchanged child: re is simple as it stands.
+      if (child == re->sub()[0]) {
+        child->Decref();
+        re->simple_ = true;
+        return re->Incref();
+      }
+
+      // The same operator applied twice with identical flags is
+      // idempotent, so the child already is the answer.
+      const bool same_operator = op == child->op() &&
+                                 re->parse_flags() == child->parse_flags();
+      if (same_operator)
+        return child;
+
+      Regexp* fresh = new Regexp(op, re->parse_flags());
+      fresh->AllocSub(1);
+      fresh->sub()[0] = child;
+      fresh->simple_ = true;
+      return fresh;
+    }
+
+    case kRegexpRepeat: {
+      Regexp* child = child_args[0];
+      // Repeating the empty string any number of times still gives the
+      // empty string.
+      if (child->op() == kRegexpEmptyMatch)
+        return child;
+
+      // SimplifyRepeat takes its own references, so the one held on the
+      // child is released afterwards.
+      Regexp* expanded = SimplifyRepeat(child, re->min_, re->max_,
+                                        re->parse_flags());
+      child->Decref();
+      expanded->simple_ = true;
+      return expanded;
+    }
+
+    case kRegexpCharClass: {
+      Regexp* reduced = SimplifyCharClass(re);
+      reduced->simple_ = true;
+      return reduced;
+    }
+  }
+
+  LOG(ERROR) << "Simplify case not handled: " << re->op();
+  return re->Incref();
+}
+
+// Builds the two-element concatenation re1 re2.  The references to re1
+// and re2 are consumed: they are stored in the new node.
+// The new node is returned with its reference owned by the caller.
+Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
+                                Regexp::ParseFlags parse_flags) {
+  Regexp* both = new Regexp(kRegexpConcat, parse_flags);
+  both->AllocSub(2);
+  Regexp** slots = both->sub();
+  slots[0] = re1;
+  slots[1] = re2;
+  return both;
+}
+
+// Rewrites the counted repetition re{min,max} using only *, + and ?.
+// max == -1 stands for "no upper bound".
+// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
+// Caller must Decref return value when done with it.
+// The result will *not* necessarily have the right capturing parens
+// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
+// but in the Regexp* representation, both (x) are marked as $1.
+Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
+                                       Regexp::ParseFlags f) {
+  // x{n,} means at least n matches of x.
+  const bool unbounded = (max == -1);
+
+  // Special case: x{0,} is x*
+  if (unbounded && min == 0)
+    return Regexp::Star(re->Incref(), f);
+
+  // Special case: x{1,} is x+
+  if (unbounded && min == 1)
+    return Regexp::Plus(re->Incref(), f);
+
+  if (unbounded) {
+    // General case: x{4,} is xxxx+, i.e. min-1 plain copies followed by
+    // one copy under +.
+    Regexp* seq = new Regexp(kRegexpConcat, f);
+    seq->AllocSub(min);
+    VLOG(1) << "Simplify " << min;
+    Regexp** seq_subs = seq->sub();
+    const int last = min - 1;
+    for (int k = 0; k < last; k++)
+      seq_subs[k] = re->Incref();
+    seq_subs[last] = Regexp::Plus(re->Incref(), f);
+    return seq;
+  }
+
+  // Special case: (x){0} matches only empty string.
+  if (min == 0 && max == 0)
+    return new Regexp(kRegexpEmptyMatch, f);
+
+  // Special case: x{1} is just x.
+  if (min == 1 && max == 1)
+    return re->Incref();
+
+  // General case: x{n,m} means n copies of x followed by m-n copies of x?.
+  // The machine will do less work if we nest the optional copies,
+  // so that x{2,5} = xx(x(x(x)?)?)?
+
+  // Mandatory part: min copies of x in one concatenation.
+  Regexp* prefix = NULL;
+  if (min > 0) {
+    prefix = new Regexp(kRegexpConcat, f);
+    prefix->AllocSub(min);
+    Regexp** prefix_subs = prefix->sub();
+    for (int k = 0; k < min; k++)
+      prefix_subs[k] = re->Incref();
+  }
+
+  // Optional part: (x(x(x)?)?)?, built from the innermost x? outwards and
+  // then attached to the mandatory part, if there is one.
+  Regexp* result = prefix;
+  if (max > min) {
+    Regexp* optional = Regexp::Quest(re->Incref(), f);
+    for (int k = min + 1; k < max; k++)
+      optional = Regexp::Quest(Concat2(re->Incref(), optional, f), f);
+    result = (prefix == NULL) ? optional : Concat2(prefix, optional, f);
+  }
+
+  if (result != NULL)
+    return result;
+
+  // Neither part was built: some degenerate case such as min < max < 0.
+  // This shouldn't happen, because the parser rejects such regexps.
+  LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
+  return new Regexp(kRegexpNoMatch, f);
+}
+
+// Reduces a character-class node to its simplest form: an empty class
+// becomes a no-match node, a full class becomes an any-character node,
+// and any other class is returned as a new reference to re itself.
+// Caller must Decref return value when done with it.
+Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
+  CharClass* klass = re->cc();
+
+  if (klass->empty())
+    return new Regexp(kRegexpNoMatch, re->parse_flags());
+
+  return klass->full() ? new Regexp(kRegexpAnyChar, re->parse_flags())
+                       : re->Incref();
+}
+
+}  // namespace re2
--- a/re2/re2/stringpiece.h
+++ b/re2/re2/stringpiece.h
@ -0,0 +1,182 @@
+// Copyright 2001-2010 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece.  The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+//
+// Arghh!  I wish C++ literals were "string".
+
+#ifndef STRINGS_STRINGPIECE_H__
+#define STRINGS_STRINGPIECE_H__
+
+#include <string.h>
+#include <algorithm>
+#include <cstddef>
+#include <iosfwd>
+#include <iterator>
+#include <string>
+
+namespace re2 {
+
+// A non-owning view of length_ bytes starting at ptr_.  The class never
+// allocates or frees the bytes it points to, so the referenced memory must
+// outlive the StringPiece.  A default-constructed or cleared StringPiece
+// has ptr_ == NULL and length_ == 0.
+class StringPiece {
+ private:
+  const char*   ptr_;     // first referenced byte; NULL when default-constructed
+  int           length_;  // number of referenced bytes
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+  StringPiece() : ptr_(NULL), length_(0) { }
+  // A NULL str yields an empty piece; otherwise the length is strlen(str).
+  StringPiece(const char* str)
+    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
+  StringPiece(const std::string& str)
+    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
+  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  const char* data() const { return ptr_; }
+  int size() const { return length_; }
+  int length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  // Resets the piece to the default-constructed state.
+  void clear() { ptr_ = NULL; length_ = 0; }
+  // Re-points the piece at len bytes starting at data.
+  void set(const char* data, int len) { ptr_ = data; length_ = len; }
+  // Re-points the piece at the NUL-terminated string str (empty if NULL).
+  void set(const char* str) {
+    ptr_ = str;
+    if (str != NULL)
+      length_ = static_cast<int>(strlen(str));
+    else
+      length_ = 0;
+  }
+  void set(const void* data, int len) {
+    ptr_ = reinterpret_cast<const char*>(data);
+    length_ = len;
+  }
+
+  // No bounds checking is performed.
+  char operator[](int i) const { return ptr_[i]; }
+
+  // Drops the first n bytes.  No check is made that n <= length_.
+  void remove_prefix(int n) {
+    ptr_ += n;
+    length_ -= n;
+  }
+
+  // Drops the last n bytes.  No check is made that n <= length_.
+  void remove_suffix(int n) {
+    length_ -= n;
+  }
+
+  // Three-way comparison: the common prefix is compared bytewise and, if
+  // it is equal, the shorter piece orders first.  Returns a negative
+  // value, zero or a positive value.
+  int compare(const StringPiece& x) const {
+    const int common = std::min(length_, x.length_);
+    // memcmp must not be given a NULL pointer even for a zero count, and
+    // an empty piece may have ptr_ == NULL, so the call is skipped when
+    // there are no bytes to compare.
+    int r = (common == 0) ? 0 : memcmp(ptr_, x.ptr_, common);
+    if (r == 0) {
+      if (length_ < x.length_) r = -1;
+      else if (length_ > x.length_) r = +1;
+    }
+    return r;
+  }
+
+  std::string as_string() const {
+    return std::string(data(), size());
+  }
+  // We also define ToString() here, since many other string-like
+  // interfaces name the routine that converts to a C++ string
+  // "ToString", and it's confusing to have the method that does that
+  // for a StringPiece be called "as_string()".  We also leave the
+  // "as_string()" method defined here for existing code.
+  std::string ToString() const {
+    return std::string(data(), size());
+  }
+
+  void CopyToString(std::string* target) const;
+  void AppendToString(std::string* target) const;
+
+  // Does "this" start with "x"
+  bool starts_with(const StringPiece& x) const {
+    if (length_ < x.length_)
+      return false;
+    // An empty x is a prefix of everything; checking it first keeps a
+    // possibly NULL pointer away from memcmp.
+    return (x.length_ == 0) ||
+           (memcmp(ptr_, x.ptr_, x.length_) == 0);
+  }
+
+  // Does "this" end with "x"
+  bool ends_with(const StringPiece& x) const {
+    if (length_ < x.length_)
+      return false;
+    // An empty x is a suffix of everything; checking it first keeps a
+    // possibly NULL pointer away from memcmp.
+    return (x.length_ == 0) ||
+           (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0);
+  }
+
+  // standard STL container boilerplate
+  typedef char value_type;
+  typedef const char* pointer;
+  typedef const char& reference;
+  typedef const char& const_reference;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  static const size_type npos;
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  iterator begin() const { return ptr_; }
+  iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+  // STLS says return size_type, but Google says return int
+  int max_size() const { return length_; }
+  int capacity() const { return length_; }
+
+  // The members below are only declared here; they are defined out of line.
+  int copy(char* buf, size_type n, size_type pos = 0) const;
+
+  int find(const StringPiece& s, size_type pos = 0) const;
+  int find(char c, size_type pos = 0) const;
+  int rfind(const StringPiece& s, size_type pos = npos) const;
+  int rfind(char c, size_type pos = npos) const;
+
+  StringPiece substr(size_type pos, size_type n = npos) const;
+
+  // Equality helper used by operator==.
+  static bool _equal(const StringPiece&, const StringPiece&);
+};
+
+// Equality is delegated to StringPiece::_equal.
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  const bool same = StringPiece::_equal(x, y);
+  return same;
+}
+
+// Inequality is the negation of operator==.
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  if (x == y)
+    return false;
+  return true;
+}
+
+// Lexicographic byte order: x sorts before y when their common prefix
+// compares lower, or when the prefix is equal and x is the shorter piece.
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+  const int common = std::min(x.size(), y.size());
+  // memcmp must not be given a NULL pointer even for a zero count, and an
+  // empty StringPiece may have NULL data(), so the call is skipped when
+  // there are no bytes to compare.
+  const int r = (common == 0) ? 0 : memcmp(x.data(), y.data(), common);
+  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+// x > y holds exactly when y < x.
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+  const bool y_sorts_first = (y < x);
+  return y_sorts_first;
+}
+
+// x <= y holds exactly when x > y does not.
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+  if (x > y)
+    return false;
+  return true;
+}
+
+// x >= y holds exactly when x < y does not.
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+  if (x < y)
+    return false;
+  return true;
+}
+
+}  // namespace re2
+
+// allow StringPiece to be logged
+extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
+
+#endif  // STRINGS_STRINGPIECE_H__
--- a/re2/re2/testing/backtrack.cc
+++ b/re2/re2/testing/backtrack.cc
@ -0,0 +1,254 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+//
+// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
+// except that it remembers where it has been, trading a lot of
+// memory for a lot of time. It exists only for testing purposes.
+//
+// Let me repeat that.
+//
+// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
+//   - It uses a ton of memory.
+//   - It uses a ton of stack.
+//   - It uses CHECK and LOG(FATAL).
+//   - It implements unanchored search by repeated anchored search.
+//
+// On the other hand, it is very simple and a good reference
+// implementation for the more complicated regexp packages.
+//
+// In BUILD, this file is linked into the ":testing" library,
+// not the main library, in order to make it harder to pick up
+// accidentally.
+
+#include "util/util.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Backtracker holds the state for a backtracking search.
+//
+// Excluding the search parameters, the main search state
+// is just the "capture registers", which record, for the
+// current execution, the string position at which each
+// parenthesis was passed.  cap_[0] and cap_[1] are the
+// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
+//
+// To avoid infinite loops during backtracking on expressions
+// like (a*)*, the visited_[] bitmap marks the (state, string-position)
+// pairs that have already been explored and are thus not worth
+// re-exploring if we get there via another path.  Modern backtracking
+// libraries engineer their program representation differently, to make
+// such infinite loops possible to avoid without keeping a giant visited_
+// bitmap, but visited_ works fine for a reference implementation
+// and it has the nice benefit of making the search run in linear time.
+class Backtracker {
+ public:
+  // Creates a backtracker that will run prog.  The visited_ bitmap is
+  // not allocated here; Search allocates a fresh one on every call.
+  explicit Backtracker(Prog* prog);
+
+  // Releases the visited_ bitmap left behind by the last Search.
+  ~Backtracker();
+
+  // Searches text (which lies within context) for a match of the program.
+  //   anchored  - the match must start at text.begin().
+  //   longest   - keep exploring for a longer match instead of stopping
+  //               at the first one found.
+  //   submatch  - array of nsubmatch pieces that receives the capture
+  //               groups of the match; may be unusable when nsubmatch < 1.
+  // Returns true if a match was found.
+  bool Search(const StringPiece& text, const StringPiece& context,
+              bool anchored, bool longest,
+              StringPiece* submatch, int nsubmatch);
+
+ private:
+  // Not copyable: visited_ is an owned array released with delete[] in
+  // the destructor, so a member-wise copy would free it twice.
+  // Declared but deliberately never defined.
+  Backtracker(const Backtracker&);
+  void operator=(const Backtracker&);
+
+  // Explores from instruction id at string position p looking for a match.
+  // Returns true if found (so that caller can stop trying other possibilities).
+  bool Visit(int id, const char* p);
+
+  // Search parameters
+  Prog* prog_;              // program being run
+  StringPiece text_;        // text being searched
+  StringPiece context_;     // greater context of text being searched
+  bool anchored_;           // whether search is anchored at text.begin()
+  bool longest_;            // whether search wants leftmost-longest match
+  bool endmatch_;           // whether search must end at text.end()
+  StringPiece *submatch_;   // submatches to fill in
+  int nsubmatch_;           //   # of submatches to fill in
+
+  // Search state
+  const char* cap_[64];     // capture registers
+  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
+  int nvisited_;            //   # of words in bitmap
+};
+
+// Sets up an idle backtracker for prog: every search flag is off, there
+// is no submatch array and no visited_ bitmap yet.  text_ and context_
+// are default-constructed.  cap_ is not initialized here; Search clears
+// it before it is used.
+Backtracker::Backtracker(Prog* prog)
+  : prog_(prog),
+    anchored_(false),
+    longest_(false),
+    endmatch_(false),
+    submatch_(NULL),
+    nsubmatch_(0),
+    visited_(NULL),
+    nvisited_(0) {
+}
+
+// Frees the visited_ bitmap allocated by the most recent Search.
+// visited_ is NULL if Search was never called; delete[] on a null
+// pointer does nothing.
+Backtracker::~Backtracker() {
+  delete[] visited_;
+}
+
+// Runs a backtracking search of text (located inside context).
+// Returns true on a match, in which case the submatch array has been
+// filled in by Visit.  An anchored search is tried only at text.begin();
+// an unanchored search is tried at every position of text in turn.
+bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
+                         bool anchored, bool longest,
+                         StringPiece* submatch, int nsubmatch) {
+  // Remember the search window.  A context without data means
+  // "the context is the text itself".
+  text_ = text;
+  context_ = (context.begin() == NULL) ? text : context;
+
+  // A program anchored at either end cannot match unless text reaches
+  // the corresponding end of context.
+  if (prog_->anchor_start() && context_.begin() < text.begin())
+    return false;
+  if (prog_->anchor_end() && context_.end() > text.end())
+    return false;
+
+  // The program's own anchors strengthen whatever the caller asked for.
+  anchored_ = anchored | prog_->anchor_start();
+  longest_ = longest | prog_->anchor_end();
+  endmatch_ = prog_->anchor_end();
+  submatch_ = submatch;
+  nsubmatch_ = nsubmatch;
+  CHECK(2*nsubmatch_ < arraysize(cap_));
+  memset(cap_, 0, sizeof cap_);
+
+  // Visit keeps its best-match bookkeeping in submatch_[0], so make
+  // sure there is one even when the caller supplied none.
+  StringPiece scratch;
+  if (nsubmatch < 1) {
+    submatch_ = &scratch;
+    nsubmatch_ = 1;
+  }
+  submatch_[0] = NULL;
+
+  // The bitmap holds one bit per (instruction, text position) pair, so
+  // its size depends on text and it must be rebuilt for every search.
+  delete[] visited_;
+  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
+  visited_ = new uint32[nvisited_];
+  memset(visited_, 0, nvisited_*sizeof visited_[0]);
+
+  // Anchored: the only candidate start is text.begin().
+  if (anchored_) {
+    cap_[0] = text.begin();
+    return Visit(prog_->start(), text.begin());
+  }
+
+  // Unanchored: try each start position from left to right.  The bound
+  // is inclusive because the empty string at text.end() is a candidate.
+  const char* p = text.begin();
+  while (p <= text.end()) {
+    cap_[0] = p;
+    if (Visit(prog_->start(), p))  // Leftmost match found; stop.
+      return true;
+    p++;
+  }
+  return false;
+}
+
+// Explores from instruction id at string position p looking for a match.
+// Return true if found (so that caller can stop trying other possibilities).
+// Each (id, p) pair is explored at most once per Search; a repeated visit
+// returns false immediately.  On reaching a match instruction the capture
+// registers are copied into submatch_ if this match is the first one or,
+// in longest mode, ends later than the one already recorded.
+bool Backtracker::Visit(int id, const char* p) {
+  // Check bitmap.  If we've already explored from here,
+  // either it didn't match or it did but we're hoping for a better match.
+  // Either way, don't go down that road again.
+  CHECK(p <= text_.end());
+  int n = id*(text_.size()+1) + (p - text_.begin());
+  CHECK_LT(n/32, nvisited_);
+  // Build the mask from an unsigned 1: selecting bit 31 with a signed
+  // "1 << 31" would shift into the sign bit of int.
+  const uint32 bit = 1U << (n&31);
+  if (visited_[n/32] & bit)
+    return false;
+  visited_[n/32] |= bit;
+
+  // Pick out byte at current position.  If at end of string,
+  // have to explore in hope of finishing a match.  Use impossible byte -1.
+  int c = -1;
+  if (p < text_.end())
+    c = *p & 0xFF;
+
+  Prog::Inst* ip = prog_->inst(id);
+  switch (ip->opcode()) {
+    default:
+      LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
+      return false;  // not reached
+
+    case kInstAlt:
+    case kInstAltMatch:
+      // Try both possible next states: out is preferred to out1.
+      // In longest mode the second branch is explored even after the
+      // first one matched, because it might yield a longer match.
+      if (Visit(ip->out(), p)) {
+        if (longest_)
+          Visit(ip->out1(), p);
+        return true;
+      }
+      return Visit(ip->out1(), p);
+
+    case kInstByteRange:
+      // Consume one byte if it is accepted by this instruction.
+      if (ip->Matches(c))
+        return Visit(ip->out(), p+1);
+      return false;
+
+    case kInstCapture:
+      if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
+        // Capture p to register, but save old value.
+        const char* q = cap_[ip->cap()];
+        cap_[ip->cap()] = p;
+        bool ret = Visit(ip->out(), p);
+        // Restore old value as we backtrack.
+        cap_[ip->cap()] = q;
+        return ret;
+      }
+      // Register index outside cap_: continue without recording it.
+      return Visit(ip->out(), p);
+
+    case kInstEmptyWidth:
+      // Fail if any empty-width flag required by the instruction is
+      // not set at position p within context_.
+      if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+        return false;
+      return Visit(ip->out(), p);
+
+    case kInstNop:
+      return Visit(ip->out(), p);
+
+    case kInstMatch:
+      // We found a match.  If it's the best so far, record the
+      // parameters in the caller's submatch_ array.
+      if (endmatch_ && p != context_.end())
+        return false;
+      cap_[1] = p;
+      if (submatch_[0].data() == NULL ||           // First match so far ...
+          (longest_ && p > submatch_[0].end())) {  // ... or better match
+        for (int i = 0; i < nsubmatch_; i++)
+          submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
+      }
+      return true;
+
+    case kInstFail:
+      return false;
+  }
+}
+
+// Runs a backtracking search of text (within context) using this program.
+// anchor and kind select the flavor of search; match/nmatch receive the
+// submatches.  Returns true if the requested kind of match was found.
+bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
+                                 const StringPiece& context,
+                                 Anchor anchor,
+                                 MatchKind kind,
+                                 StringPiece* match,
+                                 int nmatch) {
+  const bool want_full = (kind == kFullMatch);
+
+  // A full match is implemented as an anchored longest match whose end
+  // is compared against text.end() afterwards, so match[0] must exist.
+  StringPiece scratch;
+  if (want_full) {
+    anchor = kAnchored;
+    if (nmatch < 1) {
+      match = &scratch;
+      nmatch = 1;
+    }
+  }
+
+  // Run the search; anything other than kFirstMatch asks for longest.
+  Backtracker searcher(this);
+  const bool found = searcher.Search(text, context,
+                                     anchor == kAnchored,
+                                     kind != kFirstMatch,
+                                     match, nmatch);
+  if (!found)
+    return false;
+
+  // A full match must additionally reach the end of text.
+  return !(want_full && match[0].end() != text.end());
+}
+
+}  // namespace re2
--- a/re2/re2/testing/charclass_test.cc
+++ b/re2/re2/testing/charclass_test.cc
@ -0,0 +1,223 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test character class manipulations.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One character-class test case: ranges to add, an optional RemoveAbove
+// limit, and the ranges expected in the resulting class.
+struct CCTest {
+  // Ranges added to the builder, in order.  The list is terminated by
+  // the first entry whose lo is negative.
+  struct {
+    Rune lo;
+    Rune hi;
+  } add[10];
+  // When >= 0, the test calls RemoveAbove(remove) after adding the
+  // ranges; a negative value means no removal.
+  int remove;
+  // Ranges the class is expected to contain afterwards, terminated by
+  // the first entry whose lo is negative.
+  struct {
+    Rune lo;
+    Rune hi;
+  } final[10];
+};
+
+// Test cases for TestCharClassBuilder.Adds.  Each entry is
+//   { ranges to add..., {-1} }, RemoveAbove limit (or -1 for none),
+//   { expected ranges..., {-1} }
+static CCTest tests[] = {
+  // A single range comes back unchanged.
+  { { { 10, 20 }, {-1} }, -1,
+    { { 10, 20 }, {-1} } },
+
+  // Overlapping ranges are expected to merge into one.
+  { { { 10, 20 }, { 20, 30 }, {-1} }, -1,
+    { { 10, 30 }, {-1} } },
+
+  { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
+    { { 10, 40 }, {-1} } },
+
+  // A range contained in an earlier one adds nothing.
+  { { { 0, 50 }, { 20, 30 }, {-1} }, -1,
+    { { 0, 50 }, {-1} } },
+
+  // Disjoint ranges stay separate ...
+  { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
+    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+  // ... and are expected back sorted by lo whatever the insertion order.
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+  // NOTE(review): identical to the previous entry.
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+  // A final range that covers or bridges the earlier ones.
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
+    { { 5, 25 }, {-1} } },
+
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
+    { { 10, 23 }, {-1} } },
+
+  // These check boundary cases during negation.
+  { { { 0, Runemax }, {-1} }, -1,
+    { { 0, Runemax }, {-1} } },
+
+  { { { 0, 50 }, {-1} }, -1,
+    { { 0, 50 }, {-1} } },
+
+  { { { 50, Runemax }, {-1} }, -1,
+    { { 50, Runemax }, {-1} } },
+
+  // Check RemoveAbove.
+  { { { 50, Runemax }, {-1} }, 255,
+    { { 50, 255 }, {-1} } },
+
+  { { { 50, Runemax }, {-1} }, 65535,
+    { { 50, 65535 }, {-1} } },
+
+  { { { 50, Runemax }, {-1} }, Runemax,
+    { { 50, Runemax }, {-1} } },
+
+  { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
+    { { 50, 60 }, { 250, 255 }, {-1} } },
+
+  { { { 50, 60 }, {-1} }, 255,
+    { { 50, 60 }, {-1} } },
+
+  // Everything lies above the limit: the expected class is empty.
+  { { { 350, 360 }, {-1} }, 255,
+    { {-1} } },
+
+  // Nothing added at all.
+  { { {-1} }, 255,
+    { {-1} } },
+};
+
+// Prints a failure report to stdout.  With a test case t, the report
+// shows the ranges that were added, the RemoveAbove limit (if any) and
+// the expected ranges, followed by the ranges cc actually holds.  With
+// t == NULL only a "desc:" label and the ranges of cc are printed.
+template<class CharClass>
+static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
+  if (t != NULL) {
+    printf("\n");
+    printf("CharClass added: [%s]", desc);
+    int k = 0;
+    while (t->add[k].lo >= 0) {
+      printf(" %d-%d", t->add[k].lo, t->add[k].hi);
+      k++;
+    }
+    printf("\n");
+    if (t->remove >= 0)
+      printf("Removed > %d\n", t->remove);
+    printf("\twant:");
+    k = 0;
+    while (t->final[k].lo >= 0) {
+      printf(" %d-%d", t->final[k].lo, t->final[k].hi);
+      k++;
+    }
+    printf("\n");
+    printf("\thave:");
+  } else {
+    printf("\t%s:", desc);
+  }
+
+  // Ranges actually present in cc.
+  typename CharClass::iterator it = cc->begin();
+  while (it != cc->end()) {
+    printf(" %d-%d", it->lo, it->hi);
+    ++it;
+  }
+  printf("\n");
+}
+
+// Returns true if x falls inside one of the expected (final) ranges of t.
+bool ShouldContain(CCTest *t, int x) {
+  int j = 0;
+  while (t->final[j].lo >= 0) {
+    const bool inside = t->final[j].lo <= x && x <= t->final[j].hi;
+    if (inside)
+      return true;
+    j++;
+  }
+  return false;
+}
+
+// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
+
+// CharClass flavor: hands back whatever cc->Negate() returns.
+CharClass* Negate(CharClass *cc) {
+  CharClass* negated = cc->Negate();
+  return negated;
+}
+
+// CharClass flavor: disposes of cc through its own Delete() method.
+void Delete(CharClass* cc) {
+  cc->Delete();
+}
+
+// CharClassBuilder flavor: Negate() is applied to a copy obtained from
+// cc->Copy(), and that copy is returned; cc itself is not negated here.
+CharClassBuilder* Negate(CharClassBuilder* cc) {
+  CharClassBuilder* negated = cc->Copy();
+  negated->Negate();
+  return negated;
+}
+
+// CharClassBuilder flavor: builders are released with plain delete.
+void Delete(CharClassBuilder* cc) {
+  delete cc;
+}
+
+// Checks cc against the expectation recorded in t:
+//   1. iterating cc yields exactly the ranges in t->final, in order;
+//   2. cc->size() equals the total number of runes in those ranges;
+//   3. Contains() agrees with t->final for 0..99 and for Runemax;
+//   4. the negation of cc disagrees on every one of those probes and
+//      has size Runemax+1 - cc->size().
+// On the first discrepancy a report is printed (using desc as the
+// label) and false is returned; otherwise returns true.
+template<class CharClass>
+bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
+  // Step 1: walk the expected ranges and the actual ranges in lockstep.
+  typename CharClass::iterator it = cc->begin();
+  int size = 0;
+  int idx = 0;
+  while (t->final[idx].lo >= 0) {
+    const bool same_range = it != cc->end() &&
+                            it->lo == t->final[idx].lo &&
+                            it->hi == t->final[idx].hi;
+    if (!same_range) {
+      Broke(desc, t, cc);
+      return false;
+    }
+    size += it->hi - it->lo + 1;
+    idx++;
+    ++it;
+  }
+  // cc must not hold ranges beyond the expected ones.
+  if (it != cc->end()) {
+    Broke(desc, t, cc);
+    return false;
+  }
+
+  // Step 2: the reported size must match the summed range widths.
+  if (cc->size() != size) {
+    Broke(desc, t, cc);
+    printf("wrong size: want %d have %d\n", size, cc->size());
+    return false;
+  }
+
+  // Step 3: probe membership at 0..99 and finally at Runemax.
+  for (int k = 0; k <= 100; k++) {
+    const int probe = (k == 100) ? Runemax : k;
+    const bool want = ShouldContain(t, probe);
+    if (want != cc->Contains(probe)) {
+      Broke(desc, t, cc);
+      printf("want contains(%d)=%d, got %d\n",
+             probe, want, cc->Contains(probe));
+      return false;
+    }
+  }
+
+  // Step 4: the negated class must give the opposite answer everywhere
+  // and account for all remaining runes.
+  CharClass* ncc = Negate(cc);
+  for (int k = 0; k <= 100; k++) {
+    const int probe = (k == 100) ? Runemax : k;
+    const bool want = ShouldContain(t, probe);
+    if (want == ncc->Contains(probe)) {
+      Broke(desc, t, cc);
+      Broke("ncc", NULL, ncc);
+      printf("want ncc contains(%d)!=%d, got %d\n",
+             probe, want, ncc->Contains(probe));
+      Delete(ncc);
+      return false;
+    }
+    if (ncc->size() != Runemax+1 - cc->size()) {
+      Broke(desc, t, cc);
+      Broke("ncc", NULL, ncc);
+      printf("ncc size should be %d is %d\n",
+             Runemax+1 - cc->size(), ncc->size());
+      Delete(ncc);
+      return false;
+    }
+  }
+  Delete(ncc);
+  return true;
+}
+
+// Runs every entry of tests[]: builds a CharClassBuilder from the add
+// list (applying RemoveAbove when requested) and checks the builder, the
+// CharClass made from it, a copy of the builder, and the CharClass made
+// from that copy against the expected ranges.
+TEST(TestCharClassBuilder, Adds) {
+  int nfail = 0;
+  for (int i = 0; i < arraysize(tests); i++) {
+    CharClassBuilder ccb;
+    CCTest* t = &tests[i];
+    for (int j = 0; t->add[j].lo >= 0; j++)
+      ccb.AddRange(t->add[j].lo, t->add[j].hi);
+    if (t->remove >= 0)
+      ccb.RemoveAbove(t->remove);
+    if (!CorrectCC(&ccb, t, "before copy (CharClassBuilder)"))
+      nfail++;
+    CharClass* cc = ccb.GetCharClass();
+    if (!CorrectCC(cc, t, "before copy (CharClass)"))
+      nfail++;
+    cc->Delete();
+
+    CharClassBuilder *ccb1 = ccb.Copy();
+    if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)"))
+      nfail++;
+    // Build the CharClass from the copy, not from the original builder:
+    // the original was already checked above, and this is the only place
+    // where GetCharClass() on a copied builder gets exercised.
+    cc = ccb1->GetCharClass();
+    if (!CorrectCC(cc, t, "after copy (CharClass)"))
+      nfail++;
+    cc->Delete();
+    delete ccb1;
+  }
+  EXPECT_EQ(nfail, 0);
+}
+
+}  // namespace re2
--- a/re2/re2/testing/compile_test.cc
+++ b/re2/re2/testing/compile_test.cc
@ -0,0 +1,171 @@
+// Copyright 2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test prog.cc, compile.cc
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+#include "re2/prog.h"
+
+DEFINE_string(show, "", "regular expression to compile and dump");
+
+namespace re2 {
+
+// Simple input/output tests checking that
+// the regexp compiles to the expected code.
+// These are just to sanity check the basic implementation.
+// The real confidence tests happen by testing the NFA/DFA
+// that run the compiled code.
+
+// One compile test: a regular expression and the program listing that
+// compiling it is expected to produce.
+struct Test {
+  const char* regexp;  // pattern handed to Regexp::Parse
+  const char* code;    // expected output of Prog::Dump()
+};
+
+// Cases for TestRegexpCompileToProg.Simple.  Each pattern is parsed with
+// Regexp::PerlX|Regexp::Latin1 and the Dump() of the compiled program
+// must equal the listing character for character.  Byte ranges in the
+// listings are written in hex (0x61 is 'a').
+static Test tests[] = {
+  // Literals and concatenation.
+  { "a",
+    "1. byte [61-61] -> 2\n"
+    "2. match! 0\n" },
+  { "ab",
+    "1. byte [61-61] -> 2\n"
+    "2. byte [62-62] -> 3\n"
+    "3. match! 0\n" },
+  // Alternation; adjacent single bytes are expected to collapse into
+  // one byte range.
+  { "a|c",
+    "3. alt -> 1 | 2\n"
+    "1. byte [61-61] -> 4\n"
+    "2. byte [63-63] -> 4\n"
+    "4. match! 0\n" },
+  { "a|b",
+    "1. byte [61-62] -> 2\n"
+    "2. match! 0\n" },
+  { "[ab]",
+    "1. byte [61-62] -> 2\n"
+    "2. match! 0\n" },
+  // Repetition operators; the non-greedy forms swap the alt operands.
+  { "a+",
+    "1. byte [61-61] -> 2\n"
+    "2. alt -> 1 | 3\n"
+    "3. match! 0\n" },
+  { "a+?",
+    "1. byte [61-61] -> 2\n"
+    "2. alt -> 3 | 1\n"
+    "3. match! 0\n" },
+  { "a*",
+    "2. alt -> 1 | 3\n"
+    "1. byte [61-61] -> 2\n"
+    "3. match! 0\n" },
+  { "a*?",
+    "2. alt -> 3 | 1\n"
+    "3. match! 0\n"
+    "1. byte [61-61] -> 2\n" },
+  { "a?",
+    "2. alt -> 1 | 3\n"
+    "1. byte [61-61] -> 3\n"
+    "3. match! 0\n" },
+  { "a??",
+    "2. alt -> 3 | 1\n"
+    "3. match! 0\n"
+    "1. byte [61-61] -> 3\n" },
+  { "a{4}",
+    "1. byte [61-61] -> 2\n"
+    "2. byte [61-61] -> 3\n"
+    "3. byte [61-61] -> 4\n"
+    "4. byte [61-61] -> 5\n"
+    "5. match! 0\n" },
+  // Capturing and non-capturing groups.
+  { "(a)",
+    "2. capture 2 -> 1\n"
+    "1. byte [61-61] -> 3\n"
+    "3. capture 3 -> 4\n"
+    "4. match! 0\n" },
+  { "(?:a)",
+    "1. byte [61-61] -> 2\n"
+    "2. match! 0\n" },
+  // Empty pattern.
+  { "",
+    "2. match! 0\n" },
+  // Dot and negated class: 0x0a ('\n') is excluded in both listings.
+  { ".",
+    "3. alt -> 1 | 2\n"
+    "1. byte [00-09] -> 4\n"
+    "2. byte [0b-ff] -> 4\n"
+    "4. match! 0\n" },
+  { "[^ab]",
+    "5. alt -> 3 | 4\n"
+    "3. alt -> 1 | 2\n"
+    "4. byte [63-ff] -> 6\n"
+    "1. byte [00-09] -> 6\n"
+    "2. byte [0b-60] -> 6\n"
+    "6. match! 0\n" },
+  // Upper/lower pair becomes a single case-folded byte instruction.
+  { "[Aa]",
+    "1. byte/i [61-61] -> 2\n"
+    "2. match! 0\n" },
+};
+
+// Compiles every pattern in tests[] and compares the dumped program
+// with the expected listing.  Failures are logged and counted so that
+// all cases run before the final EXPECT_EQ.
+TEST(TestRegexpCompileToProg, Simple) {
+  int failed = 0;
+  for (int i = 0; i < arraysize(tests); i++) {
+    const re2::Test& t = tests[i];
+    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
+    if (re == NULL) {
+      LOG(ERROR) << "Cannot parse: " << t.regexp;
+      failed++;
+      continue;
+    }
+
+    Prog* prog = re->CompileToProg(0);
+    if (prog != NULL) {
+      // Compiling with a memory budget of 1 is expected to yield NULL.
+      CHECK(re->CompileToProg(1) == NULL);
+      const string dump = prog->Dump();
+      const bool as_expected = !(dump != t.code);
+      if (!as_expected) {
+        LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
+        LOG(ERROR) << "Want:\n" << t.code;
+        LOG(ERROR) << "Got:\n" << dump;
+        failed++;
+      }
+      delete prog;
+    } else {
+      LOG(ERROR) << "Cannot compile: " << t.regexp;
+      failed++;
+    }
+    re->Decref();
+  }
+  EXPECT_EQ(failed, 0);
+}
+
+// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+// Once, erroneously split between 0x3f and 0x40 because it is
+// a 6-bit boundary.
+//
+// The ranges are listed in ascending order and together cover every
+// byte value 0x00-0xFF exactly once, so that TestCompile.ByteRanges
+// checks the bytemap entry of every byte: each byte in utf8ranges[i]
+// is expected to map to class i.
+static struct UTF8ByteRange {
+  int lo;
+  int hi;
+} utf8ranges[] = {
+  { 0x00, 0x09 },
+  { 0x0A, 0x0A },  // '\n', the one byte dot must not match
+  { 0x0B, 0x7F },  // rest of ASCII; starts right after '\n' (no gap)
+  { 0x80, 0x8F },
+  { 0x90, 0x9F },
+  { 0xA0, 0xBF },
+  { 0xC0, 0xC1 },
+  { 0xC2, 0xDF },
+  { 0xE0, 0xE0 },
+  { 0xE1, 0xEF },
+  { 0xF0, 0xF0 },
+  { 0xF1, 0xF3 },
+  { 0xF4, 0xF4 },
+  { 0xF5, 0xFF },
+};
+
+// Compiles "." (parsed with Regexp::PerlX) and checks that the program's
+// bytemap has one class per entry of utf8ranges, with every byte of
+// utf8ranges[i] mapped to class i.
+TEST(TestCompile, ByteRanges) {
+  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
+  EXPECT_TRUE(re != NULL);
+  Prog* prog = re->CompileToProg(0);
+  EXPECT_TRUE(prog != NULL);
+  EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
+  for (int i = 0; i < arraysize(utf8ranges); i++) {
+    const UTF8ByteRange& range = utf8ranges[i];
+    int j = range.lo;
+    while (j <= range.hi) {
+      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
+      j++;
+    }
+  }
+  delete prog;
+  re->Decref();
+}
+
+}  // namespace re2
--- a/re2/re2/testing/dfa_test.cc
+++ b/re2/re2/testing/dfa_test.cc
@ -0,0 +1,343 @@
+// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "util/thread.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+DECLARE_bool(re2_dfa_bail_when_slow);
+
+DEFINE_int32(size, 8, "log2(number of DFA nodes)");
+DEFINE_int32(repeat, 2, "Repetition count.");
+DEFINE_int32(threads, 4, "number of threads");
+
+namespace re2 {
+
+// Check that multithreaded access to DFA class works.
+
+// Helper thread: builds entire DFA for prog.
+// Thread whose Run() builds the whole first-match DFA of one Prog.
+// The Prog is only borrowed: the tests that create BuildThreads delete
+// the Prog themselves after joining the threads.
+class BuildThread : public Thread {
+ public:
+  BuildThread(Prog* prog) : prog_(prog) {}
+  // Thread body: CHECK-fails unless BuildEntireDFA(Prog::kFirstMatch)
+  // returns a true value.
+  virtual void Run() {
+    CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
+  }
+
+ private:
+  Prog* prog_;  // program whose DFA is built; not owned
+};
+
+// Builds the full DFA of a[ab]{FLAGS_size}b first from a single helper
+// thread and then, FLAGS_repeat times, from FLAGS_threads helper threads
+// that all work on the same Prog at once.
+TEST(Multithreaded, BuildEntireDFA) {
+  // Create regexp with 2^FLAGS_size states in DFA.
+  string pattern = "a";
+  for (int k = FLAGS_size; k > 0; k--)
+    pattern += "[ab]";
+  pattern += "b";
+
+  // Sanity check first: one thread on its own must succeed.
+  {
+    Regexp* re = Regexp::Parse(pattern.c_str(), Regexp::LikePerl, NULL);
+    CHECK(re);
+    Prog* prog = re->CompileToProg(0);
+    CHECK(prog);
+    BuildThread* solo = new BuildThread(prog);
+    solo->SetJoinable(true);
+    solo->Start();
+    solo->Join();
+    delete solo;
+    delete prog;
+    re->Decref();
+  }
+
+  // Now let several threads build the DFA of one Prog simultaneously.
+  for (int round = 0; round < FLAGS_repeat; round++) {
+    Regexp* re = Regexp::Parse(pattern.c_str(), Regexp::LikePerl, NULL);
+    CHECK(re);
+    Prog* prog = re->CompileToProg(0);
+    CHECK(prog);
+
+    // Create all threads before starting any of them.
+    vector<BuildThread*> threads;
+    for (int j = 0; j < FLAGS_threads; j++) {
+      BuildThread* worker = new BuildThread(prog);
+      worker->SetJoinable(true);
+      threads.push_back(worker);
+    }
+    for (size_t j = 0; j < threads.size(); j++)
+      threads[j]->Start();
+    for (size_t j = 0; j < threads.size(); j++) {
+      threads[j]->Join();
+      delete threads[j];
+    }
+
+    // One more build from this thread, to make sure everything is okay.
+    prog->BuildEntireDFA(Prog::kFirstMatch);
+    delete prog;
+    re->Decref();
+  }
+}
+
+// Check that DFA size requirements are followed.
+// BuildEntireDFA will, like SearchDFA, stop building out
+// the DFA once the memory limits are reached.
+// Compiles a[ab]{30}b with memory limits of 2^17 .. 2^23 bytes, builds
+// both the first-match and longest-match DFAs, and (when the malloc
+// counter is in use) checks that heap growth lands between 90% of the
+// limit and the limit plus 16kB.
+TEST(SingleThreaded, BuildEntireDFA) {
+  // Create regexp with 2^30 states in DFA.
+  string pattern = "a";
+  for (int k = 30; k > 0; k--)
+    pattern += "[ab]";
+  pattern += "b";
+
+  Regexp* re = Regexp::Parse(pattern.c_str(), Regexp::LikePerl, NULL);
+  CHECK(re);
+  const int kMinShift = 17;
+  const int kEndShift = 24;  // exclusive
+  for (int limit = 1<<kMinShift; limit < (1<<kEndShift); limit <<= 1) {
+    int usage, progusage, dfamem;
+    {
+      testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
+      Prog* prog = re->CompileToProg(limit);
+      CHECK(prog);
+      // progusage and dfamem are only consumed by the disabled log
+      // statement below.
+      progusage = m.HeapGrowth();
+      dfamem = prog->dfa_mem();
+      prog->BuildEntireDFA(Prog::kFirstMatch);
+      prog->BuildEntireDFA(Prog::kLongestMatch);
+      usage = m.HeapGrowth();
+      delete prog;
+    }
+    //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
+    //                          limit, progusage, dfamem, usage);
+    if (UsingMallocCounter) {
+      CHECK_GT(usage, limit*9/10);
+      CHECK_LT(usage, limit + (16<<10));  // 16kB of slop okay
+    }
+  }
+  re->Decref();
+}
+
+// Generates and returns a string over binary alphabet {0,1} that contains
+// all possible binary sequences of length n as substrings.  The obvious
+// brute force method would generate a string of length n * 2^n, but this
+// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
+// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
+// Such a string is useful for testing a DFA.  If you have a DFA
+// where distinct last n bytes implies distinct states, then running on a
+// DeBruijn string causes the DFA to need to create a new state at every
+// position in the input, never reusing any states until it gets to the
+// end of the string.  This is the worst possible case for DFA execution.
+// Returns a string of '0' and '1' characters of length n + 2^n - 1 in
+// which every n-character window is distinct (enforced with CHECK).
+// n must be positive and small enough for 1<<n to fit in an int.
+static string DeBruijnString(int n) {
+  // 1<<n is computed in (signed) int below, so n may be at most two
+  // less than the number of bits in an int.
+  CHECK_LT(n, 8*sizeof(int) - 1);
+  CHECK_GT(n, 0);
+
+  // did[w] records whether the n-bit window w has been produced already.
+  // vector<bool>(count) starts out with every element false.
+  vector<bool> did(1<<n);
+
+  // Lead in with n-1 zeros so that the first appended bit completes
+  // the first full window.
+  string s;
+  for (int i = 0; i < n-1; i++)
+    s.append("0");
+  int bits = 0;               // the most recent n bits of s
+  int mask = (1<<n) - 1;
+  for (int i = 0; i < (1<<n); i++) {
+    bits <<= 1;
+    bits &= mask;
+    // Prefer appending a 1; fall back to 0 if that window was used.
+    if (!did[bits|1]) {
+      bits |= 1;
+      s.append("1");
+    } else {
+      s.append("0");
+    }
+    // Every window must be new, otherwise the construction is broken.
+    CHECK(!did[bits]);
+    did[bits] = true;
+  }
+  return s;
+}
+
+// Test that the DFA gets the right result even if it runs
+// out of memory during a search.  The regular expression
+// 0[01]{n}$ matches a binary string of 0s and 1s only if
+// the (n+1)th-to-last character is a 0.  Matching this in
+// a single forward pass (as done by the DFA) requires
+// keeping one bit for each of the last n+1 characters
+// (whether each was a 0), or 2^(n+1) possible states.
+// If we run this regexp to search in a string that contains
+// every possible n-character binary string as a substring,
+// then it will have to run through at least 2^n states.
+// States are big data structures -- certainly more than 1 byte --
+// so if the DFA can search correctly while staying within a
+// 2^n byte limit, it must be handling out-of-memory conditions
+// gracefully.
+// Searches a De Bruijn string with 0[01]{n}$ under a 2^n byte budget
+// and checks both the results and (when the malloc counter is in use)
+// that heap growth stayed below the budget.
+TEST(SingleThreaded, SearchDFA) {
+  // n is a compromise: much larger and the test runs for too long,
+  // much smaller and the DFA refuses to run because the budget is tiny
+  // compared to the program size.  Empirically, n = 18 works.
+  const int n = 18;
+
+  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
+                             Regexp::LikePerl, NULL);
+  CHECK(re);
+
+  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
+  // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
+  const string miss = DeBruijnString(n);
+  const string hit = miss + "0";
+
+  // This input makes the DFA flush its cache constantly.  By default it
+  // would notice and bail out so that RE2 can fall back to the NFA;
+  // tell it to keep going instead.
+  FLAGS_re2_dfa_bail_when_slow = false;
+
+  int64 usage;
+  int64 peak_usage;
+  {
+    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
+    Prog* prog = re->CompileToProg(1<<n);
+    CHECK(prog);
+    for (int round = 10; round > 0; round--) {
+      bool failed = false;
+      bool matched = prog->SearchDFA(hit, NULL,
+                                     Prog::kUnanchored, Prog::kFirstMatch,
+                                     NULL, &failed, NULL);
+      CHECK(!failed);
+      CHECK(matched);
+      matched = prog->SearchDFA(miss, NULL,
+                                Prog::kUnanchored, Prog::kFirstMatch,
+                                NULL, &failed, NULL);
+      CHECK(!failed);
+      CHECK(!matched);
+    }
+    usage = m.HeapGrowth();
+    peak_usage = m.PeakHeapGrowth();
+    delete prog;
+  }
+  re->Decref();
+
+  //LOG(INFO) << "usage " << usage << " " << peak_usage;
+  if (UsingMallocCounter) {
+    CHECK_LT(usage, 1<<n);
+    CHECK_LT(peak_usage, 1<<n);
+  }
+}
+
+// Helper thread: searches for match, which should match,
+// and no_match, which should not.
+// Thread whose Run() performs two rounds of DFA searches on a shared
+// Prog: one over a text that must match and one over a text that must
+// not.  Any DFA failure or unexpected result trips a CHECK.
+class SearchThread : public Thread {
+ public:
+  SearchThread(Prog* prog, const StringPiece& match,
+               const StringPiece& no_match)
+    : prog_(prog), match_(match), no_match_(no_match) {}
+
+  virtual void Run() {
+    int round = 0;
+    while (round < 2) {
+      bool failed = false;
+      bool matched = prog_->SearchDFA(match_, NULL,
+                                      Prog::kUnanchored, Prog::kFirstMatch,
+                                      NULL, &failed, NULL);
+      CHECK(!failed);
+      CHECK(matched);
+      matched = prog_->SearchDFA(no_match_, NULL,
+                                 Prog::kUnanchored, Prog::kFirstMatch,
+                                 NULL, &failed, NULL);
+      CHECK(!failed);
+      CHECK(!matched);
+      round++;
+    }
+  }
+
+ private:
+  Prog* prog_;            // program to search with
+  StringPiece match_;     // text expected to match
+  StringPiece no_match_;  // text expected not to match
+};
+
+// Same search as SingleThreaded.SearchDFA, run first from one helper
+// thread and then, FLAGS_repeat times, from FLAGS_threads helper threads
+// sharing a single Prog.
+TEST(Multithreaded, SearchDFA) {
+  // Same setup as the single-threaded test above.
+  const int n = 18;
+  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
+                             Regexp::LikePerl, NULL);
+  CHECK(re);
+  const string miss = DeBruijnString(n);
+  const string hit = miss + "0";
+  FLAGS_re2_dfa_bail_when_slow = false;
+
+  // Sanity check first: one thread on its own must succeed.
+  {
+    Prog* prog = re->CompileToProg(1<<n);
+    CHECK(prog);
+    SearchThread* solo = new SearchThread(prog, hit, miss);
+    solo->SetJoinable(true);
+    solo->Start();
+    solo->Join();
+    delete solo;
+    delete prog;
+  }
+
+  // Run the search simultaneously in a bunch of threads.
+  // Uses the same flags as Multithreaded.BuildEntireDFA above.
+  for (int round = 0; round < FLAGS_repeat; round++) {
+    Prog* prog = re->CompileToProg(1<<n);
+    CHECK(prog);
+
+    // Create all threads before starting any of them.
+    vector<SearchThread*> threads;
+    for (int j = 0; j < FLAGS_threads; j++) {
+      SearchThread* worker = new SearchThread(prog, hit, miss);
+      worker->SetJoinable(true);
+      threads.push_back(worker);
+    }
+    for (size_t j = 0; j < threads.size(); j++)
+      threads[j]->Start();
+    for (size_t j = 0; j < threads.size(); j++) {
+      threads[j]->Join();
+      delete threads[j];
+    }
+    delete prog;
+  }
+  re->Decref();
+}
+
+// One reverse-DFA test case.
+struct ReverseTest {
+  const char *regexp;  // pattern, compiled with CompileToReverseProg
+  const char *text;    // text to search
+  bool match;          // expected result of the search
+};
+
+// Test that reverse DFA handles anchored/unanchored correctly.
+// It's in the DFA interface but not used by RE2.
+// Each row is { pattern, text, expected match }; the rows pair a
+// \A-anchored and a \z-anchored pattern with texts on which they are
+// expected to succeed and to fail.
+ReverseTest reverse_tests[] = {
+  { "\\A(a|b)", "abc", true },
+  { "(a|b)\\z", "cba", true },
+  { "\\A(a|b)", "cba", false },
+  { "(a|b)\\z", "abc", false },
+};
+
+// Compiles each pattern of reverse_tests as a reverse program and checks
+// that an unanchored first-match DFA search over the text gives the
+// expected answer.  Failures are logged and counted so that all rows run.
+TEST(DFA, ReverseMatch) {
+  int nfail = 0;
+  for (int i = 0; i < arraysize(reverse_tests); i++) {
+    const ReverseTest& t = reverse_tests[i];
+    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+    CHECK(re);
+    Prog *prog = re->CompileToReverseProg(0);
+    CHECK(prog);
+    bool failed = false;
+    bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
+    if (failed) {
+      // The other DFA tests in this file insist on !failed; presumably
+      // `matched` carries no answer when the DFA reports failure, so
+      // count it as a test failure rather than comparing it.
+      LOG(ERROR) << t.regexp << " on " << t.text << ": DFA search failed";
+      nfail++;
+    } else if (matched != t.match) {
+      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
+      nfail++;
+    }
+    delete prog;
+    re->Decref();
+  }
+  EXPECT_EQ(nfail, 0);
+}
+
+}  // namespace re2
--- a/re2/re2/testing/dump.cc
+++ b/re2/re2/testing/dump.cc
@ -0,0 +1,164 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Dump the regexp into a string showing structure.
+// Tested by parse_unittest.cc
+
+// This function traverses the regexp recursively,
+// meaning that on inputs like Regexp::Simplify of
+// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
+// it takes time and space exponential in the size of the
+// original regular expression.  It can also use stack space
+// linear in the size of the regular expression for inputs
+// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
+// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
+// As a result, Dump is provided only in the testing
+// library (see BUILD).
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/stringpiece.h"
+#include "re2/regexp.h"
+
+// Cause a link error if this file is used outside of testing.
+DECLARE_string(test_tmpdir);
+
+namespace re2 {
+
+// Short display names for regexp operators.  DumpRegexpAppending indexes
+// this table with re->op(), so the entry order is significant.
+static const char* kOpcodeNames[] = {
+  "bad", "no", "emp",
+  "lit", "str",
+  "cat", "alt",
+  "star", "plus", "que", "rep",
+  "cap",
+  "dot", "byte",
+  "bol", "eol",
+  "wb",   // kRegexpWordBoundary
+  "nwb",  // kRegexpNoWordBoundary
+  "bot", "eot",
+  "cc",
+  "match",
+};
+
+// Appends the UTF-8 encoding of rune r to *s.
+static void AppendRuneUTF8(Rune r, string* s) {
+  char utf[UTFmax+1];
+  utf[runetochar(utf, &r)] = 0;
+  s->append(utf);
+}
+
+// Appends to *s a dump of re in the form name{contents}, recursing into
+// subexpressions so that the structure is explicit.
+// Nothing pretty, just for testing.
+static void DumpRegexpAppending(Regexp* re, string* s) {
+  const int op = re->op();
+
+  // Operator name: "n" prefix for non-greedy repetitions, "fold" suffix
+  // for case-folded literals that contain a letter in a-z.
+  if (op < 0 || op >= arraysize(kOpcodeNames)) {
+    StringAppendF(s, "op%d", op);
+  } else {
+    const bool repetition = op == kRegexpStar || op == kRegexpPlus ||
+                            op == kRegexpQuest || op == kRegexpRepeat;
+    if (repetition && (re->parse_flags() & Regexp::NonGreedy))
+      s->append("n");
+
+    s->append(kOpcodeNames[op]);
+
+    if (re->parse_flags() & Regexp::FoldCase) {
+      bool has_lower = false;
+      if (op == kRegexpLiteral) {
+        const Rune r = re->rune();
+        has_lower = 'a' <= r && r <= 'z';
+      } else if (op == kRegexpLiteralString) {
+        for (int k = 0; k < re->nrunes() && !has_lower; k++) {
+          const Rune r = re->runes()[k];
+          has_lower = 'a' <= r && r <= 'z';
+        }
+      }
+      if (has_lower)
+        s->append("fold");
+    }
+  }
+
+  // Operator contents, wrapped in braces.
+  s->append("{");
+  switch (op) {
+    case kRegexpEndText:
+      // An end-of-text that was not written as $ is shown as \z.
+      if (!(re->parse_flags() & Regexp::WasDollar))
+        s->append("\\z");
+      break;
+
+    case kRegexpLiteral:
+      AppendRuneUTF8(re->rune(), s);
+      break;
+
+    case kRegexpLiteralString:
+      for (int k = 0; k < re->nrunes(); k++)
+        AppendRuneUTF8(re->runes()[k], s);
+      break;
+
+    case kRegexpConcat:
+    case kRegexpAlternate:
+      for (int k = 0; k < re->nsub(); k++)
+        DumpRegexpAppending(re->sub()[k], s);
+      break;
+
+    case kRegexpRepeat:
+      StringAppendF(s, "%d,%d ", re->min(), re->max());
+      DumpRegexpAppending(re->sub()[0], s);
+      break;
+
+    case kRegexpCapture:
+      if (re->name()) {
+        s->append(*re->name());
+        s->append(":");
+      }
+      DumpRegexpAppending(re->sub()[0], s);
+      break;
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      DumpRegexpAppending(re->sub()[0], s);
+      break;
+
+    case kRegexpCharClass: {
+      // Space-separated hex ranges; a single-rune range prints one number.
+      const char* sep = "";
+      for (CharClass::iterator it = re->cc()->begin();
+           it != re->cc()->end(); ++it) {
+        const RuneRange range = *it;
+        s->append(sep);
+        sep = " ";
+        if (range.lo == range.hi)
+          StringAppendF(s, "%#x", range.lo);
+        else
+          StringAppendF(s, "%#x-%#x", range.lo, range.hi);
+      }
+      break;
+    }
+
+    default:
+      break;
+  }
+  s->append("}");
+}
+
+// Returns the structural dump of this regexp.  When the test_tmpdir flag
+// is empty, logs an error and returns an empty string instead.
+string Regexp::Dump() {
+  string out;
+
+  if (!FLAGS_test_tmpdir.empty()) {
+    DumpRegexpAppending(this, &out);
+  } else {
+    // Not being called from a unit test: refuse to dump.
+    LOG(ERROR) << "Cannot use except for testing.";
+  }
+  return out;
+}
+
+}  // namespace re2
--- a/re2/re2/testing/exhaustive1_test.cc
+++ b/re2/re2/testing/exhaustive1_test.cc
@ -0,0 +1,42 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+DECLARE_string(regexp_engines);
+
+namespace re2 {
+
+// Exercises the repetition operators on plain atoms, first over strings
+// of a and b and then over longer strings of a alone.
+TEST(Repetition, Simple) {
+  static const char kRepetitionOps[] =
+    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
+    "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
+    "%s* %s+ %s? %s*? %s+? %s??";
+  const vector<string> ops = Split(" ", kRepetitionOps);
+  const vector<string> atoms = Explode("abc.");
+
+  ExhaustiveTest(3, 2, atoms, ops, 6, Explode("ab"), "(?:%s)", "");
+  ExhaustiveTest(3, 2, atoms, ops, 40, Explode("a"), "(?:%s)", "");
+}
+
+// Exercises the repetition operators around capturing parens -- (a).
+TEST(Repetition, Capturing) {
+  static const char kRepetitionOps[] =
+    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
+    "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
+    "%s* %s+ %s? %s*? %s+? %s??";
+  const vector<string> ops = Split(" ", kRepetitionOps);
+
+  ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
+                 7, Explode("ab"), "(?:%s)", "");
+
+  // This would be a great test, but it runs forever when PCRE is enabled.
+  // It runs only when the regexp_engines flag value is not a substring
+  // of "PCRE"; note that an empty flag value is such a substring.
+  const bool flag_selects_pcre =
+      strstr("PCRE", FLAGS_regexp_engines.c_str()) != NULL;
+  if (!flag_selects_pcre)
+    ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops,
+                   100, Explode("a"), "(?:%s)", "");
+}
+
+}  // namespace re2
+
--- a/re2/re2/testing/exhaustive2_test.cc
+++ b/re2/re2/testing/exhaustive2_test.cc
@ -0,0 +1,70 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/re2.h"
+#include "re2/testing/exhaustive_tester.h"
+
+DECLARE_string(regexp_engines);
+
+namespace re2 {
+
+// Exercises the empty regexp, written "(?:)", alongside the literal a.
+TEST(EmptyString, Exhaustive) {
+  const vector<string> atoms = Split(" ", "(?:) a");
+  const vector<string> letters = Split("", "ab");
+  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
+                 5, letters, "", "");
+}
+
+// Exercises backslash-escaped regexp punctuation as atoms, matched
+// against strings over the same punctuation characters.
+TEST(Punctuation, Literals) {
+  const vector<string> alphabet = Explode("()*+?{}[]\\^$.");
+
+  // Each atom is one punctuation character preceded by a backslash.
+  vector<string> escaped;
+  for (size_t k = 0; k < alphabet.size(); k++)
+    escaped.push_back("\\" + alphabet[k]);
+
+  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
+                 2, alphabet, "", "");
+}
+
+// Exercises ^ $ . \A \z on text containing line endings.
+// The empty-width atoms are wrapped in (?:) so that they can be
+// repeated -- PCRE rejects ^* but allows (?:^)*
+TEST(LineEnds, Exhaustive) {
+  const vector<string> atoms =
+      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
+  const vector<string> letters = Explode("ab\n");
+  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
+                 4, letters, "", "");
+}
+
+// Test what does and does not match \n.
+// This would be a good test, except that PCRE seems to have a bug:
+// in single-byte character set mode (the default),
+// [^a] matches \n, but in UTF-8 mode it does not.
+// So when we run the test, the tester complains that
+// we don't agree with PCRE, but it's PCRE that is at fault.
+// For what it's worth, Perl gets this right (matches
+// regardless of whether UTF-8 input is selected):
+//
+//     #!/usr/bin/perl
+//     use POSIX qw(locale_h);
+//     print "matches in latin1\n" if "\n" =~ /[^a]/;
+//     setlocale("en_US.utf8");
+//     print "matches in utf8\n" if "\n" =~ /[^a]/;
+//
+// The rule chosen for RE2 is that by default, like Perl,
+// dot does not match \n but negated character classes [^a] do.
+// (?s) will allow dot to match \n; there is no way in RE2
+// to stop [^a] from matching \n, though the underlying library
+// provides a mechanism, and RE2 could add new syntax if needed.
+//
+// TEST(Newlines, Exhaustive) {
+//   vector<string> empty_vector;
+//   ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
+//                  RegexpGenerator::EgrepOps(),
+//                  4, Explode("a\n"), "");
+// }
+
+}  // namespace re2
+
--- a/re2/re2/testing/exhaustive3_test.cc
+++ b/re2/re2/testing/exhaustive3_test.cc
@ -0,0 +1,94 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// Exercises simple character classes on their own.
+TEST(CharacterClasses, Exhaustive) {
+  static const char kClassAtoms[] =
+    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
+  const vector<string> atoms = Split(" ", kClassAtoms);
+  const vector<string> letters = Explode("ab");
+  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
+                 5, letters, "", "");
+}
+
+// Exercises simple character classes wrapped as a___b (for example, a[a]b).
+TEST(CharacterClasses, ExhaustiveAB) {
+  static const char kClassAtoms[] =
+    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
+  const vector<string> atoms = Split(" ", kClassAtoms);
+  const vector<string> letters = Explode("ab");
+  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
+                 5, letters, "a%sb", "");
+}
+
+// Returns the UTF-8 encoding of rune r as a string.
+static string UTF8(Rune r) {
+  char encoded[UTFmax+1];
+  const int len = runetochar(encoded, &r);
+  encoded[len] = 0;  // NUL-terminate for the string constructor
+  return string(encoded);
+}
+
+// Returns a vector of "interesting" UTF8 characters.
+// Unicode is now too big to just return all of them,
+// so this returns a set likely to be good test cases.
+// The vector is built on the first call and reused afterwards.
+static const vector<string>& InterestingUTF8() {
+  static bool built;
+  static vector<string> chars;
+
+  if (!built) {
+    built = true;
+
+    // All the Latin1 equivalents are interesting.
+    for (int c = 1; c < 256; c++)
+      chars.push_back(UTF8(c));
+
+    // After that, the codes near bit boundaries are
+    // interesting, because they span byte sequence lengths.
+    for (int delta = 0; delta < 8; delta++)
+      chars.push_back(UTF8(256 + delta));
+    for (int base = 512; base < Runemax; base *= 2)
+      for (int delta = -8; delta <= 7; delta++)
+        chars.push_back(UTF8(base + delta));
+
+    // The codes near Runemax, including Runemax itself, are interesting.
+    for (int back = 8; back >= 0; back--)
+      chars.push_back(UTF8(Runemax - back));
+  }
+
+  return chars;
+}
+
+// Matches each interesting UTF-8 character against single atoms:
+// escapes, empty-width assertions and character classes.
+TEST(InterestingUTF8, SingleOps) {
+  static const char kAtoms[] =
+    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
+    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
+    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
+    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
+  const vector<string> atoms = Split(" ", kAtoms);
+  vector<string> no_ops;  // deliberately empty
+  ExhaustiveTest(1, 0, atoms, no_ops,
+                 1, InterestingUTF8(), "", "");
+}
+
+// Same atoms as SingleOps, but both the regexp and the text are
+// wrapped between a and b.
+TEST(InterestingUTF8, AB) {
+  static const char kAtoms[] =
+    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
+    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
+    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
+    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
+  const vector<string> atoms = Split(" ", kAtoms);
+  vector<string> no_ops;  // deliberately empty
+
+  // Text alphabet: every interesting character surrounded by a and b.
+  const vector<string>& chars = InterestingUTF8();
+  vector<string> wrapped;
+  for (size_t k = 0; k < chars.size(); k++)
+    wrapped.push_back("a" + chars[k] + "b");
+
+  ExhaustiveTest(1, 0, atoms, no_ops,
+                 1, wrapped, "a%sb", "");
+}
+
+}  // namespace re2
+
--- a/re2/re2/testing/exhaustive_test.cc
+++ b/re2/re2/testing/exhaustive_test.cc
@ -0,0 +1,38 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+DECLARE_string(regexp_engines);
+
+// Exercises very simple lowercase expressions.
+TEST(EgrepLiterals, Lowercase) {
+  const string regexp_alphabet = "abc.";
+  const string text_alphabet = "abc";
+  EgrepTest(3, 2, regexp_alphabet, 3, text_alphabet, "");
+}
+
+// Exercises expressions that mix upper and lower case.
+TEST(EgrepLiterals, MixedCase) {
+  const string regexp_alphabet = "AaBb.";
+  const string text_alphabet = "AaBb";
+  EgrepTest(3, 2, regexp_alphabet, 2, text_alphabet, "");
+}
+
+// Exercises mixed-case expressions in case-insensitive mode.
+TEST(EgrepLiterals, FoldCase) {
+  // The punctuation characters surround A-Z and a-z
+  // in the ASCII table.  This looks for bugs in the
+  // bytemap range code in the DFA.
+  const string regexp_alphabet = "abAB.";
+  const string text_alphabet = "aBc@_~";
+  EgrepTest(3, 2, regexp_alphabet, 2, text_alphabet, "(?i:%s)");
+}
+
+// Exercises very simple expressions on text with a multi-byte character.
+TEST(EgrepLiterals, UTF8) {
+  const string regexp_alphabet = "ab.";
+  const string text_alphabet = "a\xE2\x98\xBA";
+  EgrepTest(3, 2, regexp_alphabet, 4, text_alphabet, "");
+}
+
+}  // namespace re2
+
--- a/re2/re2/testing/exhaustive_tester.cc
+++ b/re2/re2/testing/exhaustive_tester.cc
@ -0,0 +1,188 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+// Each test picks an alphabet (e.g., "abc"), a maximum string length,
+// a maximum regular expression length, and a maximum number of letters
+// that can appear in the regular expression.  Given these parameters,
+// it tries every possible regular expression and string, verifying that
+// the NFA, DFA, and a trivial backtracking implementation agree about
+// the location of the match.
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifndef LOGGING
+#define LOGGING 0
+#endif
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+#include "re2/testing/tester.h"
+
+DEFINE_bool(show_regexps, false, "show regexps during testing");
+
+DEFINE_int32(max_bad_regexp_inputs, 1,
+             "Stop testing a regular expression after finding this many "
+             "strings that break it.");
+
+// Compiled in debug mode, the usual tests run for over an hour.
+// Have to cut it down to make the unit test machines happy.
+DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
+
+namespace re2 {
+
+// Returns sp as a double-quoted string with backslash, double quote and
+// newline escaped.  The result lives in a static buffer that the next
+// call overwrites; input that does not fit is a fatal error.
+static char* escape(const StringPiece& sp) {
+  static char buf[512];
+  char* const limit = buf + sizeof buf;
+  char* out = buf;
+
+  *out++ = '\"';
+  for (int k = 0; k < sp.size(); k++) {
+    if (out + 5 >= limit)
+      LOG(FATAL) << "ExhaustiveTester escape: too long";
+    const char c = sp[k];
+    switch (c) {
+      case '\\':
+      case '\"':
+        *out++ = '\\';
+        *out++ = c;
+        break;
+      case '\n':
+        *out++ = '\\';
+        *out++ = 'n';
+        break;
+      default:
+        *out++ = c;
+        break;
+    }
+  }
+  *out++ = '\"';
+  *out = '\0';
+  return buf;
+}
+
+// Matches re against input with the given anchor and prints the result:
+// "-" when there is no match, otherwise the n groups in m separated by
+// spaces, each as begin-end offsets into input, or "-" for an unset group.
+static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
+  const bool matched = re.Match(input, 0, input.size(), anchor, m, n);
+  if (!matched) {
+    printf("-");
+    return;
+  }
+
+  for (int g = 0; g < n; g++) {
+    if (g != 0)
+      printf(" ");
+    if (m[g].begin() == NULL) {
+      printf("-");
+      continue;
+    }
+    const int lo = static_cast<int>(m[g].begin() - input.begin());
+    const int hi = static_cast<int>(m[g].end() - input.begin());
+    printf("%d-%d", lo, hi);
+  }
+}
+	
+// Processes a single generated regexp.
+// In the normal build, hands it to a Tester and feeds that every
+// generated string, counting tests and failures.  When LOGGING is set,
+// prints the strings, the regexp and RE2's match results instead.
+void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
+  regexps_++;
+  const string regexp = topwrapper_.empty()
+      ? const_regexp
+      : StringPrintf(topwrapper_.c_str(), const_regexp.c_str());
+
+  if (FLAGS_show_regexps) {
+    printf("\r%s", regexp.c_str());
+    fflush(stdout);
+  }
+
+  if (LOGGING) {
+    // Write out test cases and answers for use in testing
+    // other implementations, such as Go's regexp package.
+    if (randomstrings_)
+      LOG(ERROR) << "Cannot log with random strings.";
+
+    // The list of strings is printed once, before the first regexp.
+    const bool first_regexp = (regexps_ == 1);
+    if (first_regexp) {
+      printf("strings\n");
+      for (strgen_.Reset(); strgen_.HasNext(); )
+        printf("%s\n", escape(strgen_.Next()));
+      printf("regexps\n");
+    }
+    printf("%s\n", escape(regexp));
+
+    RE2 re(regexp);
+    RE2::Options longest_opts;
+    longest_opts.set_longest_match(true);
+    RE2 relongest(regexp, longest_opts);
+    const int ngroup = re.NumberOfCapturingGroups()+1;
+    StringPiece* group = new StringPiece[ngroup];
+
+    // One line per string: anchored and unanchored results, first for
+    // the default regexp and then for the longest-match one.
+    for (strgen_.Reset(); strgen_.HasNext(); ) {
+      StringPiece input = strgen_.Next();
+      PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
+      printf(";");
+      PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
+      printf(";");
+      PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
+      printf(";");
+      PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
+      printf("\n");
+    }
+    delete[] group;
+    return;
+  }
+
+  Tester tester(regexp);
+  if (tester.error())
+    return;
+
+  strgen_.Reset();
+  strgen_.GenerateNULL();
+  if (randomstrings_)
+    strgen_.Random(stringseed_, stringcount_);
+
+  // Stop early once this regexp has failed on enough inputs.
+  int bad_inputs = 0;
+  while (strgen_.HasNext()) {
+    tests_++;
+    if (tester.TestInput(strgen_.Next()))
+      continue;
+    failures_++;
+    bad_inputs++;
+    if (bad_inputs >= FLAGS_max_bad_regexp_inputs)
+      break;
+  }
+}
+
+// Runs an exhaustive test with the given parameters and expects zero
+// failures.  In debug mode with quick_debug_mode set, each size limit
+// above 1 is reduced by one first.
+void ExhaustiveTest(int maxatoms, int maxops,
+                    const vector<string>& alphabet,
+                    const vector<string>& ops,
+                    int maxstrlen, const vector<string>& stralphabet,
+                    const string& wrapper,
+                    const string& topwrapper) {
+  const bool shrink = DEBUG_MODE && FLAGS_quick_debug_mode;
+  if (shrink) {
+    maxatoms -= (maxatoms > 1) ? 1 : 0;
+    maxops -= (maxops > 1) ? 1 : 0;
+    maxstrlen -= (maxstrlen > 1) ? 1 : 0;
+  }
+
+  ExhaustiveTester tester(maxatoms, maxops, alphabet, ops,
+                          maxstrlen, stralphabet, wrapper,
+                          topwrapper);
+  tester.Generate();
+
+  // The summary line is suppressed when LOGGING is set.
+  if (!LOGGING) {
+    printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
+           tester.regexps(), tester.tests(), tester.failures(),
+           maxstrlen, static_cast<int>(stralphabet.size()));
+  }
+  EXPECT_EQ(0, tester.failures());
+}
+
+// Runs ExhaustiveTest with the basic egrep operators, once for each
+// top-level wrapper: none, ^ anchored, $ anchored, and both.
+void EgrepTest(int maxatoms, int maxops, const string& alphabet,
+               int maxstrlen, const string& stralphabet,
+               const string& wrapper) {
+  static const char* const kTopWrappers[] = {
+    "", "^(?:%s)", "(?:%s)$", "^(?:%s)$"
+  };
+
+  // The alphabets are split into one-character strings once, up front.
+  const vector<string> atoms = Split("", alphabet);
+  const vector<string> letters = Split("", stralphabet);
+
+  for (int k = 0; k < arraysize(kTopWrappers); k++) {
+    ExhaustiveTest(maxatoms, maxops,
+                   atoms,
+                   RegexpGenerator::EgrepOps(),
+                   maxstrlen,
+                   letters,
+                   wrapper,
+                   kTopWrappers[k]);
+  }
+}
+
+}  // namespace re2
--- a/re2/re2/testing/exhaustive_tester.h
+++ b/re2/re2/testing/exhaustive_tester.h
@ -0,0 +1,85 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
+#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
+
+#include <string>
+#include <vector>
+#include "util/util.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Exhaustive regular expression test: generates every regexp within the
+// given limits and every string up to a given length over a given
+// alphabet, then checks that NFA, DFA, and PCRE agree about whether each
+// regexp matches each string, and if so, where the match is.
+//
+// RandomStrings switches the string side to a "random" mode that
+// generates a given number of random strings instead, allowing
+// testing of larger inputs.
+class ExhaustiveTester : public RegexpGenerator {
+ public:
+  ExhaustiveTester(int maxatoms,
+                   int maxops,
+                   const vector<string>& alphabet,
+                   const vector<string>& ops,
+                   int maxstrlen,
+                   const vector<string>& stralphabet,
+                   const string& wrapper,
+                   const string& topwrapper)
+    : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+      strgen_(maxstrlen, stralphabet),
+      wrapper_(wrapper),
+      topwrapper_(topwrapper),
+      regexps_(0),
+      tests_(0),
+      failures_(0),
+      randomstrings_(false),
+      stringseed_(0),
+      stringcount_(0) { }
+
+  // Counters accumulated by HandleRegexp.
+  int regexps()  { return regexps_; }
+  int tests()    { return tests_; }
+  int failures() { return failures_; }
+
+  // Needed for RegexpGenerator interface.
+  void HandleRegexp(const string& regexp);
+
+  // Switches testing to count random input strings generated from seed.
+  void RandomStrings(int32 seed, int32 count) {
+    stringseed_ = seed;
+    stringcount_ = count;
+    randomstrings_ = true;
+  }
+
+ private:
+  StringGenerator strgen_;  // Source of input strings.
+  string wrapper_;      // Regexp wrapper - either empty or has one %s.
+  string topwrapper_;   // Regexp top-level wrapper.
+  int regexps_;   // Number of HandleRegexp calls
+  int tests_;     // Number of regexp tests.
+  int failures_;  // Number of tests failed.
+
+  bool randomstrings_;  // Whether to use random strings
+  int32 stringseed_;    // If so, the seed.
+  int stringcount_;     // If so, how many to generate.
+  DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester);
+};
+
+// Runs an exhaustive test: regexps built from alphabet and ops within
+// the maxatoms/maxops limits, against strings of up to maxstrlen
+// characters drawn from stralphabet.
+void ExhaustiveTest(int maxatoms,
+                    int maxops,
+                    const vector<string>& alphabet,
+                    const vector<string>& ops,
+                    int maxstrlen,
+                    const vector<string>& stralphabet,
+                    const string& wrapper,
+                    const string& topwrapper);
+
+// Runs an exhaustive test with the basic egrep operators; alphabet and
+// stralphabet are given as strings of single characters.
+void EgrepTest(int maxatoms,
+               int maxops,
+               const string& alphabet,
+               int maxstrlen,
+               const string& stralphabet,
+               const string& wrapper);
+
+}  // namespace re2
+
+#endif  // RE2_TESTING_EXHAUSTIVE_TESTER_H__
--- a/re2/re2/testing/filtered_re2_test.cc
+++ b/re2/re2/testing/filtered_re2_test.cc
@ -0,0 +1,258 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+
+DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
+
+namespace re2 {
+
+// Bundle of the objects each FilteredRE2 test works with.
+struct FilterTestVars {
+  vector<string> atoms;      // filled in by f.Compile
+  vector<int> atom_indices;  // atom indices passed to f.AllMatches
+  vector<int> matches;       // filled in by f.AllMatches
+  RE2::Options opts;         // options passed to f.Add
+  FilteredRE2 f;             // the filter under test
+};
+
+// A filter with no regexps added reports no matches.
+TEST(FilteredRE2Test, EmptyTest) {
+  FilterTestVars vars;
+  vars.f.AllMatches("foo", vars.atom_indices, &vars.matches);
+  EXPECT_EQ(0, vars.matches.size());
+}
+
+// With a minimum atom length of 4, (foo|bar) yields no atoms, yet the
+// regexp is still reported as matching.
+TEST(FilteredRE2Test, SmallOrTest) {
+  FLAGS_filtered_re2_min_atom_len = 4;
+
+  FilterTestVars vars;
+  int regexp_id;
+  vars.f.Add("(foo|bar)", vars.opts, &regexp_id);
+  vars.f.Compile(&vars.atoms);
+  EXPECT_EQ(0, vars.atoms.size());
+
+  vars.f.AllMatches("lemurs bar", vars.atom_indices, &vars.matches);
+  EXPECT_EQ(1, vars.matches.size());
+  EXPECT_EQ(regexp_id, vars.matches[0]);
+}
+
+// One atom-extraction test: a set of regexps and the atoms expected
+// from compiling them.  Unused array slots are NULL, which is how the
+// tests find the end of each list.
+struct AtomTest {
+  const char* testname;
+  // If any test needs more than this many regexps or atoms, increase
+  // the size of the corresponding array.
+  const char* regexps[20];
+  const char* atoms[20];
+};
+
+// Atom-extraction cases.  MatchEmptyPattern and MatchTests look up
+// entries 0 and 2 by index, so the order must not change.
+AtomTest atom_tests[] = {
+  {
+    // Empty patterns are allowed.
+    "CheckEmptyPattern",
+    { "" },
+    {}
+  },
+  {
+    // All atoms longer than the minimum length are found, and no
+    // shorter atoms are.
+    "AllAtomsGtMinLengthFound",
+    {
+      "(abc123|def456|ghi789).*mnop[x-z]+",
+      "abc..yyy..zz",
+      "mnmnpp[a-z]+PPP"
+    },
+    {
+      "abc123", "def456", "ghi789", "mnop",
+      "abc", "yyy",
+      "mnmnpp", "ppp"
+    }
+  },
+  {
+    // Atoms that have another atom as a substring in an OR are
+    // removed; that is, only the shortest substring is kept.
+    "SubstrAtomRemovesSuperStrInOr",
+    {
+      "(abc123|abc|ghi789|abc1234).*[x-z]+",
+      "abcd..yyy..yyyzzz",
+      "mnmnpp[a-z]+PPP"
+    },
+    {
+      "abc", "ghi789",
+      "abcd", "yyy", "yyyzzz",
+      "mnmnpp", "ppp"
+    }
+  },
+  {
+    // Character class expansion.
+    "CharClassExpansion",
+    {
+      "m[a-c][d-f]n.*[x-z]+",
+      "[x-y]bcde[ab]"
+    },
+    {
+      "madn", "maen", "mafn",
+      "mbdn", "mben", "mbfn",
+      "mcdn", "mcen", "mcfn",
+      "xbcdea", "xbcdeb",
+      "ybcdea", "ybcdeb"
+    }
+  },
+  {
+    // Upper/lower casing of non-ASCII text.
+    "UnicodeLower",
+    {
+      "(?i)ΔδΠϖπΣςσ",
+      "ΛΜΝΟΠ",
+      "ψρστυ",
+    },
+    {
+      "δδπππσσσ",
+      "λμνοπ",
+      "ψρστυ",
+    },
+  },
+};
+
+// Adds the first n regexps to v->f using v->opts, then compiles the
+// filter, storing the resulting atoms in v->atoms.
+void AddRegexpsAndCompile(const char* regexps[],
+                          int n,
+                          struct FilterTestVars* v) {
+  int k = 0;
+  while (k < n) {
+    int unused_id;  // Add reports an id; it is not needed here
+    v->f.Add(regexps[k], v->opts, &unused_id);
+    k++;
+  }
+  v->f.Compile(&v->atoms);
+}
+
+// Returns whether v->atoms holds exactly the n expected atoms, ignoring
+// order.  Sorts v->atoms in place.  On mismatch, logs both lists under
+// testname.
+bool CheckExpectedAtoms(const char* atoms[],
+                        int n,
+                        const char* testname,
+                        struct FilterTestVars* v) {
+  vector<string> expected(atoms, atoms + n);
+
+  sort(expected.begin(), expected.end());
+  sort(v->atoms.begin(), v->atoms.end());
+
+  // Sizes must agree before comparing element by element.
+  bool pass = (expected.size() == v->atoms.size());
+  for (int k = 0; pass && k < n; k++)
+    pass = (expected[k] == v->atoms[k]);
+
+  if (pass)
+    return true;
+
+  LOG(WARNING) << "Failed " << testname;
+  LOG(WARNING) << "Expected #atoms = " << expected.size();
+  for (int k = 0; k < expected.size(); k++)
+    LOG(WARNING) << expected[k];
+  LOG(WARNING) << "Found #atoms = " << v->atoms.size();
+  for (int k = 0; k < v->atoms.size(); k++)
+    LOG(WARNING) << v->atoms[k];
+  return false;
+}
+
+// Compiles the regexps of every atom_tests entry with a minimum atom
+// length of 3 and checks the atoms produced against the expected list.
+TEST(FilteredRE2Test, AtomTests) {
+  FLAGS_filtered_re2_min_atom_len = 3;
+
+  int failures = 0;
+  for (int k = 0; k < arraysize(atom_tests); k++) {
+    FilterTestVars vars;
+    AtomTest* t = &atom_tests[k];
+
+    // Both lists end at the first NULL slot, or at the array bound.
+    int nregexp = 0;
+    while (nregexp < arraysize(t->regexps) && t->regexps[nregexp] != NULL)
+      nregexp++;
+    int natom = 0;
+    while (natom < arraysize(t->atoms) && t->atoms[natom] != NULL)
+      natom++;
+
+    AddRegexpsAndCompile(t->regexps, nregexp, &vars);
+    if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &vars))
+      failures++;
+  }
+  EXPECT_EQ(0, failures);
+}
+
+// Replaces *atom_indices with the index in atoms of each entry of
+// matched_atoms, in order.  A matched atom that does not occur in atoms
+// contributes no index and trips the EXPECT below.
+void FindAtomIndices(const vector<string>& atoms,
+                     const vector<string>& matched_atoms,
+                     vector<int>* atom_indices) {
+  atom_indices->clear();
+  for (int i = 0; i < matched_atoms.size(); i++) {
+    int j = 0;
+    for (; j < atoms.size(); j++) {
+      if (matched_atoms[i] == atoms[j]) {
+        atom_indices->push_back(j);
+        break;
+      }
+    }
+    // j reaches atoms.size() only when the search found nothing.  The
+    // check has to sit after the loop: inside it, the loop condition
+    // already guarantees j < atoms.size(), so it could never fire.
+    EXPECT_LT(j, atoms.size());
+  }
+}
+
+// Checks that the empty pattern is reported as the first match even
+// when no atoms are passed in.
+TEST(FilteredRE2Test, MatchEmptyPattern) {
+  FLAGS_filtered_re2_min_atom_len = 3;
+
+  FilterTestVars v;
+  AtomTest* t = &atom_tests[0];
+  // We are using the regexps used in one of the atom tests
+  // for this test. Adding the EXPECT here to make sure
+  // the index we use for the test is for the correct test.
+  EXPECT_EQ("CheckEmptyPattern", string(t->testname));
+
+  // The regexp list ends at the first NULL slot, or at the array bound.
+  int nregexp;
+  for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+    if (t->regexps[nregexp] == NULL)
+      break;
+  AddRegexpsAndCompile(t->regexps, nregexp, &v);
+
+  string text = "0123";
+  vector<int> atom_ids;  // deliberately left empty
+  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
+}
+
+// Feeds AllMatches three texts together with the atoms each contains
+// and checks how many regexps are reported as matching.
+TEST(FilteredRE2Test, MatchTests) {
+  FLAGS_filtered_re2_min_atom_len = 3;
+
+  FilterTestVars vars;
+  AtomTest* t = &atom_tests[2];
+  // The regexps come from one of the atom tests; make sure the index
+  // used above still refers to the intended one.
+  EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
+  int nregexp = 0;
+  while (nregexp < arraysize(t->regexps) && t->regexps[nregexp] != NULL)
+    nregexp++;
+  AddRegexpsAndCompile(t->regexps, nregexp, &vars);
+
+  vector<int> atom_ids;
+  vector<int> matching_regexps;
+
+  // First text: only the atom abc is present.
+  string text = "abc121212xyz";
+  vector<string> present(1, "abc");
+  FindAtomIndices(vars.atoms, present, &atom_ids);
+  vars.f.AllMatches(text, atom_ids, &matching_regexps);
+  EXPECT_EQ(1, matching_regexps.size());
+
+  // Second text: abc, yyy and yyyzzz are present.
+  static const char* const kSecond[] = { "abc", "yyy", "yyyzzz" };
+  text = "abc12312yyyzzz";
+  present.assign(kSecond, kSecond + arraysize(kSecond));
+  FindAtomIndices(vars.atoms, present, &atom_ids);
+  vars.f.AllMatches(text, atom_ids, &matching_regexps);
+  EXPECT_EQ(1, matching_regexps.size());
+
+  // Third text: abc, abcd, yyy and yyyzzz are present.
+  static const char* const kThird[] = { "abc", "abcd", "yyy", "yyyzzz" };
+  text = "abcd12yyy32yyyzzz";
+  present.assign(kThird, kThird + arraysize(kThird));
+  FindAtomIndices(vars.atoms, present, &atom_ids);
+  LOG(INFO) << "S: " << atom_ids.size();
+  for (int k = 0; k < atom_ids.size(); k++)
+    LOG(INFO) << "i: " << k << " : " << atom_ids[k];
+  vars.f.AllMatches(text, atom_ids, &matching_regexps);
+  EXPECT_EQ(2, matching_regexps.size());
+}
+
+}  //  namespace re2
--- a/re2/re2/testing/mimics_pcre_test.cc
+++ b/re2/re2/testing/mimics_pcre_test.cc
@ -0,0 +1,76 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One MimicsPCRE test case.
+struct PCRETest {
+  const char* regexp;  // pattern to parse
+  bool should_match;   // expected result of Regexp::MimicsPCRE()
+};
+
+// Patterns paired with the expected answer from MimicsPCRE().
+static PCRETest tests[] = {
+  // Most things should behave exactly.
+  { "abc",       true  },
+  { "(a|b)c",    true  },
+  { "(a*|b)c",   true  },
+  { "(a|b*)c",   true  },
+  { "a(b|c)d",   true  },
+  { "a(()|())c", true  },
+  { "ab*c",      true  },
+  { "ab+c",      true  },
+  { "a(b*|c*)d", true  },
+  { "\\W",       true  },
+  { "\\W{1,2}",  true  },
+  { "\\d",       true  },
+
+  // Repeating something that can match the empty string does not.
+  { "(a*)*",     false },
+  { "x(a*)*y",   false },
+  { "(a*)+",     false },
+  { "(a+)*",     true  },
+  { "(a+)+",     true  },
+  { "(a+)+",     true  },
+
+  // Of these escapes, only \v is expected not to mimic PCRE.
+  { "\\b",       true  },
+  { "\\v",       false },
+  { "\\d",       true  },
+
+  // The handling of ^ in multi-line mode is different, as is
+  // the handling of $ in single-line mode.  (Both involve
+  // boundary cases if the string ends with \n.)
+  { "\\A",       true  },
+  { "\\z",       true  },
+  { "(?m)^",     false },
+  { "(?m)$",     true  },
+  { "(?-m)^",    true  },
+  { "(?-m)$",    false },  // In PCRE, == \Z
+  { "(?m)\\A",   true  },
+  { "(?m)\\z",   true  },
+  { "(?-m)\\A",  true  },
+  { "(?-m)\\z",  true  },
+};
+
+// Parses every pattern twice, first with Latin1 added to the flags and
+// then without, and checks MimicsPCRE() against the table each time.
+TEST(MimicsPCRE, SimpleTests) {
+  for (int k = 0; k < arraysize(tests); k++) {
+    const PCRETest& tc = tests[k];
+    for (int pass = 0; pass < 2; pass++) {
+      const bool latin1 = (pass == 0);
+
+      Regexp::ParseFlags flags = Regexp::LikePerl;
+      if (latin1)
+        flags = flags | Regexp::Latin1;
+
+      Regexp* parsed = Regexp::Parse(tc.regexp, flags, NULL);
+      CHECK(parsed) << " " << tc.regexp;
+      CHECK_EQ(tc.should_match, parsed->MimicsPCRE())
+        << " " << tc.regexp << " "
+        << (latin1 ? "latin1" : "utf");
+      parsed->Decref();
+    }
+  }
+}
+
+}  // namespace re2
--- a/re2/re2/testing/null_walker.cc
+++ b/re2/re2/testing/null_walker.cc
@ -0,0 +1,44 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Null walker.  For benchmarking the walker itself.
+// Its PostVisit ignores its arguments and always returns false, so a walk
+// performs no per-node work of its own.
+
+class NullWalker : public Regexp::Walker<bool> {
+ public:
+  NullWalker() { }
+  // Called after visiting re's children; defined out of line.
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                 bool* child_args, int nchild_args);
+
+  // Logs a DFATAL message and returns its argument a unchanged.
+  bool ShortVisit(Regexp* re, bool a) {
+    // Should never be called: we use Walk not WalkExponential.
+    LOG(DFATAL) << "NullWalker::ShortVisit called";
+    return a;
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(NullWalker);
+};
+
+// Called after visiting re's children.  This walker does no work per
+// node: re, parent_arg, pre_arg, child_args and nchild_args are all
+// ignored and the result is always false.
+bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                                  bool* child_args, int nchild_args) {
+  return false;
+}
+
+// Walks this regexp with a NullWalker, starting with a top-level
+// argument of false.  The walk's result is discarded; nothing is returned.
+void Regexp::NullWalk() {
+  NullWalker w;
+  w.Walk(this, false);
+}
+
+}  // namespace re2
--- a/re2/re2/testing/parse_test.cc
+++ b/re2/re2/testing/parse_test.cc
@ -0,0 +1,376 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test parse.cc, dump.cc, and tostring.cc.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// A pattern paired with the expected textual dump of its parse.
+struct Test {
+  const char* regexp;  // Pattern text handed to Regexp::Parse.
+  const char* parse;   // Expected result of Regexp::Dump() on the parse.
+};
+
+// Main parse table.  It is parsed with kTestFlags by
+// TEST(TestParse, SimpleRegexps) and reused by
+// TEST(TestToString, EquivalentParse).  TestParse also checks every pair
+// of rows, so rows with equal expected dumps must parse to equal regexps.
+static Test tests[] = {
+  // Base cases
+  { "a", "lit{a}" },
+  { "a.", "cat{lit{a}dot{}}" },
+  { "a.b", "cat{lit{a}dot{}lit{b}}" },
+  { "ab", "str{ab}" },
+  { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
+  { "abc", "str{abc}" },
+  { "a|^", "alt{lit{a}bol{}}" },
+  { "a|b", "cc{0x61-0x62}" },
+  { "(a)", "cap{lit{a}}" },
+  { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
+  { "a*", "star{lit{a}}" },
+  { "a+", "plus{lit{a}}" },
+  { "a?", "que{lit{a}}" },
+  { "a{2}", "rep{2,2 lit{a}}" },
+  { "a{2,3}", "rep{2,3 lit{a}}" },
+  { "a{2,}", "rep{2,-1 lit{a}}" },
+  { "a*?", "nstar{lit{a}}" },
+  { "a+?", "nplus{lit{a}}" },
+  { "a??", "nque{lit{a}}" },
+  { "a{2}?", "nrep{2,2 lit{a}}" },
+  { "a{2,3}?", "nrep{2,3 lit{a}}" },
+  { "a{2,}?", "nrep{2,-1 lit{a}}" },
+  { "", "emp{}" },
+  { "|", "emp{}" },  // alt{emp{}emp{}} but got factored
+  { "|x|", "alt{emp{}lit{x}emp{}}" },
+  { ".", "dot{}" },
+  { "^", "bol{}" },
+  { "$", "eol{}" },
+  { "\\|", "lit{|}" },
+  { "\\(", "lit{(}" },
+  { "\\)", "lit{)}" },
+  { "\\*", "lit{*}" },
+  { "\\+", "lit{+}" },
+  { "\\?", "lit{?}" },
+  { "{", "lit{{}" },
+  { "}", "lit{}}" },
+  { "\\.", "lit{.}" },
+  { "\\^", "lit{^}" },
+  { "\\$", "lit{$}" },
+  { "\\\\", "lit{\\}" },
+  { "[ace]", "cc{0x61 0x63 0x65}" },
+  { "[abc]", "cc{0x61-0x63}" },
+  { "[a-z]", "cc{0x61-0x7a}" },
+  { "[a]", "lit{a}" },
+  { "\\-", "lit{-}" },
+  { "-", "lit{-}" },
+  { "\\_", "lit{_}" },
+
+  // Posix and Perl extensions
+  { "[[:lower:]]", "cc{0x61-0x7a}" },
+  { "[a-z]", "cc{0x61-0x7a}" },
+  { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
+  { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
+  { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+  { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+  { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+  { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+  { "\\d", "cc{0x30-0x39}" },
+  { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
+  { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
+  { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
+  { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
+  { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
+  { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
+  { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+  { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
+  { "\\C", "byte{}" },
+
+  // Unicode, negatives, and a double negative.
+  { "\\p{Braille}", "cc{0x2800-0x28ff}" },
+  { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
+  { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
+  { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
+
+  // More interesting regular expressions.
+  { "a{,2}", "str{a{,2}}" },
+  { "\\.\\^\\$\\\\", "str{.^$\\}" },
+  { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
+  { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
+  { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" },  // utf-8
+  { "a*{", "cat{star{lit{a}}lit{{}}" },
+
+  // Test precedences
+  { "(?:ab)*", "star{str{ab}}" },
+  { "(ab)*", "star{cap{str{ab}}}" },
+  { "ab|cd", "alt{str{ab}str{cd}}" },
+  { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
+
+  // Test flattening.
+  { "(?:a)", "lit{a}" },
+  { "(?:ab)(?:cd)", "str{abcd}" },
+  { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
+  { "a|.", "dot{}" },
+  { ".|a", "dot{}" },
+
+  // Test Perl quoted literals
+  { "\\Q+|*?{[\\E", "str{+|*?{[}" },
+  { "\\Q+\\E+", "plus{lit{+}}" },
+  { "\\Q\\\\E", "lit{\\}" },
+  { "\\Q\\\\\\E", "str{\\\\}" },
+
+  // Test Perl \A and \z
+  { "(?m)^", "bol{}" },
+  { "(?m)$", "eol{}" },
+  { "(?-m)^", "bot{}" },
+  { "(?-m)$", "eot{}" },
+  { "(?m)\\A", "bot{}" },
+  { "(?m)\\z", "eot{\\z}" },
+  { "(?-m)\\A", "bot{}" },
+  { "(?-m)\\z", "eot{\\z}" },
+
+  // Test named captures
+  { "(?P<name>a)", "cap{name:lit{a}}" },
+
+  // Case-folded literals
+  { "[Aa]", "litfold{a}" },
+
+  // Strings
+  { "abcde", "str{abcde}" },
+  { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
+};
+
+// Parse flags used for the main tests table, both when checking the
+// parse (SimpleRegexps) and when checking ToString (EquivalentParse).
+static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
+                                       Regexp::PerlX |
+                                       Regexp::PerlClasses |
+                                       Regexp::UnicodeGroups;
+
+// Forwards to Regexp::Equal(a, b) and returns its result unchanged.
+bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
+  return Regexp::Equal(a, b);
+}
+
+// Parses each of the ntests rows of tests with the given flags and checks:
+//   1. every row parses, and Dump() of the parse equals the row's
+//      expected parse string;
+//   2. for every pair of rows, Regexp::Equal on the two parses agrees
+//      with equality of their expected parse strings.
+// title names the table being checked and is included in every failure
+// message so that failures from different tables can be told apart.
+void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
+               const string& title) {
+  // The vector owns only the array of pointers; each Regexp is
+  // reference-counted and is released explicitly with Decref() below.
+  std::vector<Regexp*> re(ntests);
+  for (int i = 0; i < ntests; i++) {
+    RegexpStatus status;
+    re[i] = Regexp::Parse(tests[i].regexp, flags, &status);
+    CHECK(re[i] != NULL) << " " << title << ": " << tests[i].regexp << " "
+                         << status.Text();
+    string s = re[i]->Dump();
+    EXPECT_EQ(string(tests[i].parse), s) << title << ": "
+      << "Regexp: " << tests[i].regexp
+      << "\nparse: " << tests[i].parse << " s: " << s;
+  }
+
+  // Pairwise comparison: structural equality must match dump equality.
+  for (int i = 0; i < ntests; i++) {
+    for (int j = 0; j < ntests; j++) {
+      EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
+                RegexpEqualTestingOnly(re[i], re[j]))
+        << title << ": "
+        << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
+    }
+  }
+
+  for (int i = 0; i < ntests; i++)
+    re[i]->Decref();
+}
+
+// Test that regexps parse to expected structures.
+// Runs TestParse over the main tests table using kTestFlags.
+TEST(TestParse, SimpleRegexps) {
+  TestParse(tests, arraysize(tests), kTestFlags, "simple");
+}
+
+// Rows parsed with Regexp::FoldCase as the only flag
+// (see TEST(TestParse, FoldCase)).
+Test foldcase_tests[] = {
+  { "AbCdE", "strfold{abcde}" },
+  { "[Aa]", "litfold{a}" },
+  { "a", "litfold{a}" },
+
+  // 0x17F is an old English long s (looks like an f) and folds to s.
+  // 0x212A is the Kelvin symbol and folds to k.
+  { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
+  { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+  { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+};
+
+// Test that parsing with FoldCase works.
+// Runs TestParse over foldcase_tests with Regexp::FoldCase as the only flag.
+TEST(TestParse, FoldCase) {
+  TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
+}
+
+// Rows parsed with Regexp::Literal as the only flag: the single row
+// expects the whole pattern, operators included, to dump as one str{}.
+Test literal_tests[] = {
+  { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
+};
+
+// Test that parsing with Literal works.
+// Runs TestParse over literal_tests with Regexp::Literal as the only flag.
+TEST(TestParse, Literal) {
+  TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
+}
+
+// Rows parsed with Regexp::MatchNL as the only flag.  The same patterns
+// appear in nomatchnl_tests with different expectations for "." and "[^a]".
+Test matchnl_tests[] = {
+  { ".", "dot{}" },
+  { "\n", "lit{\n}" },
+  { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
+  { "[a\\n]", "cc{0xa 0x61}" },
+};
+
+// Test that parsing with MatchNL works.
+// (Also tested above during simple cases.)
+// Runs TestParse over matchnl_tests with Regexp::MatchNL as the only flag.
+TEST(TestParse, MatchNL) {
+  TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
+}
+
+// Rows parsed with Regexp::NoParseFlags.  Compared with matchnl_tests,
+// the expected classes for "." and "[^a]" leave out 0xa (newline).
+Test nomatchnl_tests[] = {
+  { ".", "cc{0-0x9 0xb-0x10ffff}" },
+  { "\n", "lit{\n}" },
+  { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
+  { "[a\\n]", "cc{0xa 0x61}" },
+};
+
+// Test that parsing without MatchNL works.
+// Runs TestParse over nomatchnl_tests with Regexp::NoParseFlags.
+TEST(TestParse, NoMatchNL) {
+  TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
+}
+
+// Rows parsed with Regexp::PerlX as the only flag.  Most expected dumps
+// show a prefix shared by the alternatives pulled out into an enclosing cat{}.
+Test prefix_tests[] = {
+  { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
+  { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
+  { "abc|abd|aef|bcx|bcy",
+    "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
+      "cat{str{bc}cc{0x78-0x79}}}" },
+  // Here the expected dump keeps the three alternatives unfactored.
+  { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
+  { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
+  { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
+  { "(?:xx|yy)c|(?:xx|yy)d",
+    "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
+  { "x{2}|x{2}[0-9]",
+    "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
+  { "x{2}y|x{2}[0-9]y",
+    "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
+};
+
+// Test that prefix factoring works.
+// Runs TestParse over prefix_tests with Regexp::PerlX as the only flag.
+TEST(TestParse, Prefix) {
+  TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
+}
+
+// Invalid regular expressions
+// TEST(TestParse, InvalidRegexps) requires each of these to fail to
+// parse under both Regexp::PerlX and Regexp::NoParseFlags.
+const char* badtests[] = {
+  "(",
+  ")",
+  "(a",
+  "(a|b|",
+  "(a|b",
+  "[a-z",
+  "([a-z)",
+  "x{1001}",
+  "\xff",      // Invalid UTF-8
+  "[\xff]",
+  "[\\\xff]",
+  "\\\xff",
+  "(?P<name>a",
+  "(?P<name>",
+  "(?P<name",
+  "(?P<x y>a)",
+  "(?P<>a)",
+  "[a-Z]",
+  "(?i)[a-Z]",
+  "a{100000}",
+  "a{100000,}",
+};
+
+// Valid in Perl, bad in POSIX
+// In TEST(TestParse, InvalidRegexps) these must parse under
+// Regexp::PerlX and fail under Regexp::NoParseFlags.
+const char* only_perl[] = {
+ "[a-b-c]",
+ "\\Qabc\\E",
+ "\\Q*+?{[\\E",
+ "\\Q\\\\E",
+ "\\Q\\\\\\E",
+ "\\Q\\\\\\\\E",
+ "\\Q\\\\\\\\\\E",
+ "(?:a)",
+ "(?P<name>a)",
+};
+
+// Valid in POSIX, bad in Perl.
+// In TEST(TestParse, InvalidRegexps) these must parse under
+// Regexp::NoParseFlags and fail under Regexp::PerlX.
+const char* only_posix[] = {
+  "a++",
+  "a**",
+  "a?*",
+  "a+*",
+  "a{1}*",
+};
+
+// Test that parser rejects bad regexps.
+// Three tables are checked:
+//   badtests   - must fail under both PerlX and NoParseFlags;
+//   only_posix - must fail under PerlX but parse under NoParseFlags;
+//   only_perl  - must fail under NoParseFlags but parse under PerlX.
+TEST(TestParse, InvalidRegexps) {
+  for (int i = 0; i < arraysize(badtests); i++) {
+    CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
+      << " " << badtests[i];
+    CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
+      << " " << badtests[i];
+  }
+  for (int i = 0; i < arraysize(only_posix); i++) {
+    CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
+      << " " << only_posix[i];
+    Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
+    CHECK(re) << " " << only_posix[i];
+    // Release the successful parse.
+    re->Decref();
+  }
+  for (int i = 0; i < arraysize(only_perl); i++) {
+    CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
+      << " " << only_perl[i];
+    Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
+    CHECK(re) << " " << only_perl[i];
+    // Release the successful parse.
+    re->Decref();
+  }
+}
+
+// Test that ToString produces original regexp or equivalent one.
+// For each row of the main table: parse it, confirm the dump, and call
+// ToString().  When the string differs from the original pattern, it is
+// reparsed and must yield the same dump and the same ToString() output.
+TEST(TestToString, EquivalentParse) {
+  for (int i = 0; i < arraysize(tests); i++) {
+    RegexpStatus status;
+    Regexp* re = Regexp::Parse(tests[i].regexp, kTestFlags, &status);
+    CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
+    string s = re->Dump();
+    EXPECT_EQ(string(tests[i].parse), s);
+    string t = re->ToString();
+
+    if (t == tests[i].regexp) {
+      // Exact round trip; there is nothing further to compare.
+      re->Decref();
+      continue;
+    }
+
+    // If ToString didn't return the original regexp,
+    // it must have found one with fewer parens.
+    // Unfortunately we can't check the length here, because
+    // ToString produces "\\{" for a literal brace,
+    // but "{" is a shorter equivalent.
+    // CHECK_LT(t.size(), strlen(tests[i].regexp))
+    //     << " t=" << t << " regexp=" << tests[i].regexp;
+
+    // Test that if we parse the new regexp we get the same structure.
+    Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+    CHECK(nre != NULL) << " reparse " << t << " " << status.Text();
+    string ss = nre->Dump();
+    string tt = nre->ToString();
+    const bool round_trip_ok = (s == ss) && (t == tt);
+    if (!round_trip_ok)
+      LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
+    EXPECT_EQ(s, ss);
+    EXPECT_EQ(t, tt);
+    nre->Decref();
+    re->Decref();
+  }
+}
+
+// Test that capture error args are correct.
+// Both patterns must fail to parse with kRegexpBadNamedCapture, and
+// status.error_arg() must be the offending group text without the
+// leading "test".
+TEST(NamedCaptures, ErrorArgs) {
+  RegexpStatus status;
+  Regexp* re;
+
+  // Named group that is never closed.
+  re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
+  EXPECT_TRUE(re == NULL);
+  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+  EXPECT_EQ(status.error_arg(), "(?P<name");
+
+  // Group name containing a space.
+  re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
+  EXPECT_TRUE(re == NULL);
+  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+  EXPECT_EQ(status.error_arg(), "(?P<space bar>");
+}
+
+}  // namespace re2
--- a/re2/re2/testing/possible_match_test.cc
+++ b/re2/re2/testing/possible_match_test.cc
@ -0,0 +1,240 @@
+// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <vector>
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Test that C++ strings are compared as uint8s, not int8s.
+// PossibleMatchRange doesn't depend on this, but callers probably will.
+// Byte 0xA0 has its high bit set, so it only sorts after 0x70 when the
+// bytes are compared as unsigned values.
+TEST(CplusplusStrings, EightBit) {
+  string s = "\x70";
+  string t = "\xA0";
+  EXPECT_LT(s, t);
+}
+
+// One hand-written PossibleMatchRange expectation.
+struct PrefixTest {
+  const char* regexp;  // Pattern to compile.
+  int maxlen;          // maxlen argument passed to PossibleMatchRange.
+  const char* min;     // Expected value of the min output string.
+  const char* max;     // Expected value of the max output string.
+};
+
+// Hand-written cases for TEST(PossibleMatchRange, HandWritten).
+// Columns: pattern, maxlen, expected min, expected max.
+static PrefixTest tests[] = {
+  // Plain patterns.
+  { "",                  10,  "",           "",        },
+  { "Abcdef",            10,  "Abcdef",     "Abcdef"   },
+  { "abc(def|ghi)",      10,  "abcdef",     "abcghi"   },
+  { "a+hello",           10,  "aa",         "ahello"   },
+  { "a*hello",           10,  "a",          "hello"    },
+  { "def|abc",           10,  "abc",        "def"      },
+  { "a(b)(c)[d]",        10,  "abcd",       "abcd"     },
+  { "ab(cab|cat)",       10,  "abcab",      "abcat"    },
+  { "ab(cab|ca)x",       10,  "abcabx",     "abcax"    },
+  { "(ab|x)(c|de)",      10,  "abc",        "xde"      },
+  { "(ab|x)?(c|z)?",     10,  "",           "z"        },
+  { "[^\\s\\S]",         10,  "",           ""         },
+  { "(abc)+",             5,  "abc",        "abcac"    },
+  { "(abc)+",             2,  "ab",         "ac"       },
+  { "(abc)+",             1,  "a",          "b"        },
+  { "[a\xC3\xA1]",        4,  "a",          "\xC3\xA1" },
+  { "a*",                10,  "",           "ab"       },
+
+  // The same patterns with (?i): min is upper case, max is lower case.
+  { "(?i)Abcdef",        10,  "ABCDEF",     "abcdef"   },
+  { "(?i)abc(def|ghi)",  10,  "ABCDEF",     "abcghi"   },
+  { "(?i)a+hello",       10,  "AA",         "ahello"   },
+  { "(?i)a*hello",       10,  "A",          "hello"    },
+  { "(?i)def|abc",       10,  "ABC",        "def"      },
+  { "(?i)a(b)(c)[d]",    10,  "ABCD",       "abcd"     },
+  { "(?i)ab(cab|cat)",   10,  "ABCAB",      "abcat"    },
+  { "(?i)ab(cab|ca)x",   10,  "ABCABX",     "abcax"    },
+  { "(?i)(ab|x)(c|de)",  10,  "ABC",        "xde"      },
+  { "(?i)(ab|x)?(c|z)?", 10,  "",           "z"        },
+  { "(?i)[^\\s\\S]",     10,  "",           ""         },
+  { "(?i)(abc)+",         5,  "ABC",        "abcac"    },
+  { "(?i)(abc)+",         2,  "AB",         "ac"       },
+  { "(?i)(abc)+",         1,  "A",          "b"        },
+  { "(?i)[a\xC3\xA1]",    4,  "A",          "\xC3\xA1" },
+  { "(?i)a*",            10,  "",           "ab"       },
+  { "(?i)A*",            10,  "",           "ab"       },
+
+  // The plain patterns prefixed with \A; expectations are unchanged.
+  { "\\AAbcdef",         10,  "Abcdef",     "Abcdef"   },
+  { "\\Aabc(def|ghi)",   10,  "abcdef",     "abcghi"   },
+  { "\\Aa+hello",        10,  "aa",         "ahello"   },
+  { "\\Aa*hello",        10,  "a",          "hello"    },
+  { "\\Adef|abc",        10,  "abc",        "def"      },
+  { "\\Aa(b)(c)[d]",     10,  "abcd",       "abcd"     },
+  { "\\Aab(cab|cat)",    10,  "abcab",      "abcat"    },
+  { "\\Aab(cab|ca)x",    10,  "abcabx",     "abcax"    },
+  { "\\A(ab|x)(c|de)",   10,  "abc",        "xde"      },
+  { "\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
+  { "\\A[^\\s\\S]",      10,  "",           ""         },
+  { "\\A(abc)+",          5,  "abc",        "abcac"    },
+  { "\\A(abc)+",          2,  "ab",         "ac"       },
+  { "\\A(abc)+",          1,  "a",          "b"        },
+  { "\\A[a\xC3\xA1]",     4,  "a",          "\xC3\xA1" },
+  { "\\Aa*",             10,  "",           "ab"       },
+
+  // The (?i) patterns prefixed with \A; expectations match the (?i) group.
+  { "(?i)\\AAbcdef",         10,  "ABCDEF",     "abcdef"   },
+  { "(?i)\\Aabc(def|ghi)",   10,  "ABCDEF",     "abcghi"   },
+  { "(?i)\\Aa+hello",        10,  "AA",         "ahello"   },
+  { "(?i)\\Aa*hello",        10,  "A",          "hello"    },
+  { "(?i)\\Adef|abc",        10,  "ABC",        "def"      },
+  { "(?i)\\Aa(b)(c)[d]",     10,  "ABCD",       "abcd"     },
+  { "(?i)\\Aab(cab|cat)",    10,  "ABCAB",      "abcat"    },
+  { "(?i)\\Aab(cab|ca)x",    10,  "ABCABX",     "abcax"    },
+  { "(?i)\\A(ab|x)(c|de)",   10,  "ABC",        "xde"      },
+  { "(?i)\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
+  { "(?i)\\A[^\\s\\S]",      10,  "",           ""         },
+  { "(?i)\\A(abc)+",          5,  "ABC",        "abcac"    },
+  { "(?i)\\A(abc)+",          2,  "AB",         "ac"       },
+  { "(?i)\\A(abc)+",          1,  "A",          "b"        },
+  { "(?i)\\A[a\xC3\xA1]",     4,  "A",          "\xC3\xA1" },
+  { "(?i)\\Aa*",             10,  "",           "ab"       },
+  { "(?i)\\AA*",             10,  "",           "ab"       },
+};
+
+// Checks every row of the hand-written table through two routes:
+// pass 0 parses and compiles with Regexp/Prog directly, pass 1 goes
+// through the RE2 wrapper.  Both must produce the expected min and max.
+TEST(PossibleMatchRange, HandWritten) {
+  for (int i = 0; i < arraysize(tests); i++) {
+    const PrefixTest& t = tests[i];
+    for (int j = 0; j < 2; j++) {
+      string min, max;
+      if (j != 0) {
+        // Route through the public RE2 interface.
+        CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
+      } else {
+        // Route through the lower-level Regexp and Prog objects.
+        LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
+        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+        CHECK(re);
+        Prog* prog = re->CompileToProg(0);
+        CHECK(prog);
+        CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
+          << " " << t.regexp;
+        delete prog;
+        re->Decref();
+      }
+      EXPECT_EQ(t.min, min) << t.regexp;
+      EXPECT_EQ(t.max, max) << t.regexp;
+    }
+  }
+}
+
+// Test cases where PossibleMatchRange should return false.
+TEST(PossibleMatchRange, Failures) {
+  // Output arguments shared by every call below; they are printed in the
+  // failure message when a check unexpectedly succeeds.
+  string min, max;
+
+  // Fails because no room to write max.
+  EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
+
+  // Fails because there is no max -- any non-empty string matches
+  // or begins a match.  Have to use Latin-1 input, because there
+  // are no valid UTF-8 strings beginning with byte 0xFF.
+  EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2(".*", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  // \C* is the one case here that is checked without RE2::Latin1.
+  EXPECT_FALSE(RE2("\\C*").
+               PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+
+  // Fails because it's a malformed regexp.
+  EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
+    << "min=" << CEscape(min) << ", max=" << CEscape(max);
+}
+
+// Exhaustive test: generate all regexps within parameters,
+// then generate all strings of a given length over a given alphabet,
+// then check that the prefix information agrees with whether
+// the regexp matches each of the strings.
+class PossibleMatchTester : public RegexpGenerator {
+ public:
+  // maxatoms, maxops, alphabet and ops are forwarded to RegexpGenerator;
+  // maxstrlen and stralphabet configure the string generator.
+  PossibleMatchTester(int maxatoms,
+                      int maxops,
+                      const vector<string>& alphabet,
+                      const vector<string>& ops,
+                      int maxstrlen,
+                      const vector<string>& stralphabet)
+    : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+      strgen_(maxstrlen, stralphabet),
+      regexps_(0), tests_(0) { }
+
+  // Counters accumulated by HandleRegexp.
+  int regexps()  { return regexps_; }
+  int tests()    { return tests_; }
+
+  // Needed for RegexpGenerator interface.
+  void HandleRegexp(const string& regexp);
+
+ private:
+  // Source of the candidate strings tried against each regexp.
+  StringGenerator strgen_;
+
+  int regexps_;   // Number of HandleRegexp calls
+  int tests_;     // Number of regexp tests.
+
+  DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
+};
+
+// Processes a single generated regexp.
+// Computes the possible match range for it (maxlen 10), then runs every
+// generated string through FullMatch and checks that each accepted
+// string lies between min and max inclusive.
+void PossibleMatchTester::HandleRegexp(const string& regexp) {
+  ++regexps_;
+
+  VLOG(3) << CEscape(regexp);
+
+  RE2 re(regexp, RE2::Latin1);
+  CHECK_EQ(re.error(), "");
+
+  string min, max;
+  const bool have_range = re.PossibleMatchRange(&min, &max, 10);
+  if (!have_range) {
+    // There's no good max for "\\C*".  Can't use strcmp
+    // because sometimes it gets embedded in more
+    // complicated expressions.
+    const bool contains_any_byte_star =
+        strstr(regexp.c_str(), "\\C*") != NULL;
+    if (contains_any_byte_star)
+      return;
+    LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
+  }
+
+  for (strgen_.Reset(); strgen_.HasNext(); ) {
+    const StringPiece& s = strgen_.Next();
+    ++tests_;
+    if (RE2::FullMatch(s, re)) {
+      // Only strings the regexp accepts are bound by the range.
+      CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
+      CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
+    }
+  }
+}
+
+// Runs PossibleMatchTester over every regexp built from the atoms
+// "a", "b" and "[0-9]" with the egrep operators, testing strings over
+// the alphabet "ab4".  Debug builds use fewer atoms and shorter strings.
+TEST(PossibleMatchRange, Exhaustive) {
+  const int natom = DEBUG_MODE ? 2 : 3;
+  const int noperator = 3;  // Same in debug and optimized builds.
+  const int stringlen = DEBUG_MODE ? 3 : 5;
+
+  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
+                 RegexpGenerator::EgrepOps(),
+                 stringlen, Explode("ab4"));
+  t.Generate();
+  LOG(INFO) << t.regexps() << " regexps, "
+            << t.tests() << " tests";
+}
+
+}  // namespace re2
--- a/re2/re2/testing/random_test.cc
+++ b/re2/re2/testing/random_test.cc
@ -0,0 +1,95 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Random testing of regular expression matching.
+
+#include <stdio.h>
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+// Flags read by RandomTest: seeds and counts for the random regexps
+// and the random strings.
+DEFINE_int32(regexpseed, 404, "Random regexp seed.");
+DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
+DEFINE_int32(stringseed, 200, "Random string seed.");
+DEFINE_int32(stringcount, 100, "How many random strings to generate.");
+
+namespace re2 {
+
+// Runs a random test on the given parameters.
+// (Always uses the same random seeds for reproducibility.
+// Can give different seeds on command line.)
+// Prints a one-line summary and expects zero failures from the tester.
+static void RandomTest(int maxatoms, int maxops,
+                       const vector<string>& alphabet,
+                       const vector<string>& ops,
+                       int maxstrlen, const vector<string>& stralphabet,
+                       const string& wrapper) {
+  // Limit to smaller test cases in debug mode,
+  // because everything is so much slower.
+  if (DEBUG_MODE) {
+    maxatoms -= 1;
+    maxops -= 1;
+    maxstrlen = maxstrlen / 2;
+  }
+
+  ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
+                     maxstrlen, stralphabet, wrapper, "");
+  t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
+  t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
+
+  const int alphabet_size = static_cast<int>(stralphabet.size());
+  printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
+         t.regexps(), t.tests(), t.failures(), maxstrlen, alphabet_size);
+  EXPECT_EQ(0, t.failures());
+}
+
+// Tests random small regexps involving literals and egrep operators.
+TEST(Random, SmallEgrepLiterals) {
+  const vector<string> atoms = Explode("abc.");
+  const vector<string> string_chars = Explode("abc");
+  RandomTest(5, 5, atoms, RegexpGenerator::EgrepOps(),
+             15, string_chars,
+             "");
+}
+
+// Tests random bigger regexps involving literals and egrep operators.
+TEST(Random, BigEgrepLiterals) {
+  const vector<string> atoms = Explode("abc.");
+  const vector<string> string_chars = Explode("abc");
+  RandomTest(10, 10, atoms, RegexpGenerator::EgrepOps(),
+             15, string_chars,
+             "");
+}
+
+// Tests random small regexps involving literals, capturing parens,
+// and egrep operators.
+TEST(Random, SmallEgrepCaptures) {
+  const vector<string> atoms = Split(" ", "a (b) .");
+  const vector<string> string_chars = Explode("abc");
+  RandomTest(5, 5, atoms, RegexpGenerator::EgrepOps(),
+             15, string_chars,
+             "");
+}
+
+// Tests random bigger regexps involving literals, capturing parens,
+// and egrep operators.
+TEST(Random, BigEgrepCaptures) {
+  const vector<string> atoms = Split(" ", "a (b) .");
+  const vector<string> string_chars = Explode("abc");
+  RandomTest(10, 10, atoms, RegexpGenerator::EgrepOps(),
+             15, string_chars,
+             "");
+}
+
+// Tests random large complicated expressions, using all the possible
+// operators, some literals, some parenthesized literals, and predefined
+// character classes like \d.  (Adding larger character classes would
+// make for too many possibilities.)
+TEST(Random, Complicated) {
+  // Characters the random test strings are drawn from.
+  vector<string> string_chars = Explode("abc123\001\002\003\t\r\n\v\f\a");
+
+  // Use (?:\b) and (?:\B) instead of \b and \B,
+  // because PCRE rejects \b* but accepts (?:\b)*.
+  // Ditto ^ and $.
+  vector<string> atom_list = Split(" ",
+    ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
+    "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
+    "a (a) b c - \\\\");
+
+  // Operator templates; each %s is a slot for an operand.
+  vector<string> op_list = Split(" ",
+    "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
+    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
+    "%s{2} %s{2,} %s{3,4} %s{4,5}");
+
+  RandomTest(10, 10, atom_list, op_list, 20, string_chars, "");
+}
+
+}  // namespace re2
+
--- a/re2/re2/testing/re2_arg_test.cc
+++ b/re2/re2/testing/re2_arg_test.cc
@ -0,0 +1,132 @@
+// Copyright 2005 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This tests to make sure numbers are parsed from strings
+// correctly.
+// Todo: Expand the test to validate strings parsed to the other types
+// supported by RE2::Arg class
+
+#include "util/test.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+// One row of the integer-parsing table: a decimal string, its numeric
+// value, and whether each tested integral type should accept it.
+struct SuccessTable {
+  const char * value_string;  // Text handed to RE2::Arg::Parse.
+  int64 value;                // Value compared against the parsed result.
+  bool success[6];            // Expected Parse result per type column.
+};
+
+// Test boundary cases for different integral sizes.
+// Specifically I want to make sure that values outside the boundaries
+// of an integral type will fail and that negative numbers will fail
+// for unsigned types. The following table contains the boundaries for
+// the various integral types and has entries for whether or not each
+// type can contain the given value.
+// The success columns are indexed 0-5 in the order shown in the header
+// row below; the TEST blocks pass the matching index to PARSE_FOR_TYPE.
+const SuccessTable kSuccessTable[] = {
+// string       integer value     short  ushort int    uint   int64  uint64
+// 0 to 2^7-1
+{ "0",          0,              { true,  true,  true,  true,  true,  true  }},
+{ "127",        127,            { true,  true,  true,  true,  true,  true  }},
+
+// -1 to -2^7
+{ "-1",         -1,             { true,  false, true,  false, true,  false }},
+{ "-128",       -128,           { true,  false, true,  false, true,  false }},
+
+// 2^7 to 2^8-1
+{ "128",        128,            { true,  true,  true,  true,  true,  true  }},
+{ "255",        255,            { true,  true,  true,  true,  true,  true  }},
+
+// 2^8 to 2^15-1
+{ "256",        256,            { true,  true,  true,  true,  true,  true  }},
+{ "32767",      32767,          { true,  true,  true,  true,  true,  true  }},
+
+// -2^7-1 to -2^15
+{ "-129",       -129,           { true,  false, true,  false, true,  false }},
+{ "-32768",     -32768,         { true,  false, true,  false, true,  false }},
+
+// 2^15 to 2^16-1
+{ "32768",      32768,          { false, true,  true,  true,  true,  true  }},
+{ "65535",      65535,          { false, true,  true,  true,  true,  true  }},
+
+// 2^16 to 2^31-1
+{ "65536",      65536,          { false, false, true,  true,  true,  true  }},
+{ "2147483647", 2147483647,     { false, false, true,  true,  true,  true  }},
+
+// -2^15-1 to -2^31
+{ "-32769",     -32769,         { false, false, true,  false, true,  false }},
+{ "-2147483648",
+  0xFFFFFFFF80000000LL,         { false, false, true,  false, true,  false }},
+
+// 2^31 to 2^32-1
+{ "2147483648", 2147483648U,    { false, false, false, true,  true,  true  }},
+{ "4294967295", 4294967295U,    { false, false, false, true,  true,  true  }},
+
+// 2^32 to 2^63-1
+{ "4294967296", 4294967296LL,   { false, false, false, false, true,  true  }},
+{ "9223372036854775807",
+  9223372036854775807LL,        { false, false, false, false, true,  true  }},
+
+// -2^31-1 to -2^63
+{ "-2147483649", -2147483649LL, { false, false, false, false, true,  false }},
+{ "-9223372036854775808",
+  0x8000000000000000LL,         { false, false, false, false, true,  false }},
+
+// 2^63 to 2^64-1
+{ "9223372036854775808",
+  9223372036854775808ULL,       { false, false, false, false, false, true  }},
+{ "18446744073709551615",
+  18446744073709551615ULL,      { false, false, false, false, false, true  }},
+
+// >= 2^64
+// No type accepts this row, so its value field is never compared.
+{ "18446744073709551616", 0,    { false, false, false, false, false, false }},
+};
+
+// Number of rows in kSuccessTable; the loop bound used by PARSE_FOR_TYPE.
+const int kNumStrings = ARRAYSIZE(kSuccessTable);
+
+// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M
+// macro outside of a TEST block and this seems to be the only way to
+// avoid code duplication.  I can also pull off a couple nice tricks
+// using concatenation for the type I'm checking against.
+//
+// PARSE_FOR_TYPE(type, column): for every row of kSuccessTable, parses
+// value_string into a variable of the given type through RE2::Arg and
+// asserts that the Parse result equals success[column].  When success is
+// expected, the parsed value is also compared with the row's value field.
+#define PARSE_FOR_TYPE(type, column) {                                   \
+  type r;                                                                \
+  for ( int i = 0; i < kNumStrings; ++i ) {                              \
+    RE2::Arg arg(&r);                                                    \
+    const char* const p = kSuccessTable[i].value_string;                 \
+    bool retval = arg.Parse(p, strlen(p));                               \
+    bool success = kSuccessTable[i].success[column];                     \
+    ASSERT_TRUE_M(retval == success,                                     \
+      StringPrintf("Parsing '%s' for type " #type " should return %d",   \
+                   p, success).c_str());                                 \
+    if ( success ) {                                                     \
+      ASSERT_EQUALS(r, kSuccessTable[i].value);                          \
+    }                                                                    \
+  }                                                                      \
+}
+
+// Parses every table string as int16 and checks success column 0 (short).
+TEST(REArgTest, Int16Test) {
+  PARSE_FOR_TYPE(int16, 0);
+}
+
+// Parses every table string as uint16 and checks success column 1 (ushort).
+TEST(REArgTest, Uint16Test) {
+  PARSE_FOR_TYPE(uint16, 1);
+}
+
+// Parses every table string as int and checks success column 2 (int).
+TEST(REArgTest, IntTest) {
+  PARSE_FOR_TYPE(int, 2);
+}
+
+// Parses every table string as uint32 and checks success column 3 (uint).
+TEST(REArgTest, UInt32Test) {
+  PARSE_FOR_TYPE(uint32, 3);
+}
+
+// Parses every table string as int64 and checks success column 4 (int64).
+// NOTE(review): the test name is spelled "Iint64Test"; presumably a typo
+// for "Int64Test".  Left as is because the name is the test's identifier.
+TEST(REArgTest, Iint64Test) {
+  PARSE_FOR_TYPE(int64, 4);
+}
+
+// Parses every table string as uint64 and checks success column 5 (uint64).
+TEST(REArgTest, Uint64Test) {
+  PARSE_FOR_TYPE(uint64, 5);
+}
+
+}  // namespace re2
--- a/re2/re2/testing/re2_test.cc
+++ b/re2/re2/testing/re2_test.cc
--- a/re2/re2/testing/regexp_benchmark.cc
+++ b/re2/re2/testing/regexp_benchmark.cc
--- a/re2/re2/testing/regexp_generator.cc
+++ b/re2/re2/testing/regexp_generator.cc
@ -0,0 +1,264 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression generator: generates all possible
+// regular expressions within parameters (see regexp_generator.h for details).
+
+// The regexp generator first generates a sequence of commands in a simple
+// postfix language.  Each command in the language is a string,
+// like "a" or "%s*" or "%s|%s".
+//
+// To evaluate a command, enough arguments are popped from the value stack to
+// plug into the %s slots.  Then the result is pushed onto the stack.
+// For example, the command sequence
+//      a b %s%s c
+// results in the stack
+//      ab c
+//
+// GeneratePostfix generates all possible command sequences.
+// Then RunPostfix turns each sequence into a regular expression
+// and passes the regexp to HandleRegexp.
+
+#include <string.h>
+#include <string>
+#include <stack>
+#include <vector>
+#include "util/test.h"
+#include "re2/testing/regexp_generator.h"
+
+namespace re2 {
+
+// Returns the egrep regexp operators as postfix command templates, where
+// each "%s" marks an operand slot.  The vector is a function-local static,
+// so it is built once and the same object is returned on every call.
+const vector<string>& RegexpGenerator::EgrepOps() {
+  static const char *kEgrepOps[] = {
+    "%s%s",
+    "%s|%s",
+    "%s*",
+    "%s+",
+    "%s?",
+    "%s\\C*",
+  };
+  static vector<string> egrep_ops(kEgrepOps,
+                                  kEgrepOps + arraysize(kEgrepOps));
+  return egrep_ops;
+}
+
+// Builds a generator that combines the given atoms and operators, using at
+// most maxatoms atoms and maxops operators per expression.
+//
+// acm_ is explicitly initialized to NULL: it is only pointed at a real
+// random generator for the duration of GenerateRandom, and leaving it
+// uninitialized would make it an indeterminate pointer until then.
+RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
+                                 const vector<string>& atoms,
+                                 const vector<string>& ops)
+    : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops),
+      acm_(NULL) {
+  // Degenerate case: with nothing to choose from, the matching limit is
+  // forced to zero so the generators never index into an empty vector.
+  if (atoms_.size() == 0)
+    maxatoms_ = 0;
+  if (ops_.size() == 0)
+    maxops_ = 0;
+}
+
+// Enumerates every regular expression allowed by the parameters.  The
+// enumeration starts from an empty postfix sequence with an empty stack and
+// no operators or atoms used; HandleRegexp is invoked for each result.
+void RegexpGenerator::Generate() {
+  vector<string> sequence;
+  GeneratePostfix(&sequence, /*nstk=*/0, /*ops=*/0, /*atoms=*/0);
+}
+
+// Runs n rounds of random generation seeded with `seed`; HandleRegexp is
+// called for each expression produced.  acm_ points at a generator local to
+// this call while the rounds run and is reset to NULL before returning.
+void RegexpGenerator::GenerateRandom(int32 seed, int n) {
+  ACMRandom rng(seed);
+  acm_ = &rng;
+
+  for (int round = 0; round < n; ++round) {
+    // Each round starts from a fresh, empty postfix sequence.
+    vector<string> sequence;
+    GenerateRandomPostfix(&sequence, 0, 0, 0);
+  }
+
+  acm_ = NULL;
+}
+
+// Returns how many non-overlapping "%s" slots appear in s, scanning left to
+// right.  The scan works on the C string, so it stops at the first NUL.
+static int CountArgs(const string& s) {
+  int count = 0;
+  for (const char* hit = strstr(s.c_str(), "%s"); hit != NULL;
+       hit = strstr(hit + 2, "%s")) {
+    ++count;
+  }
+  return count;
+}
+
+// Recursively enumerates all postfix command sequences, handing each
+// complete one to RunPostfix to be turned into a regular expression.
+//
+//   post:  the postfix sequence built so far (extended and restored in place)
+//   nstk:  how many values would be on the stack after running post
+//   ops:   how many operators post contains
+//   atoms: how many atoms post contains
+//
+// Example: post = ["a", "b", "%s%s", "c"] gives nstk = 2, ops = 1, atoms = 3.
+//
+// Start the enumeration with GeneratePostfix([empty vector], 0, 0, 0).
+//
+void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
+                                      int ops, int atoms) {
+  // Exactly one value on the stack means post is a complete expression.
+  // It is emitted, and the search still continues below to extend it.
+  if (nstk == 1)
+    RunPostfix(*post);
+
+  // Prune: either too many operators are already used, or too few remain
+  // to fold the stack back down to one value using binary operators.
+  if (ops + nstk - 1 > maxops_)
+    return;
+
+  // Branch on every atom while the atom budget allows.
+  if (atoms < maxatoms_) {
+    for (size_t a = 0; a < atoms_.size(); ++a) {
+      post->push_back(atoms_[a]);
+      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
+      post->pop_back();
+    }
+  }
+
+  // Branch on every operator whose operands are all available on the stack.
+  if (ops < maxops_) {
+    for (size_t o = 0; o < ops_.size(); ++o) {
+      const string& op = ops_[o];
+      const int nargs = CountArgs(op);
+      if (nargs > nstk)
+        continue;
+      post->push_back(op);
+      GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
+      post->pop_back();
+    }
+  }
+}
+
+// Builds one random postfix command sequence, drawing all random choices
+// from acm_.  Returns true as soon as a sequence has been passed to
+// RunPostfix, and false when the current prefix can no longer be completed
+// within the operator budget.
+bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
+                                            int ops, int atoms) {
+  while (true) {
+    // A single stack value is a finished expression, but it is only
+    // accepted when the draw below comes up 0.
+    if (nstk == 1) {
+      if (acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
+        RunPostfix(*post);
+        return true;
+      }
+    }
+
+    // Give up: either too many operators are already used, or too few
+    // remain to fold the stack down to one value using binary operators.
+    if (ops + nstk - 1 > maxops_)
+      return false;
+
+    // Half of the time (budget permitting), try a random operator; it is
+    // used only if the stack holds enough operands for it.
+    if (ops < maxops_ && acm_->Uniform(2) == 0) {
+      const string& op = ops_[acm_->Uniform(ops_.size())];
+      const int nargs = CountArgs(op);
+      if (nargs <= nstk) {
+        post->push_back(op);
+        const bool finished =
+            GenerateRandomPostfix(post, nstk - nargs + 1, ops + 1, atoms);
+        post->pop_back();
+        if (finished)
+          return true;
+      }
+    }
+
+    // Half of the time (budget permitting), try a random atom.
+    if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
+      post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
+      const bool finished =
+          GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
+      post->pop_back();
+      if (finished)
+        return true;
+    }
+  }
+}
+
+// Evaluates the postfix command sequence on a stack of strings and passes
+// the resulting regular expression to HandleRegexp four times: as is,
+// anchored at both ends, anchored at the start, and anchored at the end.
+// Every operator result is wrapped in (?: ) so that no precedence table is
+// needed.
+void RegexpGenerator::RunPostfix(const vector<string>& post) {
+  stack<string> regexps;
+  for (size_t k = 0; k < post.size(); ++k) {
+    const string& cmd = post[k];
+    const int nargs = CountArgs(cmd);
+    if (nargs == 1) {
+      // Unary operator: rewrite the top of the stack.
+      string a = regexps.top();
+      regexps.pop();
+      regexps.push("(?:" + StringPrintf(cmd.c_str(), a.c_str()) + ")");
+    } else if (nargs == 2) {
+      // Binary operator: the right operand is on top, the left one below.
+      string b = regexps.top();
+      regexps.pop();
+      string a = regexps.top();
+      regexps.pop();
+      regexps.push("(?:" +
+                   StringPrintf(cmd.c_str(), a.c_str(), b.c_str()) +
+                   ")");
+    } else {
+      // Commands with no slots are atoms and are pushed unchanged.  Any
+      // other slot count is reported as fatal first and then handled the
+      // same way as an atom.
+      if (nargs != 0)
+        LOG(FATAL) << "Bad operator: " << cmd;
+      regexps.push(cmd);
+    }
+  }
+
+  if (regexps.size() != 1) {
+    // Internal error - should never happen.  Dump the program and whatever
+    // is left on the stack before failing.
+    printf("Bad regexp program:\n");
+    for (size_t k = 0; k < post.size(); ++k) {
+      printf("  %s\n", CEscape(post[k]).c_str());
+    }
+    printf("Stack after running program:\n");
+    for (; !regexps.empty(); regexps.pop()) {
+      printf("  %s\n", CEscape(regexps.top()).c_str());
+    }
+    LOG(FATAL) << "Bad regexp program.";
+  }
+
+  const string& re = regexps.top();
+  HandleRegexp(re);
+  HandleRegexp("^(?:" + re + ")$");
+  HandleRegexp("^(?:" + re + ")");
+  HandleRegexp("(?:" + re + ")$");
+}
+
+// Splits s into a vector of strings, one for each UTF-8 character, using
+// chartorune to find the byte length of each character.
+vector<string> Explode(const StringPiece& s) {
+  vector<string> chars;
+
+  const char* cur = s.begin();
+  while (cur < s.end()) {
+    Rune r;
+    const int width = chartorune(&r, cur);
+    chars.push_back(string(cur, width));
+    cur += width;
+  }
+
+  return chars;
+}
+
+// Splits s at every occurrence of sep and returns the pieces in order.
+// An empty sep splits s into its individual characters (see Explode).
+// A piece is appended for each separator found, even if that piece is
+// empty; the text after the last separator is appended only if non-empty.
+vector<string> Split(const StringPiece& sep, const StringPiece& s) {
+  vector<string> pieces;
+
+  if (sep.size() == 0)
+    return Explode(s);
+
+  const char* piece_start = s.begin();
+  const char* scan = s.begin();
+  while (scan + sep.size() <= s.end()) {
+    if (StringPiece(scan, sep.size()) == sep) {
+      pieces.push_back(string(piece_start, scan - piece_start));
+      // Resume scanning right after the separator.
+      scan += sep.size();
+      piece_start = scan;
+    } else {
+      scan++;
+    }
+  }
+  if (piece_start < s.end())
+    pieces.push_back(string(piece_start, s.end() - piece_start));
+  return pieces;
+}
+
+}  // namespace re2
--- a/re2/re2/testing/regexp_generator.h
+++ b/re2/re2/testing/regexp_generator.h
@ -0,0 +1,70 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression generator: generates all possible
+// regular expressions within given parameters (see below for details).
+
+#ifndef RE2_TESTING_REGEXP_GENERATOR_H__
+#define RE2_TESTING_REGEXP_GENERATOR_H__
+
+#include <string>
+#include <vector>
+#include "util/random.h"
+#include "util/util.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Regular expression generator.
+//
+// Given a set of atom expressions like "a", "b", or "."
+// and operators like "%s*", generates all possible regular expressions
+// using at most maxatoms atoms and maxops operators.
+// For each such expression re, calls HandleRegexp(re).
+//
+// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
+//
+class RegexpGenerator {
+ public:
+  // atoms and ops are copied into the generator.
+  RegexpGenerator(int maxatoms, int maxops, const vector<string>& atoms,
+                  const vector<string>& ops);
+  virtual ~RegexpGenerator() {}
+
+  // Generates all the regular expressions, calling HandleRegexp(re) for each.
+  void Generate();
+
+  // Generates n random regular expressions, calling HandleRegexp(re) for each.
+  void GenerateRandom(int32 seed, int n);
+
+  // Handles a regular expression.  Must be provided by subclass.
+  virtual void HandleRegexp(const string& regexp) = 0;
+
+  // The egrep regexp operators: * + ? | and concatenation.
+  static const vector<string>& EgrepOps();
+
+ private:
+  // Turns a complete postfix command sequence into a regular expression and
+  // passes it to HandleRegexp.
+  void RunPostfix(const vector<string>& post);
+  // Exhaustive and random builders of postfix command sequences.  The last
+  // parameter (lits here, atoms in the definitions) counts the atoms used
+  // so far.
+  void GeneratePostfix(vector<string>* post, int nstk, int ops, int lits);
+  bool GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int lits);
+
+  int maxatoms_;           // Maximum number of atoms allowed in expr.
+  int maxops_;             // Maximum number of ops allowed in expr.
+  vector<string> atoms_;   // Possible atoms.
+  vector<string> ops_;     // Possible ops.
+  ACMRandom* acm_;         // Random generator; GenerateRandom points it at
+                           // a local ACMRandom for the duration of the call
+                           // and resets it to NULL afterwards.
+  DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator);
+};
+
+// Helpers for preparing arguments to RegexpGenerator constructor.
+
+// Returns one string for each UTF-8 character in s.
+vector<string> Explode(const StringPiece& s);
+
+// Splits string everywhere sep is found, returning
+// vector of pieces.
+vector<string> Split(const StringPiece& sep, const StringPiece& s);
+
+}  // namespace re2
+
+#endif  // RE2_TESTING_REGEXP_GENERATOR_H__
--- a/re2/re2/testing/regexp_test.cc
+++ b/re2/re2/testing/regexp_test.cc
@ -0,0 +1,81 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test parse.cc, dump.cc, and tostring.cc.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Test that overflowed ref counts work: take a large number of extra
+// references, drop them all again, and check that exactly the original
+// reference is left.
+TEST(Regexp, BigRef) {
+  const int kExtraRefs = 100000;
+  Regexp* re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
+  for (int taken = 0; taken < kExtraRefs; taken++)
+    re->Incref();
+  for (int left = kExtraRefs; left > 0; left--)
+    re->Decref();
+  CHECK_EQ(re->Ref(), 1);
+  re->Decref();
+}
+
+// Test that very large Concats work.
+// Depends on overflowed ref counts working.
+TEST(Regexp, BigConcat) {
+  Regexp* leaf = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
+  vector<Regexp*> subs(90000, leaf);  // ToString bails out at 100000
+  // Take one extra reference on the shared leaf per vector slot.
+  for (int i = 0; i < subs.size(); i++)
+    leaf->Incref();
+  CHECK_EQ(leaf->Ref(), 1 + subs.size()) << leaf->Ref();
+  Regexp* concat = Regexp::Concat(&subs[0], subs.size(), Regexp::NoParseFlags);
+  CHECK_EQ(concat->ToString(), string(subs.size(), 'x'));
+  // Dropping the concat is expected to release every per-slot reference,
+  // leaving only the one obtained from Parse.
+  concat->Decref();
+  CHECK_EQ(leaf->Ref(), 1) << leaf->Ref();
+  leaf->Decref();
+}
+
+// Checks the name -> index map returned by NamedCaptures() for a regexp in
+// which the name g1 is used by two different groups.
+TEST(Regexp, NamedCaptures) {
+  RegexpStatus status;
+  Regexp* re = Regexp::Parse(
+      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(4, re->NumCaptures());
+
+  const map<string, int>* have = re->NamedCaptures();
+  EXPECT_TRUE(have != NULL);
+  EXPECT_EQ(2, have->size());  // there are only two named groups in
+                               // the regexp: 'g1' and 'g2'.
+
+  // The repeated name g1 is expected to map to its first group, index 1.
+  map<string, int> want;
+  want["g1"] = 1;
+  want["g2"] = 3;
+  EXPECT_EQ(want, *have);
+
+  re->Decref();
+  delete have;
+}
+
+// Checks the index -> name map returned by CaptureNames(): the unnamed
+// group (index 2) is expected to be absent, and both groups named g1 are
+// expected to be listed.
+TEST(Regexp, CaptureNames) {
+  RegexpStatus status;
+  Regexp* re = Regexp::Parse(
+      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(4, re->NumCaptures());
+
+  const map<int, string>* have = re->CaptureNames();
+  EXPECT_TRUE(have != NULL);
+  EXPECT_EQ(3, have->size());
+
+  map<int, string> want;
+  want[1] = "g1";
+  want[3] = "g2";
+  want[4] = "g1";
+  EXPECT_EQ(want, *have);
+
+  re->Decref();
+  delete have;
+}
+
+}  // namespace re2
--- a/re2/re2/testing/required_prefix_test.cc
+++ b/re2/re2/testing/required_prefix_test.cc
@ -0,0 +1,67 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One RequiredPrefix test case.  prefix, foldcase and suffix are checked
+// only when return_value is true.
+struct PrefixTest {
+  const char* regexp;    // Pattern handed to Regexp::Parse.
+  bool return_value;     // Expected result of RequiredPrefix().
+  const char* prefix;    // Expected prefix string.
+  bool foldcase;         // Expected fold-case flag.
+  const char* suffix;    // Expected ToString() of the returned suffix regexp.
+};
+
+// Table of RequiredPrefix cases.  Columns are, in order:
+//   regexp, return_value, prefix, foldcase, suffix
+// Entries that list only the first two columns leave the rest
+// zero-initialized; they are not inspected when return_value is false.
+static PrefixTest tests[] = {
+  // If the regexp is missing a ^, there's no required prefix.
+  { "abc", false },
+  { "", false },
+  { "(?m)^", false },
+
+  // If the regexp immediately goes into
+  // something not a literal match, there's no required prefix.
+  { "^(abc)", false },
+  { "^a*",  false },
+
+  // Otherwise, it should work.
+  { "^abc$", true, "abc", false, "(?-m:$)" },
+  // return_value is a bool: use the literal true, not the string "true"
+  // (which only worked through an implicit pointer-to-bool conversion).
+  { "^abc", true, "abc", false, "" },
+  { "^(?i)abc", true, "abc", true, "" },
+  { "^abcd*", true, "abc", false, "d*" },
+  { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
+  { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
+  { "^☺abc", true, "☺abc", false, "" },
+};
+
+// Runs every entry of tests[] twice: first parsed with LikePerl plus
+// Latin1, then with LikePerl alone (labelled "utf" in failure messages).
+// Each run compares the result of Regexp::RequiredPrefix with the table.
+TEST(RequiredPrefix, SimpleTests) {
+  for (int i = 0; i < arraysize(tests); i++) {
+    const PrefixTest& t = tests[i];
+    for (int pass = 0; pass < 2; pass++) {
+      const bool latin1 = (pass == 0);
+      const char* const encoding = latin1 ? "latin1" : "utf";
+
+      Regexp::ParseFlags flags = Regexp::LikePerl;
+      if (latin1)
+        flags = flags | Regexp::Latin1;
+      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+      CHECK(re) << " " << t.regexp;
+
+      string prefix;
+      bool foldcase = false;
+      Regexp* suffix = NULL;
+      CHECK_EQ(t.return_value,
+               re->RequiredPrefix(&prefix, &foldcase, &suffix))
+        << " " << t.regexp << " " << encoding << " " << re->Dump();
+      if (t.return_value) {
+        CHECK_EQ(prefix, string(t.prefix))
+          << " " << t.regexp << " " << encoding;
+        CHECK_EQ(foldcase, t.foldcase)
+          << " " << t.regexp << " " << encoding;
+        CHECK_EQ(suffix->ToString(), string(t.suffix))
+          << " " << t.regexp << " " << encoding;
+        // The suffix regexp is released only on the success path.
+        suffix->Decref();
+      }
+      re->Decref();
+    }
+  }
+}
+
+}  // namespace re2
--- a/re2/re2/testing/search_test.cc
+++ b/re2/re2/testing/search_test.cc
@ -0,0 +1,325 @@
+// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdlib.h>
+#include <vector>
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/testing/tester.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// One search test case: a pattern and the text to run it against.
+// Both fields are passed unchanged to TestRegexpOnText.
+struct RegexpTest {
+  const char* regexp;  // Pattern to test.
+  const char* text;    // Text to search; one table entry uses NULL.
+};
+
+RegexpTest simple_tests[] = {
+  { "a", "a" },
+  { "a", "zyzzyva" },
+  { "a+", "aa" },
+  { "(a+|b)+", "ab" },
+  { "ab|cd", "xabcdx" },
+  { "h.*od?", "hello\ngoodbye\n" },
+  { "h.*o", "hello\ngoodbye\n" },
+  { "h.*o", "goodbye\nhello\n" },
+  { "h.*o", "hello world" },
+  { "h.*o", "othello, world" },
+  { "[^\\s\\S]", "aaaaaaa" },
+  { "a", "aaaaaaa" },
+  { "a*", "aaaaaaa" },
+  { "a*", "" },
+  { "a*", NULL },
+  { "ab|cd", "xabcdx" },
+  { "a", "cab" },
+  { "a*b", "cab" },
+  { "((((((((((((((((((((x))))))))))))))))))))", "x" },
+  { "[abcd]", "xxxabcdxxx" },
+  { "[^x]", "xxxabcdxxx" },
+  { "[abcd]+", "xxxabcdxxx" },
+  { "[^x]+", "xxxabcdxxx" },
+  { "(fo|foo)", "fo" },
+  { "(foo|fo)", "foo" },
+
+  { "aa", "aA" },
+  { "a", "Aa" },
+  { "a", "A" },
+  { "ABC", "abc" },
+  { "abc", "XABCY" },
+  { "ABC", "xabcy" },
+
+  // Make sure ^ and $ work.
+  // The pathological cases didn't work
+  // in the original grep code.
+  { "foo|bar|[A-Z]", "foo" },
+  { "^(foo|bar|[A-Z])", "foo" },
+  { "(foo|bar|[A-Z])$", "foo\n" },
+  { "(foo|bar|[A-Z])$", "foo" },
+  { "^(foo|bar|[A-Z])$", "foo\n" },
+  { "^(foo|bar|[A-Z])$", "foo" },
+  { "^(foo|bar|[A-Z])$", "bar" },
+  { "^(foo|bar|[A-Z])$", "X" },
+  { "^(foo|bar|[A-Z])$", "XY" },
+  { "^(fo|foo)$", "fo" },
+  { "^(fo|foo)$", "foo" },
+  { "^^(fo|foo)$", "fo" },
+  { "^^(fo|foo)$", "foo" },
+  { "^$", "" },
+  { "^$", "x" },
+  { "^^$", "" },
+  { "^$$", "" },
+  { "^^$", "x" },
+  { "^$$", "x" },
+  { "^^$$", "" },
+  { "^^$$", "x" },
+  { "^^^^^^^^$$$$$$$$", "" },
+  { "^", "x" },
+  { "$", "x" },
+
+  // Word boundaries.
+  { "\\bfoo\\b", "nofoo foo that" },
+  { "a\\b", "faoa x" },
+  { "\\bbar", "bar x" },
+  { "\\bbar", "foo\nbar x" },
+  { "bar\\b", "foobar" },
+  { "bar\\b", "foobar\nxxx" },
+  { "(foo|bar|[A-Z])\\b", "foo" },
+  { "(foo|bar|[A-Z])\\b", "foo\n" },
+  { "\\b", "" },
+  { "\\b", "x" },
+  { "\\b(foo|bar|[A-Z])", "foo" },
+  { "\\b(foo|bar|[A-Z])\\b", "X" },
+  { "\\b(foo|bar|[A-Z])\\b", "XY" },
+  { "\\b(foo|bar|[A-Z])\\b", "bar" },
+  { "\\b(foo|bar|[A-Z])\\b", "foo" },
+  { "\\b(foo|bar|[A-Z])\\b", "foo\n" },
+  { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
+  { "\\b(fo|foo)\\b", "fo" },
+  { "\\b(fo|foo)\\b", "foo" },
+  { "\\b\\b", "" },
+  { "\\b\\b", "x" },
+  { "\\b$", "" },
+  { "\\b$", "x" },
+  { "\\b$", "y x" },
+  { "\\b.$", "x" },
+  { "^\\b(fo|foo)\\b", "fo" },
+  { "^\\b(fo|foo)\\b", "foo" },
+  { "^\\b", "" },
+  { "^\\b", "x" },
+  { "^\\b\\b", "" },
+  { "^\\b\\b", "x" },
+  { "^\\b$", "" },
+  { "^\\b$", "x" },
+  { "^\\b.$", "x" },
+  { "^\\b.\\b$", "x" },
+  { "^^^^^^^^\\b$$$$$$$", "" },
+  { "^^^^^^^^\\b.$$$$$$", "x" },
+  { "^^^^^^^^\\b$$$$$$$", "x" },
+
+  // Non-word boundaries.
+  { "\\Bfoo\\B", "n foo xfoox that" },
+  { "a\\B", "faoa x" },
+  { "\\Bbar", "bar x" },
+  { "\\Bbar", "foo\nbar x" },
+  { "bar\\B", "foobar" },
+  { "bar\\B", "foobar\nxxx" },
+  { "(foo|bar|[A-Z])\\B", "foox" },
+  { "(foo|bar|[A-Z])\\B", "foo\n" },
+  { "\\B", "" },
+  { "\\B", "x" },
+  { "\\B(foo|bar|[A-Z])", "foo" },
+  { "\\B(foo|bar|[A-Z])\\B", "xXy" },
+  { "\\B(foo|bar|[A-Z])\\B", "XY" },
+  { "\\B(foo|bar|[A-Z])\\B", "XYZ" },
+  { "\\B(foo|bar|[A-Z])\\B", "abara" },
+  { "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
+  { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
+  { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
+  { "\\B(fo|foo)\\B", "xfoo" },
+  { "\\B(foo|fo)\\B", "xfooo" },
+  { "\\B\\B", "" },
+  { "\\B\\B", "x" },
+  { "\\B$", "" },
+  { "\\B$", "x" },
+  { "\\B$", "y x" },
+  { "\\B.$", "x" },
+  { "^\\B(fo|foo)\\B", "fo" },
+  { "^\\B(fo|foo)\\B", "foo" },
+  { "^\\B", "" },
+  { "^\\B", "x" },
+  { "^\\B\\B", "" },
+  { "^\\B\\B", "x" },
+  { "^\\B$", "" },
+  { "^\\B$", "x" },
+  { "^\\B.$", "x" },
+  { "^\\B.\\B$", "x" },
+  { "^^^^^^^^\\B$$$$$$$", "" },
+  { "^^^^^^^^\\B.$$$$$$", "x" },
+  { "^^^^^^^^\\B$$$$$$$", "x" },
+
+  // PCRE uses only ASCII for \b computation.
+  // All non-ASCII are *not* word characters.
+  { "\\bx\\b", "x" },
+  { "\\bx\\b", "x>" },
+  { "\\bx\\b", "<x" },
+  { "\\bx\\b", "<x>" },
+  { "\\bx\\b", "ax" },
+  { "\\bx\\b", "xb" },
+  { "\\bx\\b", "axb" },
+  { "\\bx\\b", "«x" },
+  { "\\bx\\b", "x»" },
+  { "\\bx\\b", "«x»" },
+  { "\\bx\\b", "axb" },
+  { "\\bx\\b", "áxβ" },
+  { "\\Bx\\B", "axb" },
+  { "\\Bx\\B", "áxβ" },
+
+  // Weird boundary cases.
+  { "^$^$", "" },
+  { "^$^", "" },
+  { "$^$", "" },
+
+  { "^$^$", "x" },
+  { "^$^", "x" },
+  { "$^$", "x" },
+
+  { "^$^$", "x\ny" },
+  { "^$^", "x\ny" },
+  { "$^$", "x\ny" },
+
+  { "^$^$", "x\n\ny" },
+  { "^$^", "x\n\ny" },
+  { "$^$", "x\n\ny" },
+
+  { "^(foo\\$)$", "foo$bar" },
+  { "(foo\\$)", "foo$bar" },
+  { "^...$", "abc" },
+
+  // UTF-8
+  { "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
+  { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+  { "^...$", ".\xe6\x9c\xac." },
+
+  { "^\\C\\C\\C$", "\xe6\x9c\xac" },
+  { "^\\C$", "\xe6\x9c\xac" },
+  { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+
+  // Latin1
+  { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+  { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+  { "^...$", ".\xe6\x9c\xac." },
+  { "^.....$", ".\xe6\x9c\xac." },
+
+  // Perl v Posix
+  { "\\B(fo|foo)\\B", "xfooo" },
+  { "(fo|foo)", "foo" },
+
+  // Octal escapes.
+  { "\\141", "a" },
+  { "\\060", "0" },
+  { "\\0600", "00" },
+  { "\\608", "08" },
+  { "\\01", "\01" },
+  { "\\018", "\01" "8" },
+
+  // Hexadecimal escapes
+  { "\\x{61}", "a" },
+  { "\\x61", "a" },
+  { "\\x{00000061}", "a" },
+
+  // Unicode scripts.
+  { "\\p{Greek}+", "aαβb" },
+  { "\\P{Greek}+", "aαβb" },
+  { "\\p{^Greek}+", "aαβb" },
+  { "\\P{^Greek}+", "aαβb" },
+
+  // Unicode properties.  Nd is decimal number.  N is any number.
+  { "[^0-9]+",  "abc123" },
+  { "\\p{Nd}+", "abc123²³¼½¾₀₉" },
+  { "\\p{^Nd}+", "abc123²³¼½¾₀₉" },
+  { "\\P{Nd}+", "abc123²³¼½¾₀₉" },
+  { "\\P{^Nd}+", "abc123²³¼½¾₀₉" },
+  { "\\pN+", "abc123²³¼½¾₀₉" },
+  { "\\p{N}+", "abc123²³¼½¾₀₉" },
+  { "\\p{^N}+", "abc123²³¼½¾₀₉" },
+
+  { "\\p{Any}+", "abc123" },
+
+  // Character classes & case folding.
+  { "(?i)[@-A]+", "@AaB" },  // matches @Aa but not B
+  { "(?i)[A-Z]+", "aAzZ" },
+  { "(?i)[^\\\\]+", "Aa\\" },  // \\ is between A-Z and a-z -
+                               // splits the ranges in an interesting way.
+
+  // would like to use, but PCRE mishandles in full-match, non-greedy mode
+  // { "(?i)[\\\\]+", "Aa" },
+
+  { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+
+  // Character classes & case folding.
+  { "[@-A]+", "@AaB" },
+  { "[A-Z]+", "aAzZ" },
+  { "[^\\\\]+", "Aa\\" },
+  { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+  
+  // Anchoring.  (^abc in aabcdef was a former bug)
+  // The tester checks for a match in the text and
+  // subpieces of the text with a byte removed on either side.
+  { "^abc", "abcdef" },
+  { "^abc", "aabcdef" },
+  { "^[ay]*[bx]+c", "abcdef" },
+  { "^[ay]*[bx]+c", "aabcdef" },
+  { "def$", "abcdef" },
+  { "def$", "abcdeff" },
+  { "d[ex][fy]$", "abcdef" },
+  { "d[ex][fy]$", "abcdeff" },
+  { "[dz][ex][fy]$", "abcdef" },
+  { "[dz][ex][fy]$", "abcdeff" },
+  { "(?m)^abc", "abcdef" },
+  { "(?m)^abc", "aabcdef" },
+  { "(?m)^[ay]*[bx]+c", "abcdef" },
+  { "(?m)^[ay]*[bx]+c", "aabcdef" },
+  { "(?m)def$", "abcdef" },
+  { "(?m)def$", "abcdeff" },
+  { "(?m)d[ex][fy]$", "abcdef" },
+  { "(?m)d[ex][fy]$", "abcdeff" },
+  { "(?m)[dz][ex][fy]$", "abcdef" },
+  { "(?m)[dz][ex][fy]$", "abcdeff" },
+  { "^", "a" },
+  { "^^", "a" },
+
+  // Context.
+  // The tester checks for a match in the text and
+  // subpieces of the text with a byte removed on either side.
+  { "a", "a" },
+  { "ab*", "a" },
+  { "a\\C*", "a" },
+  
+  // Former bugs.
+  { "a\\C*|ba\\C", "baba" },
+};
+
+// Runs TestRegexpOnText over every entry of simple_tests.  Failing entries
+// are counted rather than stopping the loop, and the count must end at zero.
+TEST(Regexp, SearchTests) {
+  int failures = 0;
+  for (int i = 0; i < arraysize(simple_tests); i++) {
+    const RegexpTest& tc = simple_tests[i];
+    const bool passed = TestRegexpOnText(tc.regexp, tc.text);
+    if (!passed)
+      failures++;
+
+#ifdef LOGGING
+    // Build a dummy ExhaustiveTest call that will trigger just
+    // this one test, so that we log the test case.
+    vector<string> atom, alpha, ops;
+    atom.push_back(StringPiece(tc.regexp).as_string());
+    alpha.push_back(StringPiece(tc.text).as_string());
+    ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", "");
+#endif
+
+  }
+  EXPECT_EQ(failures, 0);
+}
+
+}  // namespace re2
--- a/re2/re2/testing/set_test.cc
+++ b/re2/re2/testing/set_test.cc
@ -0,0 +1,102 @@
+// Copyright 2010 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <vector>
+
+#include "util/test.h"
+#include "re2/re2.h"
+#include "re2/set.h"
+
+namespace re2 {
+
+// Unanchored set of two patterns: each Match call is expected to report the
+// indices of every pattern found somewhere in the text.
+TEST(Set, Unanchored) {
+  RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+  // The test expects valid patterns to be numbered 0, 1, ... in the order
+  // they are added; the malformed "(" yields -1 and uses up no index.
+  CHECK_EQ(s.Add("foo", NULL), 0);
+  CHECK_EQ(s.Add("(", NULL), -1);
+  CHECK_EQ(s.Add("bar", NULL), 1);
+
+  CHECK_EQ(s.Compile(), true);
+
+  // Both patterns occur in "foobar".
+  vector<int> v;
+  CHECK_EQ(s.Match("foobar", &v), true);
+  CHECK_EQ(v.size(), 2);
+  CHECK_EQ(v[0], 0);
+  CHECK_EQ(v[1], 1);
+
+  // Only "foo" occurs.
+  v.clear();
+  CHECK_EQ(s.Match("fooba", &v), true);
+  CHECK_EQ(v.size(), 1);
+  CHECK_EQ(v[0], 0);
+
+  // Only "bar" occurs.
+  v.clear();
+  CHECK_EQ(s.Match("oobar", &v), true);
+  CHECK_EQ(v.size(), 1);
+  CHECK_EQ(v[0], 1);
+}
+
+// Like Set.Unanchored, but the second pattern ("foobar") starts with the
+// first one ("foo"), so the two patterns share a common prefix.
+TEST(Set, UnanchoredFactored) {
+  RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+  // The malformed "(" is expected to yield -1 and use up no index.
+  CHECK_EQ(s.Add("foo", NULL), 0);
+  CHECK_EQ(s.Add("(", NULL), -1);
+  CHECK_EQ(s.Add("foobar", NULL), 1);
+
+  CHECK_EQ(s.Compile(), true);
+
+  // "foobar" contains both "foo" and "foobar".
+  vector<int> v;
+  CHECK_EQ(s.Match("foobar", &v), true);
+  CHECK_EQ(v.size(), 2);
+  CHECK_EQ(v[0], 0);
+  CHECK_EQ(v[1], 1);
+
+  // Both patterns are still found when the match is not at the start.
+  v.clear();
+  CHECK_EQ(s.Match("obarfoobaroo", &v), true);
+  CHECK_EQ(v.size(), 2);
+  CHECK_EQ(v[0], 0);
+  CHECK_EQ(v[1], 1);
+
+  // Only the shorter pattern occurs.
+  v.clear();
+  CHECK_EQ(s.Match("fooba", &v), true);
+  CHECK_EQ(v.size(), 1);
+  CHECK_EQ(v[0], 0);
+
+  // Neither pattern occurs.
+  v.clear();
+  CHECK_EQ(s.Match("oobar", &v), false);
+  CHECK_EQ(v.size(), 0);
+}
+
+// Set anchored at both ends: a pattern is expected to be reported only when
+// it matches the entire text.
+TEST(Set, Anchored) {
+  RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
+
+  // The malformed "(" is expected to yield -1 and use up no index.
+  CHECK_EQ(s.Add("foo", NULL), 0);
+  CHECK_EQ(s.Add("(", NULL), -1);
+  CHECK_EQ(s.Add("bar", NULL), 1);
+
+  CHECK_EQ(s.Compile(), true);
+
+  // None of these texts is exactly "foo" or "bar".
+  vector<int> v;
+  CHECK_EQ(s.Match("foobar", &v), false);
+  CHECK_EQ(v.size(), 0);
+
+  CHECK_EQ(s.Match("fooba", &v), false);
+  CHECK_EQ(v.size(), 0);
+
+  CHECK_EQ(s.Match("oobar", &v), false);
+  CHECK_EQ(v.size(), 0);
+
+  // Exact matches.  v is not cleared between these calls, so the size
+  // checks below rely on Match not accumulating results across calls.
+  CHECK_EQ(s.Match("foo", &v), true);
+  CHECK_EQ(v.size(), 1);
+  CHECK_EQ(v[0], 0);
+
+  CHECK_EQ(s.Match("bar", &v), true);
+  CHECK_EQ(v.size(), 1);
+  CHECK_EQ(v[0], 1);
+
+}
+
+}  // namespace re2
+
--- a/re2/re2/testing/simplify_test.cc
+++ b/re2/re2/testing/simplify_test.cc
@ -0,0 +1,167 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test simplify.cc.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// One simplification test case.
+struct Test {
+  const char* regexp;      // Pattern handed to Regexp::Parse.
+  const char* simplified;  // Expected ToString() of the simplified regexp.
+};
+
+static Test tests[] = {
+  // Already-simple constructs
+  { "a", "a" },
+  { "ab", "ab" },
+  { "a|b", "[a-b]" },
+  { "ab|cd", "ab|cd" },
+  { "(ab)*", "(ab)*" },
+  { "(ab)+", "(ab)+" },
+  { "(ab)?", "(ab)?" },
+  { ".", "." },
+  { "^", "^" },
+  { "$", "$" },
+  { "[ac]", "[ac]" },
+  { "[^ac]", "[^ac]" },
+
+  // Posix character classes
+  { "[[:alnum:]]", "[0-9A-Za-z]" },
+  { "[[:alpha:]]", "[A-Za-z]" },
+  { "[[:blank:]]", "[\\t ]" },
+  { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
+  { "[[:digit:]]", "[0-9]" },
+  { "[[:graph:]]", "[!-~]" },
+  { "[[:lower:]]", "[a-z]" },
+  { "[[:print:]]", "[ -~]" },
+  { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
+  { "[[:space:]]" , "[\\t-\\r ]" },
+  { "[[:upper:]]", "[A-Z]" },
+  { "[[:xdigit:]]", "[0-9A-Fa-f]" },
+
+  // Perl character classes
+  { "\\d", "[0-9]" },
+  { "\\s", "[\\t-\\n\\f-\\r ]" },
+  { "\\w", "[0-9A-Z_a-z]" },
+  { "\\D", "[^0-9]" },
+  { "\\S", "[^\\t-\\n\\f-\\r ]" },
+  { "\\W", "[^0-9A-Z_a-z]" },
+  { "[\\d]", "[0-9]" },
+  { "[\\s]", "[\\t-\\n\\f-\\r ]" },
+  { "[\\w]", "[0-9A-Z_a-z]" },
+  { "[\\D]", "[^0-9]" },
+  { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
+  { "[\\W]", "[^0-9A-Z_a-z]" },
+
+  // Posix repetitions
+  { "a{1}", "a" },
+  { "a{2}", "aa" },
+  { "a{5}", "aaaaa" },
+  { "a{0,1}", "a?" },
+  // The next three are illegible because Simplify inserts (?:)
+  // parens instead of () parens to avoid creating extra
+  // captured subexpressions.  The comments show a version with fewer parens.
+  { "(a){0,2}",                   "(?:(a)(a)?)?"     },  //       (aa?)?
+  { "(a){0,4}",       "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  //   (a(a(aa?)?)?)?
+  { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  // aa(a(a(aa?)?)?)?
+  { "a{0,2}",           "(?:aa?)?"     },  //       (aa?)?
+  { "a{0,4}",   "(?:a(?:a(?:aa?)?)?)?" },  //   (a(a(aa?)?)?)?
+  { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" },  // aa(a(a(aa?)?)?)?
+  { "a{0,}", "a*" },
+  { "a{1,}", "a+" },
+  { "a{2,}", "aa+" },
+  { "a{5,}", "aaaaa+" },
+
+  // Test that operators simplify their arguments.
+  // (Simplify used to not simplify arguments to a {} repeat.)
+  { "(?:a{1,}){1,}", "a+" },
+  { "(a{1,}b{1,})", "(a+b+)" },
+  { "a{1,}|b{1,}", "a+|b+" },
+  { "(?:a{1,})*", "(?:a+)*" },
+  { "(?:a{1,})+", "a+" },
+  { "(?:a{1,})?", "(?:a+)?" },
+  { "a{0}", "" },
+
+  // Character class simplification
+  { "[ab]", "[a-b]" },
+  { "[a-za-za-z]", "[a-z]" },
+  { "[A-Za-zA-Za-z]", "[A-Za-z]" },
+  { "[ABCDEFGH]", "[A-H]" },
+  { "[AB-CD-EF-GH]", "[A-H]" },
+  { "[W-ZP-XE-R]", "[E-Z]" },
+  { "[a-ee-gg-m]", "[a-m]" },
+  { "[a-ea-ha-m]", "[a-m]" },
+  { "[a-ma-ha-e]", "[a-m]" },
+  { "[a-zA-Z0-9 -~]", "[ -~]" },
+
+  // Empty character classes
+  { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
+
+  // Full character classes
+  { "[[:cntrl:][:^cntrl:]]", "." },
+
+  // Unicode case folding.
+  { "(?i)A", "[Aa]" },
+  { "(?i)a", "[Aa]" },
+  { "(?i)K", "[Kk\\x{212a}]" },
+  { "(?i)k", "[Kk\\x{212a}]" },
+  { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
+  { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
+  { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
+  { "(?i)[\\x00-\\x{10ffff}]", "." },
+
+  // Empty string as a regular expression.
+  // Empty string must be preserved inside parens in order
+  // to make submatches work right, so these are less
+  // interesting than they used to be.  ToString inserts
+  // explicit (?:) in place of non-parenthesized empty strings,
+  // to make them easier to spot for other parsers.
+  { "(a|b|)", "([a-b]|(?:))" },
+  { "(|)", "()" },
+  { "a()", "a()" },
+  { "(()|())", "(()|())" },
+  { "(a|)", "(a|(?:))" },
+  { "ab()cd()", "ab()cd()" },
+  { "()", "()" },
+  { "()*", "()*" },
+  { "()+", "()+" },
+  { "()?" , "()?" },
+  { "(){0}", "" },
+  { "(){1}", "()" },
+  { "(){1,}", "()+" },
+  { "(){0,2}", "(?:()()?)?" },
+};
+
+// Parses each entry of tests[], simplifies it, and compares the simplified
+// regexp's ToString() with the expected text.
+TEST(TestSimplify, SimpleRegexps) {
+  for (int i = 0; i < arraysize(tests); i++) {
+    RegexpStatus status;
+    VLOG(1) << "Testing " << tests[i].regexp;
+    Regexp* parsed = Regexp::Parse(tests[i].regexp,
+                                   Regexp::MatchNL | (Regexp::LikePerl &
+                                                      ~Regexp::OneLine),
+                                   &status);
+    CHECK(parsed != NULL) << " " << tests[i].regexp << " " << status.Text();
+    Regexp* simple = parsed->Simplify();
+    CHECK(simple != NULL);
+
+    // Check that already-simple regexps don't allocate new ones: when the
+    // expected text equals the input, Simplify must return the same object.
+    const bool already_simple =
+        strcmp(tests[i].regexp, tests[i].simplified) == 0;
+    if (already_simple) {
+      CHECK(parsed == simple) << " " << tests[i].regexp
+        << " " << parsed->ToString() << " " << simple->ToString();
+    }
+
+    EXPECT_EQ(tests[i].simplified, simple->ToString())
+      << " " << tests[i].regexp << " " << simple->Dump();
+
+    // Both the parsed and the simplified regexp hold a reference to drop.
+    parsed->Decref();
+    simple->Decref();
+  }
+}
+
+}  // namespace re2
--- a/re2/re2/testing/string_generator.cc
+++ b/re2/re2/testing/string_generator.cc
@ -0,0 +1,113 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// String generator: generates all possible strings of up to
+// maxlen letters using the set of letters in alphabet.
+// Fetch strings using a Java-like Next()/HasNext() interface.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Sets up a generator for strings of at most maxlen letters taken from
+// alphabet.  The generator starts in sequential (non-random) mode with no
+// NULL pending.
+StringGenerator::StringGenerator(int maxlen, const vector<string>& alphabet)
+    : maxlen_(alphabet.empty() ? 0 : maxlen),  // No letters: only "".
+      alphabet_(alphabet),
+      hasnext_(true),  // digits_ starts empty, so Next() returns "" first.
+      generate_null_(false),
+      random_(false),
+      nrandom_(0),
+      acm_(NULL) {
+}
+
+// Frees the random number generator; acm_ stays NULL unless Random()
+// was called, in which case this deletes the ACMRandom it allocated.
+StringGenerator::~StringGenerator() {
+  delete acm_;
+}
+
+// Rewinds the generator: iteration starts over from the empty string,
+// random mode is switched off, and any pending NULL request is dropped.
+void StringGenerator::Reset() {
+  generate_null_ = false;
+  random_ = false;
+  nrandom_ = 0;
+  digits_.clear();
+  hasnext_ = true;
+}
+
+// Advances digits_, read as a number whose digits are indices into
+// alphabet_, to the next value of the enumeration.
+// Returns true if digits_ now describes a string that has not been
+// produced yet, false once all strings of up to maxlen_ letters are used.
+bool StringGenerator::IncrementDigits() {
+  // Do the size comparisons in int.  digits_ holds ints and maxlen_ is an
+  // int; comparing them against size_t would convert a negative maxlen_
+  // to a huge unsigned bound and let digits_ grow without limit.
+  const int nalphabet = static_cast<int>(alphabet_.size());
+  const int ndigits = static_cast<int>(digits_.size());
+
+  // First try to increment the current number, carrying to the left.
+  for (int i = ndigits - 1; i >= 0; i--) {
+    if (++digits_[i] < nalphabet)
+      return true;
+    digits_[i] = 0;
+  }
+
+  // If that failed, make a longer number (all digits are zero by now).
+  if (ndigits < maxlen_) {
+    digits_.push_back(0);
+    return true;
+  }
+
+  return false;
+}
+
+// Overwrites digits_ with a random length in [0, maxlen_] and random
+// alphabet indices.  Uses up one of the nrandom_ remaining strings;
+// returns false, leaving digits_ alone, once none are left.
+bool StringGenerator::RandomDigits() {
+  nrandom_--;
+  if (nrandom_ <= 0)
+    return false;
+
+  // Choose the length first, then each letter from left to right.
+  int len = acm_->Uniform(maxlen_+1);
+  digits_.resize(len);
+  for (vector<int>::iterator d = digits_.begin(); d != digits_.end(); ++d)
+    *d = acm_->Uniform(alphabet_.size());
+  return true;
+}
+
+// Produces the string currently described by digits_ and then advances
+// the generator (IncrementDigits or RandomDigits), so that HasNext()
+// already knows whether another string follows.
+// A pending GenerateNULL() request is served first: it returns a NULL
+// StringPiece and leaves digits_ and hasnext_ untouched.
+const StringPiece& StringGenerator::Next() {
+  CHECK(hasnext_);
+
+  if (generate_null_) {
+    generate_null_ = false;
+    sp_ = NULL;
+    return sp_;
+  }
+
+  // Spell out the current digits using the alphabet.
+  s_.clear();
+  for (vector<int>::const_iterator d = digits_.begin();
+       d != digits_.end(); ++d) {
+    s_.append(alphabet_[*d]);
+  }
+
+  // Line up the following string before handing this one out.
+  if (random_)
+    hasnext_ = RandomDigits();
+  else
+    hasnext_ = IncrementDigits();
+
+  sp_ = s_;
+  return sp_;
+}
+
+// Switches the generator to random mode with a budget of n strings,
+// seeding the random number generator (created on first use) with seed.
+void StringGenerator::Random(int32 seed, int n) {
+  if (acm_ != NULL) {
+    acm_->Reset(seed);
+  } else {
+    acm_ = new ACMRandom(seed);
+  }
+
+  nrandom_ = n;
+  random_ = true;
+  hasnext_ = (n > 0);
+}
+
+// Arranges for the next call to Next() to return a NULL StringPiece, and
+// marks the generator as having a next value so that call is allowed.
+void StringGenerator::GenerateNULL() {
+  hasnext_ = true;
+  generate_null_ = true;
+}
+
+}  // namespace re2
+
--- a/re2/re2/testing/string_generator.h
+++ b/re2/re2/testing/string_generator.h
@ -0,0 +1,58 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// String generator: generates all possible strings of up to
+// maxlen letters using the set of letters in alphabet.
+// Fetch strings using a Java-like Next()/HasNext() interface.
+
+#ifndef RE2_TESTING_STRING_GENERATOR_H__
+#define RE2_TESTING_STRING_GENERATOR_H__
+
+#include <string>
+#include <vector>
+#include "util/util.h"
+#include "util/random.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Enumerates strings over a fixed alphabet, shortest first, or produces
+// random strings after a call to Random().
+class StringGenerator {
+ public:
+  // alphabet holds one string per letter; maxlen is the maximum number of
+  // letters in a generated string.  An empty alphabet yields only "".
+  StringGenerator(int maxlen, const vector<string>& alphabet);
+  ~StringGenerator();
+
+  // Returns the next string.  HasNext() must be true.  The result refers
+  // to storage inside the generator that the following Next() overwrites.
+  const StringPiece& Next();
+
+  // Returns whether Next() may be called.
+  bool HasNext() { return hasnext_; }
+
+  // Resets generator to start sequence over.
+  void Reset();
+
+  // Puts the generator in random mode with a budget of n strings.
+  // Next() returns the string already lined up and then picks random
+  // digits for the call after it.
+  void Random(int32 seed, int n);
+
+  // Causes generator to emit a NULL as the next call.
+  void GenerateNULL();
+
+ private:
+  // Step digits_ to the next string; return false when there is none.
+  bool IncrementDigits();
+  bool RandomDigits();
+
+  // Global state.
+  int maxlen_;               // Maximum length string to generate.
+  vector<string> alphabet_;  // Alphabet, one string per letter.
+
+  // Iteration state.
+  StringPiece sp_;           // Last StringPiece returned by Next().
+  string s_;                 // String data in last StringPiece returned by Next().
+  bool hasnext_;             // Whether Next() can be called again.
+  vector<int> digits_;       // Alphabet indices for next string.
+  bool generate_null_;       // Whether to generate a NULL StringPiece next.
+  bool random_;              // Whether generated strings are random.
+  int nrandom_;              // Number of random strings left to generate.
+  ACMRandom* acm_;           // Random number generator; NULL until Random().
+  DISALLOW_EVIL_CONSTRUCTORS(StringGenerator);
+};
+
+}  // namespace re2
+
+#endif  // RE2_TESTING_STRING_GENERATOR_H__
--- a/re2/re2/testing/string_generator_test.cc
+++ b/re2/re2/testing/string_generator_test.cc
@ -0,0 +1,109 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test StringGenerator.
+
+#include <stdlib.h>
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/testing/string_generator.h"
+#include "re2/testing/regexp_generator.h"
+
+namespace re2 {
+
+// Returns i raised to the power e, computed by repeated multiplication.
+// An exponent of zero or less gives 1.
+static int64 IntegerPower(int i, int e) {
+  int64 result = 1;
+  for (int k = 0; k < e; k++)
+    result *= i;
+  return result;
+}
+
+// Drives a StringGenerator built from len and alphabet and verifies that:
+//   * every generated character comes from alphabet.
+//   * no string has more than len letters.
+//   * lengths never decrease, and strings of equal length come out in
+//     strictly increasing order (so no string appears twice).
+//   * the number of strings is the sum of alpha^i for i in [0, len].
+//
+// When donull is set, first asks for a NULL StringPiece and checks it.
+// Assumes that the alphabet is sorted, so that the generated
+// strings can just be compared lexicographically.
+static void RunTest(int len, string alphabet, bool donull) {
+  StringGenerator g(len, Explode(alphabet));
+
+  if (donull) {
+    g.GenerateNULL();
+    EXPECT_TRUE(g.HasNext());
+    StringPiece null_piece = g.Next();
+    EXPECT_EQ(null_piece.data(), static_cast<const char*>(NULL));
+    EXPECT_EQ(null_piece.size(), 0);
+  }
+
+  int count = 0;
+  int prev_len = -1;
+  string prev;
+  while (g.HasNext()) {
+    string cur = g.Next().as_string();
+    count++;
+
+    // Every rune of cur must be a member of alphabet.
+    const char* p = cur.c_str();
+    while (*p != '\0') {
+      Rune r;
+      p += chartorune(&r, p);
+      EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
+    }
+
+    // cur must follow prev: longer, or same length and greater.
+    int cur_len = utflen(cur.c_str());
+    EXPECT_LE(cur_len, len);
+    if (cur_len > prev_len) {
+      prev_len = cur_len;
+    } else {
+      EXPECT_EQ(prev_len, cur_len);
+      EXPECT_LT(prev, cur);
+    }
+    prev = cur;
+  }
+
+  // Check total string count.
+  int alpha = utflen(alphabet.c_str());
+  int max_len = (alpha == 0) ? 0 : len;  // Degenerate case: only "".
+  int64 expected = 0;
+  for (int i = 0; i <= max_len; i++)
+    expected += IntegerPower(alpha, i);
+  EXPECT_EQ(count, expected);
+}
+
+// Maximum length zero: only the empty string is expected.
+TEST(StringGenerator, NoLength) {
+  RunTest(0, "abc", false);
+}
+
+// Maximum length zero and no letters.
+TEST(StringGenerator, NoLengthNoAlphabet) {
+  RunTest(0, "", false);
+}
+
+// No letters: the degenerate case in which only "" can be generated.
+TEST(StringGenerator, NoAlphabet) {
+  RunTest(5, "", false);
+}
+
+// Ordinary ASCII alphabet.
+TEST(StringGenerator, Simple) {
+  RunTest(3, "abc", false);
+}
+
+// Alphabet containing a multi-byte UTF-8 letter ("\xE2\x98\xBA").
+TEST(StringGenerator, UTF8) {
+  RunTest(4, "abc\xE2\x98\xBA", false);
+}
+
+// Repeats all of the configurations above with a leading NULL StringPiece.
+TEST(StringGenerator, GenNULL) {
+  RunTest(0, "abc", true);
+  RunTest(0, "", true);
+  RunTest(5, "", true);
+  RunTest(3, "abc", true);
+  RunTest(4, "abc\xE2\x98\xBA", true);
+}
+
+}  // namespace re2
--- a/re2/re2/testing/tester.cc
+++ b/re2/re2/testing/tester.cc
@ -0,0 +1,640 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression engine tester -- test all the implementations against each other.
+
+#include "util/util.h"
+#include "util/flags.h"
+#include "re2/testing/tester.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+// Command-line flags for the tester.
+
+// Log the compiled forward program of each regexp (TestInstance ctor).
+DEFINE_bool(dump_prog, false, "dump regexp program");
+// Also log engines that agreed or were skipped (TestInstance::RunCase).
+DEFINE_bool(log_okay, false, "log successful runs");
+// Log the compiled reverse program of each regexp (TestInstance ctor).
+DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
+
+// Counted down by TestInstance::RunCase on each failing case; reaching
+// zero is fatal.
+DEFINE_int32(max_regexp_failures, 100,
+             "maximum number of regexp test failures (-1 = unlimited)");
+
+// Empty selects every engine; otherwise Engines() enables each engine
+// whose name contains this string.
+DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
+
+namespace re2 {
+
+// Capacity of Result::submatch: the overall match plus 16 capture groups.
+// Searches on regexps with more groups record only the first kMaxSubmatch.
+enum {
+  kMaxSubmatch = 1+16,  // $0...$16
+};
+
+// Display name of each engine, indexed by Engine value.
+// The order must match enum Engine in tester.h.
+const char* engine_types[kEngineMax] = {
+  "Backtrack",
+  "NFA",
+  "DFA",
+  "DFA1",
+  "OnePass",
+  "BitState",
+  "RE2",
+  "RE2a",
+  "RE2b",
+  "PCRE",
+};
+
+// Returns the display name of engine t.  A value outside engine_types,
+// or one without a name, is rendered as "type<N>".
+static string EngineString(Engine t) {
+  const bool in_range = t >= 0 && t < arraysize(engine_types);
+  if (in_range && engine_types[t] != NULL)
+    return engine_types[t];
+  return StringPrintf("type%d", static_cast<int>(t));
+}
+
+// Returns the bit mask of engines to test; bit (1<<e) stands for Engine e.
+// The mask is derived from --regexp_engines on the first call, logged,
+// and cached for all later calls.
+static uint32 Engines() {
+  static bool did_parse;
+  static uint32 cached_engines;
+
+  if (!did_parse) {
+    if (FLAGS_regexp_engines.empty()) {
+      // No selection given: start with every engine.
+      cached_engines = ~0;
+    } else {
+      // Enable each engine whose name contains the flag value.
+      for (int e = 0; e < kEngineMax; e++) {
+        const string name = EngineString(static_cast<Engine>(e));
+        if (strstr(name.c_str(), FLAGS_regexp_engines.c_str()) != NULL)
+          cached_engines |= 1<<e;
+      }
+    }
+
+    if (cached_engines == 0)
+      LOG(INFO) << "Warning: no engines enabled.";
+    if (!UsingPCRE)
+      cached_engines &= ~(1<<kEnginePCRE);
+
+    // Report the final selection once.
+    for (int e = 0; e < kEngineMax; e++) {
+      if (cached_engines & (1<<e))
+        LOG(INFO) << EngineString(static_cast<Engine>(e)) << " enabled";
+    }
+    did_parse = true;
+  }
+
+  return cached_engines;
+}
+
+// The result of running a match.
+// RunSearch zeroes the whole struct before filling it in, and zeroes
+// submatch again when the search did not match.
+struct TestInstance::Result {
+  bool skipped;         // test skipped: wasn't applicable
+  bool matched;         // found a match
+  bool untrusted;       // don't really trust the answer
+  bool have_submatch;   // computed all submatch info
+  bool have_submatch0;  // computed just submatch[0]
+  StringPiece submatch[kMaxSubmatch];  // $0 first, then capture groups
+};
+
+// Shorthand used by the file-local helpers below.
+typedef TestInstance::Result Result;
+
+// Renders capture s as "(a,b)", the offsets of its start and end measured
+// from the beginning of text.  A capture with NULL data prints as "(?,?)".
+static string FormatCapture(const StringPiece& text, const StringPiece& s) {
+  if (s.begin() != NULL) {
+    const int start = static_cast<int>(s.begin() - text.begin());
+    const int limit = static_cast<int>(s.end() - text.begin());
+    return StringPrintf("(%d,%d)", start, limit);
+  }
+  return "(?,?)";
+}
+
+// Reports whether any byte of text lies outside ASCII (is >= 0x80).
+static bool NonASCII(const StringPiece& text) {
+  for (const char* p = text.begin(); p != text.end(); ++p) {
+    if ((uint8)*p >= 0x80)
+      return true;
+  }
+  return false;
+}
+
+// Returns a human-readable name for the match kind, or "???" for a value
+// that is not one of the known kinds.
+static string FormatKind(Prog::MatchKind kind) {
+  if (kind == Prog::kFullMatch)
+    return "full match";
+  if (kind == Prog::kLongestMatch)
+    return "longest match";
+  if (kind == Prog::kFirstMatch)
+    return "first match";
+  if (kind == Prog::kManyMatch)
+    return "many match";
+  return "???";
+}
+
+// Returns a human-readable name for the anchor mode, or "???" for a value
+// that is neither anchored nor unanchored.
+static string FormatAnchor(Prog::Anchor anchor) {
+  if (anchor == Prog::kAnchored)
+    return "anchored";
+  if (anchor == Prog::kUnanchored)
+    return "unanchored";
+  return "???";
+}
+
+// A set of parse flags together with the label used for it in log output.
+struct ParseMode {
+  Regexp::ParseFlags parse_flags;
+  string desc;
+};
+
+// Base flag sets: Perl-like syntax, with and without Regexp::OneLine.
+static const Regexp::ParseFlags single_line =
+  Regexp::LikePerl;
+static const Regexp::ParseFlags multi_line =
+  static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
+
+// Every parse mode that Tester tries for each regexp.
+static ParseMode parse_modes[] = {
+  { single_line,                   "single-line"          },
+  { single_line|Regexp::Latin1,    "single-line, latin1"  },
+  { multi_line,                    "multiline"            },
+  { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
+  { multi_line|Regexp::Latin1,     "multiline, latin1"    },
+};
+
+// Returns the label of the parse_modes entry whose flags equal flags
+// exactly, or the flags printed in hex if there is no such entry.
+static string FormatMode(Regexp::ParseFlags flags) {
+  const ParseMode* const end = parse_modes + arraysize(parse_modes);
+  for (const ParseMode* mode = parse_modes; mode != end; ++mode) {
+    if (mode->parse_flags == flags)
+      return mode->desc;
+  }
+  return StringPrintf("%#x", static_cast<uint>(flags));
+}
+
+// Constructs and saves all the matching engines that
+// will be required for the given tests.
+// regexp_str is the pattern, kind the match kind every search will use,
+// and flags the parse flags.  On any failure the constructor logs the
+// problem, sets error_ and returns early, leaving the engines it did not
+// reach as NULL; the destructor still releases whatever was created.
+TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
+                           Regexp::ParseFlags flags)
+  : regexp_str_(regexp_str),
+    kind_(kind),
+    flags_(flags),
+    error_(false),
+    regexp_(NULL),
+    num_captures_(0),
+    prog_(NULL),
+    rprog_(NULL),
+    re_(NULL),
+    re2_(NULL) {
+
+  VLOG(1) << CEscape(regexp_str);
+
+  // Compile regexp to prog.
+  // Always required - needed for backtracking (reference implementation).
+  RegexpStatus status;
+  regexp_ = Regexp::Parse(regexp_str, flags, &status);
+  if (regexp_ == NULL) {
+    LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+              << " mode: " << FormatMode(flags);
+    error_ = true;
+    return;
+  }
+  num_captures_ = regexp_->NumCaptures();
+  prog_ = regexp_->CompileToProg(0);
+  if (prog_ == NULL) {
+    LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
+    error_ = true;
+    return;
+  }
+  if (FLAGS_dump_prog) {
+    LOG(INFO) << "Prog for "
+              << " regexp "
+              << CEscape(regexp_str_)
+              << " (" << FormatKind(kind_)
+              << ", " << FormatMode(flags_)
+              << ")\n"
+              << prog_->Dump();
+  }
+
+  // Compile regexp to reversed prog.  Only needed for DFA engines.
+  if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
+    rprog_ = regexp_->CompileToReverseProg(0);
+    if (rprog_ == NULL) {
+      LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
+      error_ = true;
+      return;
+    }
+    if (FLAGS_dump_rprog)
+      LOG(INFO) << rprog_->Dump();
+  }
+
+  // Create re string that will be used for RE and RE2.
+  string re = regexp_str.as_string();
+  // Accommodate flags by prefixing the equivalent inline modifiers.
+  // Regexp::Latin1 will be accommodated below.
+  if (!(flags & Regexp::OneLine))
+    re = "(?m)" + re;
+  if (flags & Regexp::NonGreedy)
+    re = "(?U)" + re;
+  if (flags & Regexp::DotNL)
+    re = "(?s)" + re;
+
+  // Compile regexp to RE2.
+  if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
+    RE2::Options options;
+    if (flags & Regexp::Latin1)
+      options.set_encoding(RE2::Options::EncodingLatin1);
+    if (kind_ == Prog::kLongestMatch)
+      options.set_longest_match(true);
+    re2_ = new RE2(re, options);
+    if (!re2_->error().empty()) {
+      LOG(INFO) << "Cannot RE2: " << CEscape(re);
+      error_ = true;
+      return;
+    }
+  }
+
+  // Compile regexp to RE.
+  // PCRE as exposed by the RE interface isn't always usable.
+  // 1. It disagrees about handling of empty-string repetitions
+  //    like matching (a*)* against "b".  PCRE treats the (a*) as
+  //    occurring once, while we treat it as occurring not at all.
+  // 2. It treats $ as this weird thing meaning end of string
+  //    or before the \n at the end of the string.
+  // 3. It doesn't implement POSIX leftmost-longest matching.
+  // MimicsPCRE() detects 1 and 2.
+  if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
+      kind_ != Prog::kLongestMatch) {
+    PCRE_Options o;
+    o.set_option(PCRE::UTF8);
+    // NOTE(review): presumably set_option replaces the UTF8 option set
+    // just above rather than adding to it -- confirm in util/pcre.h.
+    if (flags & Regexp::Latin1)
+      o.set_option(PCRE::None);
+    // PCRE has interface bug keeping us from finding $0, so
+    // add one more layer of parens.
+    re_ = new PCRE("("+re+")", o);
+    if (!re_->error().empty()) {
+      LOG(INFO) << "Cannot PCRE: " << CEscape(re);
+      error_ = true;
+      return;
+    }
+  }
+}
+
+// Releases everything the constructor created.  Members the constructor
+// never reached are still NULL, which is harmless for delete.
+TestInstance::~TestInstance() {
+  // The parsed regexp is reference counted rather than deleted.
+  if (regexp_ != NULL) {
+    regexp_->Decref();
+  }
+
+  delete prog_;
+  delete rprog_;
+  delete re_;
+  delete re2_;
+}
+
+// Runs a single search using the named engine type.
+// This interface hides all the irregularities of the various
+// engine interfaces from the rest of this file.
+//
+// orig_text is the text to search and orig_context the text surrounding
+// it.  *result is zeroed first and then filled in: skipped is set when
+// the engine cannot handle this case, matched holds the verdict, and
+// have_submatch / have_submatch0 say how much of submatch is meaningful.
+void TestInstance::RunSearch(Engine type,
+                             const StringPiece& orig_text,
+                             const StringPiece& orig_context,
+                             Prog::Anchor anchor,
+                             Result *result) {
+  memset(result, 0, sizeof *result);
+  if (regexp_ == NULL) {
+    // The regexp did not parse, so there is nothing to run.
+    result->skipped = true;
+    return;
+  }
+  int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
+  // result->submatch has room for only kMaxSubmatch entries.
+  if (nsubmatch > kMaxSubmatch)
+    nsubmatch = kMaxSubmatch;
+
+  StringPiece text = orig_text;
+  StringPiece context = orig_context;
+
+  switch (type) {
+    // NOTE(review): there is no break after LOG(FATAL); this relies on
+    // LOG(FATAL) not returning, otherwise control falls into the
+    // backtracking case below.
+    default:
+      LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+
+    case kEngineBacktrack:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched =
+        prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
+                                     result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    case kEngineNFA:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched =
+        prog_->SearchNFA(text, context, anchor, kind_,
+                        result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    // DFA asked only whether there is a match; no submatch is requested.
+    case kEngineDFA:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
+                                         &result->skipped, NULL);
+      break;
+
+    // DFA asked for submatch[0]: a forward search followed by a search
+    // with the reverse program over the piece the forward search reported.
+    case kEngineDFA1:
+      if (prog_ == NULL || rprog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched =
+        prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
+                         &result->skipped, NULL);
+      // If anchored, no need for second run,
+      // but do it anyway to find more bugs.
+      if (result->matched) {
+        if (!rprog_->SearchDFA(result->submatch[0], context,
+                               Prog::kAnchored, Prog::kLongestMatch,
+                               result->submatch,
+                               &result->skipped, NULL)) {
+          LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
+                     << " on " << CEscape(text);
+          result->matched = false;
+        }
+      }
+      result->have_submatch0 = true;
+      break;
+
+    // One-pass is tried only for anchored searches on programs that
+    // qualify and have few enough captures.
+    case kEngineOnePass:
+      if (prog_ == NULL ||
+          anchor == Prog::kUnanchored ||
+          !prog_->IsOnePass() ||
+          nsubmatch > Prog::kMaxOnePassCapture) {
+        result->skipped = true;
+        break;
+      }
+      result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
+                                      result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    case kEngineBitState:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched = prog_->SearchBitState(text, context, anchor, kind_,
+                                              result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    // All three RE2 variants currently run the same call.
+    case kEngineRE2:
+    case kEngineRE2a:
+    case kEngineRE2b: {
+      // Skipped unless text ends exactly where context ends.
+      if (!re2_ || text.end() != context.end()) {
+        result->skipped = true;
+        break;
+      }
+
+      // Translate the Prog anchoring into RE2's; a full match anchors
+      // at both ends regardless of anchor.
+      RE2::Anchor re_anchor;
+      if (anchor == Prog::kAnchored)
+        re_anchor = RE2::ANCHOR_START;
+      else
+        re_anchor = RE2::UNANCHORED;
+      if (kind_ == Prog::kFullMatch)
+        re_anchor = RE2::ANCHOR_BOTH;
+
+      // text is passed as a pair of offsets into context.
+      result->matched = re2_->Match(context,
+                                    text.begin() - context.begin(),
+                                    text.end() - context.begin(),
+                                    re_anchor, result->submatch, nsubmatch);
+      result->have_submatch = nsubmatch > 0;
+      break;
+    }
+
+    case kEnginePCRE: {
+      // Skipped unless text and context cover the same range.
+      if (!re_ || text.begin() != context.begin() ||
+          text.end() != context.end()) {
+        result->skipped = true;
+        break;
+      }
+
+      // Wrap each submatch slot in a PCRE::Arg and build the array of
+      // pointers that DoMatch takes.  Both arrays are freed on every
+      // path out of this case.
+      const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
+      PCRE::Arg *a = new PCRE::Arg[nsubmatch];
+      for (int i = 0; i < nsubmatch; i++) {
+        a[i] = PCRE::Arg(&result->submatch[i]);
+        argptr[i] = &a[i];
+      }
+      int consumed;
+      PCRE::Anchor pcre_anchor;
+      if (anchor == Prog::kAnchored)
+        pcre_anchor = PCRE::ANCHOR_START;
+      else
+        pcre_anchor = PCRE::UNANCHORED;
+      if (kind_ == Prog::kFullMatch)
+        pcre_anchor = PCRE::ANCHOR_BOTH;
+      re_->ClearHitLimit();
+      result->matched =
+        re_->DoMatch(text,
+                     pcre_anchor,
+                     &consumed,
+                     argptr, nsubmatch);
+      // A search that hit PCRE's limit gives an answer we cannot rely on.
+      if (re_->HitLimit()) {
+        result->untrusted = true;
+        delete[] argptr;
+        delete[] a;
+        break;
+      }
+      result->have_submatch = true;
+
+      // Work around RE interface bug: PCRE returns -1 as the
+      // offsets for an unmatched subexpression, and RE should
+      // turn that into StringPiece(NULL) but in fact it uses
+      // StringPiece(text.begin() - 1, 0).  Oops.
+      for (int i = 0; i < nsubmatch; i++)
+        if (result->submatch[i].begin() == text.begin() - 1)
+          result->submatch[i] = NULL;
+      delete[] argptr;
+      delete[] a;
+      break;
+    }
+  }
+
+  // Without a match, whatever an engine left in submatch is meaningless.
+  if (!result->matched)
+    memset(result->submatch, 0, sizeof result->submatch);
+}
+
+// Decides whether result r is consistent with the reference answer
+// correct.  A skipped result always passes.  Otherwise the match verdict
+// must agree, and r is checked only on the submatches it claims to have:
+// all of them with have_submatch, only submatch[0] with have_submatch0.
+static bool ResultOkay(const Result& r, const Result& correct) {
+  if (r.skipped)
+    return true;
+  if (r.matched != correct.matched)
+    return false;
+  if (!r.have_submatch && !r.have_submatch0)
+    return true;
+
+  const int ncompare = r.have_submatch ? kMaxSubmatch : 1;
+  for (int i = 0; i < ncompare; i++) {
+    const StringPiece& want = correct.submatch[i];
+    const StringPiece& got = r.submatch[i];
+    if (want.begin() != got.begin() || want.size() != got.size())
+      return false;
+  }
+  return true;
+}
+
+// Runs a single test: searches text (a piece of context) with every
+// enabled engine and compares each answer against the backtracking
+// engine's.  Mismatches are logged together with per-submatch details.
+// Returns true if no trusted engine disagreed with the reference.  Also
+// returns true, without running anything, when the regexp failed to parse.
+bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+                           Prog::Anchor anchor) {
+  // Backtracking is the gold standard.
+  Result correct;
+  RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+  if (correct.skipped) {
+    if (regexp_ == NULL)
+      return true;
+    LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+               << " " << FormatMode(flags_);
+    return false;
+  }
+  VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
+          << " text " << CEscape(text)
+          << " (" << FormatKind(kind_)
+          << ", " << FormatAnchor(anchor)
+          << ", " << FormatMode(flags_)
+          << ")";
+
+  // Result::submatch holds only kMaxSubmatch entries and RunSearch never
+  // fills in more than that, so the report below is capped the same way.
+  // Indexing up to 1+num_captures_ would read past the end of the array
+  // for regexps with more capture groups than that.
+  int nsubmatch = 1 + num_captures_;
+  if (nsubmatch > kMaxSubmatch)
+    nsubmatch = kMaxSubmatch;
+
+  // Compare the others.
+  bool all_okay = true;
+  for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
+    if (!(Engines() & (1<<i)))
+      continue;
+
+    Result r;
+    RunSearch(i, text, context, anchor, &r);
+    if (ResultOkay(r, correct)) {
+      if (FLAGS_log_okay)
+        LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
+      continue;
+    }
+
+    // We disagree with PCRE on the meaning of some Unicode matches.
+    // In particular, we treat all non-ASCII UTF-8 as word characters.
+    // We also treat "empty" character sets like [^\w\W] as being
+    // impossible to match, while PCRE apparently excludes some code
+    // points (e.g., 0x0080) from both \w and \W.
+    if (i == kEnginePCRE && NonASCII(text))
+      continue;
+
+    // An untrusted answer is logged but does not fail the case.
+    if (!r.untrusted)
+      all_okay = false;
+
+    LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
+             context, anchor);
+    if (r.matched != correct.matched) {
+      if (r.matched) {
+        LOG(INFO) << "   Should not match (but does).";
+      } else {
+        // Nothing matched, so there are no submatches to report.
+        LOG(INFO) << "   Should match (but does not).";
+        continue;
+      }
+    }
+    // j, not i: keep the submatch index distinct from the engine index.
+    for (int j = 0; j < nsubmatch; j++) {
+      if (r.submatch[j].begin() != correct.submatch[j].begin() ||
+          r.submatch[j].end() != correct.submatch[j].end()) {
+        LOG(INFO) <<
+          StringPrintf("   $%d: should be %s is %s",
+                       j,
+                       FormatCapture(text, correct.submatch[j]).c_str(),
+                       FormatCapture(text, r.submatch[j]).c_str());
+      } else {
+        LOG(INFO) <<
+          StringPrintf("   $%d: %s ok", j,
+                       FormatCapture(text, r.submatch[j]).c_str());
+      }
+    }
+  }
+
+  // Give up once the failure budget is used up; a budget that is zero or
+  // negative to begin with never triggers.
+  if (!all_okay) {
+    if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
+      LOG(QFATAL) << "Too many regexp failures.";
+  }
+
+  return all_okay;
+}
+
+// Logs one line describing a search: prefix, the engine name, the regexp
+// as given and as reprinted by the parser, the text with its offsets
+// inside context, the context, and the match kind, anchoring and mode.
+void TestInstance::LogMatch(const char* prefix, Engine e,
+                            const StringPiece& text, const StringPiece& context,
+                            Prog::Anchor anchor) {
+  const string settings = " (" + FormatKind(kind_) +
+                          ", " + FormatAnchor(anchor) +
+                          ", " + FormatMode(flags_) +
+                          ")";
+
+  LOG(INFO) << prefix
+            << EngineString(e)
+            << " regexp " << CEscape(regexp_str_)
+            << " " << CEscape(regexp_->ToString())
+            << " text " << CEscape(text)
+            << " (" << text.begin() - context.begin()
+            << "," << text.end() - context.begin()
+            << ") of context " << CEscape(context)
+            << settings;
+}
+
+// The match kinds Tester tries for every regexp.
+static Prog::MatchKind kinds[] = {
+  Prog::kFirstMatch,
+  Prog::kLongestMatch,
+  Prog::kFullMatch,
+};
+
+// Builds one TestInstance for each combination of match kind and parse
+// mode.  error() reports whether any of them failed to set up.
+Tester::Tester(const StringPiece& regexp)
+  : error_(false) {
+  for (int k = 0; k < arraysize(kinds); k++) {
+    for (int m = 0; m < arraysize(parse_modes); m++) {
+      TestInstance* instance =
+          new TestInstance(regexp, kinds[k], parse_modes[m].parse_flags);
+      if (instance->error())
+        error_ = true;
+      v_.push_back(instance);
+    }
+  }
+}
+
+// Deletes the TestInstances created by the constructor.
+Tester::~Tester() {
+  for (vector<TestInstance*>::iterator it = v_.begin(); it != v_.end(); ++it)
+    delete *it;
+}
+
+// Runs the case on every TestInstance.  Returns true only if none of them
+// had a setup error and every RunCase succeeded.  All instances are
+// visited even after a failure.
+bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
+                         Prog::Anchor anchor) {
+  bool okay = true;
+  for (int i = 0; i < v_.size(); i++) {
+    TestInstance* instance = v_[i];
+    // An instance with a setup error fails the case without being run.
+    if (instance->error() || !instance->RunCase(text, context, anchor))
+      okay = false;
+  }
+  return okay;
+}
+
+// The anchoring modes that TestInputInContext tries for every input.
+static Prog::Anchor anchors[] = {
+  Prog::kAnchored,
+  Prog::kUnanchored
+};
+
+// Tests text as its own context.  When text is not empty, also tests it
+// with its first byte removed and then with its last byte removed, each
+// time keeping the whole text as the context.
+// Returns true if every case passed; every case is run regardless.
+bool Tester::TestInput(const StringPiece& text) {
+  bool okay = TestInputInContext(text, text);
+  if (text.size() > 0) {
+    StringPiece without_first = text;
+    without_first.remove_prefix(1);
+    if (!TestInputInContext(without_first, text))
+      okay = false;
+
+    StringPiece without_last = text;
+    without_last.remove_suffix(1);
+    if (!TestInputInContext(without_last, text))
+      okay = false;
+  }
+  return okay;
+}
+
+// Runs TestCase(text, context, anchor) once for each entry of anchors.
+// Returns true if all of them passed; none is skipped after a failure.
+bool Tester::TestInputInContext(const StringPiece& text,
+                                const StringPiece& context) {
+  bool okay = true;
+  for (int a = 0; a < arraysize(anchors); a++) {
+    if (!TestCase(text, context, anchors[a]))
+      okay = false;
+  }
+  return okay;
+}
+
+// Convenience wrapper: builds a Tester for regexp and runs it on text.
+// Returns what Tester::TestInput returns.
+bool TestRegexpOnText(const StringPiece& regexp,
+                      const StringPiece& text) {
+  Tester tester(regexp);
+  const bool okay = tester.TestInput(text);
+  return okay;
+}
+
+}  // namespace re2
--- a/re2/re2/testing/tester.h
+++ b/re2/re2/testing/tester.h
@ -0,0 +1,121 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Comparative tester for regular expression matching.
+// Checks all implementations against each other.
+
+#ifndef RE2_TESTING_TESTER_H__
+#define RE2_TESTING_TESTER_H__
+
+#include "re2/stringpiece.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/re2.h"
+#include "util/pcre.h"
+
+namespace re2 {
+
+class Regexp;
+
+// All the supported regexp engines.
+// The values index engine_types in tester.cc and serve as bit positions
+// in the engine mask, so the two lists must stay in the same order.
+enum Engine {
+  kEngineBacktrack = 0,    // Prog::UnsafeSearchBacktrack (reference answer)
+  kEngineNFA,              // Prog::SearchNFA
+  kEngineDFA,              // Prog::SearchDFA, only ask whether it matched
+  kEngineDFA1,             // Prog::SearchDFA, ask for match[0]
+  kEngineOnePass,          // Prog::SearchOnePass, if applicable
+  kEngineBitState,         // Prog::SearchBitState
+  kEngineRE2,              // RE2, all submatches
+  kEngineRE2a,             // RE2, only ask for match[0]
+  kEngineRE2b,             // RE2, only ask whether it matched
+  kEnginePCRE,             // PCRE (util/pcre.h)
+
+  kEngineMax,              // Number of engines; not an engine itself.
+};
+
+// Make normal math on the enum preserve the type.
+// C++ defines no ++ for enum types, so provide a postfix ++ that steps e
+// to the next value.  Unlike the built-in operator it returns nothing.
+static inline void operator++(Engine& e, int unused) {
+  const int next = static_cast<int>(e) + 1;
+  e = static_cast<Engine>(next);
+}
+
+// Engine plus an int offset, yielding an Engine; the built-in e+i would
+// have type int.
+static inline Engine operator+(Engine e, int i) {
+  const int sum = static_cast<int>(e) + i;
+  return static_cast<Engine>(sum);
+}
+
+// A TestInstance caches per-regexp state for a given
+// regular expression in a given configuration
+// (UTF-8 vs Latin1, longest vs first match, etc.).
+class TestInstance {
+ public:
+  struct Result;
+
+  // Parses and compiles regexp for every enabled engine.  Check error()
+  // afterwards.  The characters regexp points at are not copied and must
+  // outlive this object.
+  TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
+               Regexp::ParseFlags flags);
+  ~TestInstance();
+  Regexp::ParseFlags flags() { return flags_; }
+  bool error() { return error_; }
+
+  // Runs a single test case: search in text, which is in context,
+  // using the given anchoring.
+  bool RunCase(const StringPiece& text, const StringPiece& context,
+               Prog::Anchor anchor);
+
+ private:
+  // Runs a single search using the named engine type.
+  void RunSearch(Engine type,
+                 const StringPiece& text, const StringPiece& context,
+                 Prog::Anchor anchor,
+                 Result *result);
+
+  // Logs a one-line description of a search, prefixed by prefix.
+  void LogMatch(const char* prefix, Engine e, const StringPiece& text,
+                const StringPiece& context, Prog::Anchor anchor);
+
+  // Held by value, not by reference: a reference member would bind to the
+  // constructor argument and dangle as soon as a caller passed a
+  // temporary StringPiece (for example one converted from a string).
+  const StringPiece regexp_str_;    // regexp being tested
+  Prog::MatchKind kind_;            // kind of match
+  Regexp::ParseFlags flags_;        // flags for parsing regexp_str_
+  bool error_;                      // error during constructor?
+
+  Regexp* regexp_;                  // parsed regexp
+  int num_captures_;                // regexp_->NumCaptures() cached
+  Prog* prog_;                      // compiled program
+  Prog* rprog_;                     // compiled reverse program
+  PCRE* re_;                        // PCRE implementation
+  RE2* re2_;                        // RE2 implementation
+
+  DISALLOW_EVIL_CONSTRUCTORS(TestInstance);
+};
+
+// A group of TestInstances for all possible configurations
+// (every match kind combined with every parse mode).
+class Tester {
+ public:
+  // Creates the TestInstances for regexp; check error() afterwards.
+  explicit Tester(const StringPiece& regexp);
+  ~Tester();
+
+  // Whether any TestInstance reported an error during construction.
+  bool error() { return error_; }
+
+  // Runs a single test case: search in text, which is in context,
+  // using the given anchoring.
+  bool TestCase(const StringPiece& text, const StringPiece& context,
+                Prog::Anchor anchor);
+
+  // Run TestCase(text, text, anchor) for all anchoring modes.
+  // For non-empty text, also runs text minus its first byte and text
+  // minus its last byte, each within the context of the whole text.
+  bool TestInput(const StringPiece& text);
+
+  // Run TestCase(text, context, anchor) for all anchoring modes.
+  bool TestInputInContext(const StringPiece& text, const StringPiece& context);
+
+ private:
+  bool error_;                 // Set if any instance failed to set up.
+  vector<TestInstance*> v_;    // Owned; deleted by the destructor.
+
+  DISALLOW_EVIL_CONSTRUCTORS(Tester);
+};
+
+// Run all possible tests using regexp and text.
+bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
+
+}  // namespace re2
+
+#endif  // RE2_TESTING_TESTER_H__
--- a/re2/re2/testing/unicode_test.py
+++ b/re2/re2/testing/unicode_test.py
@ -0,0 +1,207 @@
+#!/usr/bin/python2.4
+#
+# Copyright 2008 The RE2 Authors.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Unittest for the util/regexp/re2/unicode.py module."""
+
+import os
+import StringIO
+from google3.pyglib import flags
+from google3.testing.pybase import googletest
+from google3.util.regexp.re2 import unicode
+
+_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party",
+                            "unicode", "ucd-5.1.0")
+
+
+class ConvertTest(googletest.TestCase):
+  """Test the conversion functions."""
+
+  def testUInt(self):
+    self.assertEquals(0x0000, unicode._UInt("0000"))
+    self.assertEquals(0x263A, unicode._UInt("263A"))
+    self.assertEquals(0x10FFFF, unicode._UInt("10FFFF"))
+    self.assertRaises(unicode.InputError, unicode._UInt, "263")
+    self.assertRaises(unicode.InputError, unicode._UInt, "263AAAA")
+    self.assertRaises(unicode.InputError, unicode._UInt, "110000")
+
+  def testURange(self):
+    self.assertEquals([1, 2, 3], unicode._URange("0001..0003"))
+    self.assertEquals([1], unicode._URange("0001"))
+    self.assertRaises(unicode.InputError, unicode._URange, "0001..0003..0005")
+    self.assertRaises(unicode.InputError, unicode._URange, "0003..0001")
+    self.assertRaises(unicode.InputError, unicode._URange, "0001..0001")
+
+  def testUStr(self):
+    self.assertEquals("0x263A", unicode._UStr(0x263a))
+    self.assertEquals("0x10FFFF", unicode._UStr(0x10FFFF))
+    self.assertRaises(unicode.InputError, unicode._UStr, 0x110000)
+    self.assertRaises(unicode.InputError, unicode._UStr, -1)
+
+
+_UNICODE_TABLE = """# Commented line, should be ignored.
+# The next line is blank and should be ignored.
+
+0041;Capital A;Line 1
+0061..007A;Lowercase;Line 2
+1F00;<Greek, First>;Ignored
+1FFE;<Greek, Last>;Line 3
+10FFFF;Runemax;Line 4
+0000;Zero;Line 5
+"""
+
+_BAD_TABLE1 = """
+111111;Not a code point;
+"""
+
+_BAD_TABLE2 = """
+0000;<Zero, First>;Missing <Zero, Last>
+"""
+
+_BAD_TABLE3 = """
+0010..0001;Bad range;
+"""
+
+
+class AbortError(Exception):
+  """Function should not have been called."""
+
+
+def Abort():
+  raise AbortError("Abort")
+
+
+def StringTable(s, n, f):
+  unicode.ReadUnicodeTable(StringIO.StringIO(s), n, f)
+
+
+class ReadUnicodeTableTest(googletest.TestCase):
+  """Test the ReadUnicodeTable function."""
+
+  def testSimpleTable(self):
+
+    ncall = [0]  # can't assign to ordinary int in DoLine
+
+    def DoLine(codes, fields):
+      self.assertEquals(3, len(fields))
+      ncall[0] += 1
+      self.assertEquals("Line %d" % (ncall[0],), fields[2])
+      if ncall[0] == 1:
+        self.assertEquals([0x0041], codes)
+        self.assertEquals("0041", fields[0])
+        self.assertEquals("Capital A", fields[1])
+      elif ncall[0] == 2:
+        self.assertEquals(range(0x0061, 0x007A + 1), codes)
+        self.assertEquals("0061..007A", fields[0])
+        self.assertEquals("Lowercase", fields[1])
+      elif ncall[0] == 3:
+        self.assertEquals(range(0x1F00, 0x1FFE + 1), codes)
+        self.assertEquals("1F00..1FFE", fields[0])
+        self.assertEquals("Greek", fields[1])
+      elif ncall[0] == 4:
+        self.assertEquals([0x10FFFF], codes)
+        self.assertEquals("10FFFF", fields[0])
+        self.assertEquals("Runemax", fields[1])
+      elif ncall[0] == 5:
+        self.assertEquals([0x0000], codes)
+        self.assertEquals("0000", fields[0])
+        self.assertEquals("Zero", fields[1])
+
+    StringTable(_UNICODE_TABLE, 3, DoLine)
+    self.assertEquals(5, ncall[0])
+
+  def testErrorTables(self):
+    self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 4, Abort)
+    self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 2, Abort)
+    self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE1, 3, Abort)
+    self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE2, 3, Abort)
+    self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE3, 3, Abort)
+
+
+class ParseContinueTest(googletest.TestCase):
+  """Test the ParseContinue function."""
+
+  def testParseContinue(self):
+    self.assertEquals(("Private Use", "First"),
+                      unicode._ParseContinue("<Private Use, First>"))
+    self.assertEquals(("Private Use", "Last"),
+                      unicode._ParseContinue("<Private Use, Last>"))
+    self.assertEquals(("<Private Use, Blah>", None),
+                      unicode._ParseContinue("<Private Use, Blah>"))
+
+
+class CaseGroupsTest(googletest.TestCase):
+  """Test the CaseGroups function (and the CaseFoldingReader)."""
+
+  def FindGroup(self, c):
+    if type(c) == str:
+      c = ord(c)
+    for g in self.groups:
+      if c in g:
+        return g
+    return None
+
+  def testCaseGroups(self):
+    self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR)
+    self.assertEquals([ord("A"), ord("a")], self.FindGroup("a"))
+    self.assertEquals(None, self.FindGroup("0"))
+
+
+class ScriptsTest(googletest.TestCase):
+  """Test the Scripts function (and the ScriptsReader)."""
+
+  def FindScript(self, c):
+    if type(c) == str:
+      c = ord(c)
+    for script, codes in self.scripts.items():
+      for code in codes:
+        if c == code:
+          return script
+    return None
+
+  def testScripts(self):
+    self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR)
+    self.assertEquals("Latin", self.FindScript("a"))
+    self.assertEquals("Common", self.FindScript("0"))
+    self.assertEquals(None, self.FindScript(0xFFFE))
+
+
+class CategoriesTest(googletest.TestCase):
+  """Test the Categories function (and the UnicodeDataReader)."""
+
+  def FindCategory(self, c):
+    if type(c) == str:
+      c = ord(c)
+    short = None
+    for category, codes in self.categories.items():
+      for code in codes:
+        if code == c:
+          # prefer category Nd over N
+          if len(category) > 1:
+            return category
+          if short == None:
+            short = category
+    return short
+
+  def testCategories(self):
+    self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR)
+    self.assertEquals("Ll", self.FindCategory("a"))
+    self.assertEquals("Nd", self.FindCategory("0"))
+    self.assertEquals("Lo", self.FindCategory(0xAD00))  # in First, Last range
+    self.assertEquals(None, self.FindCategory(0xFFFE))
+    self.assertEquals("Lo", self.FindCategory(0x8B5A))
+    self.assertEquals("Lo", self.FindCategory(0x6C38))
+    self.assertEquals("Lo", self.FindCategory(0x92D2))
+    self.assertTrue(ord("a") in self.categories["L"])
+    self.assertTrue(ord("0") in self.categories["N"])
+    self.assertTrue(0x8B5A in self.categories["L"])
+    self.assertTrue(0x6C38 in self.categories["L"])
+    self.assertTrue(0x92D2 in self.categories["L"])
+
+def main():
+  """Hands control to googletest.main()."""
+  googletest.main()
+
+if __name__ == "__main__":
+  main()
--- a/re2/re2/tostring.cc
+++ b/re2/re2/tostring.cc
@ -0,0 +1,341 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Format a regular expression structure as a string.
+// Tested by parse_test.cc
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Context precedence values handed down the walk.  Values are listed
+// from lowest to highest; PreVisit/PostVisit bracket a node with "(?:"
+// and ")" when the precedence received from the parent is lower than
+// the node's own (for example, prec < PrecConcat for a concatenation).
+enum {
+  PrecAtom,       // passed to the operand of *, +, ? and {n,m}
+  PrecUnary,      // compared against by the repetition operators
+  PrecConcat,     // passed to the children of a concatenation
+  PrecAlternate,  // passed to the children of an alternation
+  PrecEmpty,      // below this, an empty match is spelled "(?:)"
+  PrecParen,      // passed to the child of a capturing group
+  PrecToplevel,   // passed for the root by Regexp::ToString
+};
+
+// Helper function.  See description below.
+static void AppendCCRange(string* t, Rune lo, Rune hi);
+
+// Walker to generate string in s_.
+// The arg pointers are actually integers giving the
+// context precedence.
+// The child_args are always NULL.
+//
+// NOTE(review): the comment above predates the current code.  The
+// output string is the member t_, and the walker is instantiated as
+// Walker<int>, so the arguments are plain ints (Prec* values) and
+// child_args points at ints.
+class ToStringWalker : public Regexp::Walker<int> {
+ public:
+  // t is the string that the walk appends to; it is not owned.
+  explicit ToStringWalker(string* t) : t_(t) {}
+
+  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
+  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
+                        int* child_args, int nchild_args);
+  // Appends nothing for the node and returns 0.
+  virtual int ShortVisit(Regexp* re, int parent_arg) {
+    return 0;
+  }
+
+ private:
+  string* t_;  // The string the walker appends to.
+
+  DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
+};
+
+// Renders this regexp as a string by walking it with a ToStringWalker,
+// starting at top-level precedence with a walk budget of 100000.
+// If the walker reports that it stopped early, " [truncated]" is
+// appended to whatever was produced.
+string Regexp::ToString() {
+  string out;
+  ToStringWalker walker(&out);
+  walker.WalkExponential(this, PrecToplevel, 100000);
+  if (walker.stopped_early()) {
+    out += " [truncated]";
+  }
+  return out;
+}
+
+#define ToString DontCallToString  // Avoid accidental recursion.
+
+// Called on re before any of its children are walked.
+// Opens whatever bracketing re needs: "(?:" when the precedence handed
+// down in parent_arg is lower than re's own, or "(" (followed by
+// "?P<name>" for a named group) for a capture.  Returns the precedence
+// that is passed on to re's children.  stop is never written.
+int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
+  const int outer = parent_arg;
+
+  switch (re->op()) {
+    case kRegexpConcat:
+    case kRegexpLiteralString:
+      if (outer < PrecConcat)
+        t_->append("(?:");
+      return PrecConcat;
+
+    case kRegexpAlternate:
+      if (outer < PrecAlternate)
+        t_->append("(?:");
+      return PrecAlternate;
+
+    case kRegexpCapture:
+      t_->append("(");
+      if (re->name()) {
+        t_->append("?P<");
+        t_->append(*re->name());
+        t_->append(">");
+      }
+      return PrecParen;
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+    case kRegexpRepeat:
+      if (outer < PrecUnary)
+        t_->append("(?:");
+      // The operand is formatted at PrecAtom rather than PrecUnary
+      // because PCRE treats two unary ops in a row as a parse error.
+      return PrecAtom;
+
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpCharClass:
+    case kRegexpHaveMatch:
+      // Childless nodes: nothing to open here.
+      break;
+  }
+
+  return PrecAtom;
+}
+
+// Appends the literal rune r to *t.
+//  - An ASCII regexp metacharacter is written with a leading backslash.
+//  - With foldcase set, a lower-case ASCII letter is written as the
+//    two-character class [Xx].
+//  - Anything else is written by AppendCCRange(t, r, r).
+static void AppendLiteral(string *t, Rune r, bool foldcase) {
+  const bool is_meta =
+      r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r) != NULL;
+  if (is_meta) {
+    t->append(1, '\\');
+    t->append(1, r);
+    return;
+  }
+
+  if (foldcase && 'a' <= r && r <= 'z') {
+    const Rune upper = r + ('A' - 'a');
+    t->append(1, '[');
+    t->append(1, upper);
+    t->append(1, upper + 'a' - 'A');
+    t->append(1, ']');
+    return;
+  }
+
+  AppendCCRange(t, r, r);
+}
+
+// Visits re after children are processed.
+// For childless regexps, all the work is done here.
+// For regexps with children, append any unary suffixes or ).
+//
+// parent_arg is the precedence of the enclosing context; it decides
+// whether the bracket opened in PreVisit must be closed and whether a
+// trailing "|" is owed to a parent alternation.  pre_arg, child_args
+// and nchild_args are not used.  Always returns 0.
+int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
+                              int* child_args, int nchild_args) {
+  int prec = parent_arg;
+  switch (re->op()) {
+    case kRegexpNoMatch:
+      // There's no simple symbol for "no match", but
+      // [^0-Runemax] excludes everything.
+      t_->append("[^\\x00-\\x{10ffff}]");
+      break;
+
+    case kRegexpEmptyMatch:
+      // Append (?:) to make empty string visible,
+      // unless this is already being parenthesized.
+      if (prec < PrecEmpty)
+        t_->append("(?:)");
+      break;
+
+    case kRegexpLiteral:
+      AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
+      break;
+
+    case kRegexpLiteralString:
+      for (int i = 0; i < re->nrunes(); i++)
+        AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
+      if (prec < PrecConcat)
+        t_->append(")");
+      break;
+
+    case kRegexpConcat:
+      if (prec < PrecConcat)
+        t_->append(")");
+      break;
+
+    case kRegexpAlternate:
+      // Clumsy but workable: the children all appended |
+      // at the end of their strings, so just remove the last one.
+      // The emptiness check keeps size()-1 from wrapping around when
+      // nothing has been appended at all.
+      if (!t_->empty() && (*t_)[t_->size()-1] == '|')
+        t_->erase(t_->size()-1);
+      else
+        LOG(DFATAL) << "Bad final char: " << *t_;  // the text, not the pointer
+      if (prec < PrecAlternate)
+        t_->append(")");
+      break;
+
+    case kRegexpStar:
+      t_->append("*");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpPlus:
+      t_->append("+");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpQuest:
+      t_->append("?");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpRepeat:
+      // max() == -1 is written as an open-ended {min,}.
+      if (re->max() == -1)
+        t_->append(StringPrintf("{%d,}", re->min()));
+      else if (re->min() == re->max())
+        t_->append(StringPrintf("{%d}", re->min()));
+      else
+        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpAnyChar:
+      t_->append(".");
+      break;
+
+    case kRegexpAnyByte:
+      t_->append("\\C");
+      break;
+
+    case kRegexpBeginLine:
+      t_->append("^");
+      break;
+
+    case kRegexpEndLine:
+      t_->append("$");
+      break;
+
+    case kRegexpBeginText:
+      t_->append("(?-m:^)");
+      break;
+
+    case kRegexpEndText:
+      if (re->parse_flags() & Regexp::WasDollar)
+        t_->append("(?-m:$)");
+      else
+        t_->append("\\z");
+      break;
+
+    case kRegexpWordBoundary:
+      t_->append("\\b");
+      break;
+
+    case kRegexpNoWordBoundary:
+      t_->append("\\B");
+      break;
+
+    case kRegexpCharClass: {
+      if (re->cc()->size() == 0) {
+        t_->append("[^\\x00-\\x{10ffff}]");
+        break;
+      }
+      t_->append("[");
+      // Heuristic: show class as negated if it contains the
+      // non-character 0xFFFE.
+      CharClass* cc = re->cc();
+      if (cc->Contains(0xFFFE)) {
+        cc = cc->Negate();
+        t_->append("^");
+      }
+      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
+        AppendCCRange(t_, i->lo, i->hi);
+      // Release the negated copy if one was made above.
+      if (cc != re->cc())
+        cc->Delete();
+      t_->append("]");
+      break;
+    }
+
+    case kRegexpCapture:
+      t_->append(")");
+      break;
+
+    case kRegexpHaveMatch:
+      // There's no syntax accepted by the parser to generate
+      // this node (it is generated by RE2::Set) so make something
+      // up that is readable but won't compile.
+      // string::append(const char*, size_t) does no formatting: it
+      // would copy match_id() raw characters of the literal.  Format
+      // the id with StringAppendF instead.
+      StringAppendF(t_, "(?HaveMatch:%d)", re->match_id());
+      break;
+  }
+
+  // If the parent is an alternation, append the | for it.
+  if (prec == PrecAlternate)
+    t_->append("|");
+
+  return 0;
+}
+
+// Appends the rune r to *t in the form used inside a character class:
+//  - printable ASCII (0x20-0x7E) as itself, backslash-escaped if it is
+//    one of [ ] ^ - or backslash;
+//  - \r, \t, \n and \f as those two-character escapes;
+//  - any other value below 0x100 as \xNN, and the rest as \x{N...}.
+static void AppendCCChar(string* t, Rune r) {
+  const bool printable = 0x20 <= r && r <= 0x7E;
+  if (printable) {
+    if (strchr("[]^-\\", r) != NULL)
+      t->append("\\");
+    t->append(1, r);
+    return;
+  }
+
+  // Control characters that have a conventional escape.
+  const char* escape = NULL;
+  if (r == '\r')
+    escape = "\\r";
+  else if (r == '\t')
+    escape = "\\t";
+  else if (r == '\n')
+    escape = "\\n";
+  else if (r == '\f')
+    escape = "\\f";
+  if (escape != NULL) {
+    t->append(escape);
+    return;
+  }
+
+  // Everything else is written in hex.
+  if (r < 0x100)
+    StringAppendF(t, "\\x%02x", static_cast<int>(r));
+  else
+    StringAppendF(t, "\\x{%x}", static_cast<int>(r));
+}
+
+// Appends the character-class range lo-hi to *t.  A single-rune range
+// is written as just that rune; nothing is written when lo > hi.
+static void AppendCCRange(string* t, Rune lo, Rune hi) {
+  if (lo <= hi) {
+    AppendCCChar(t, lo);
+    if (lo != hi) {
+      t->append("-");
+      AppendCCChar(t, hi);
+    }
+  }
+}
+
+}  // namespace re2
--- a/re2/re2/unicode.py
+++ b/re2/re2/unicode.py
@ -0,0 +1,297 @@
+# Copyright 2008 The RE2 Authors.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Parser for Unicode data files (as distributed by unicode.org)."""
+
+import os
+import re
+import urllib2
+
+# Directory or URL where Unicode tables reside.
+_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"
+
+# Largest valid Unicode code value.
+_RUNE_MAX = 0x10FFFF
+
+
+class Error(Exception):
+  """Unicode error base class.  InputError derives from it."""
+
+
+class InputError(Error):
+  """Unicode input error class.  Raised on invalid input.
+
+  Raised by the conversion helpers and by ReadUnicodeTable for
+  malformed code values, ranges, field counts and First/Last pairs.
+  """
+
+
+def _UInt(s):
+  """Converts string to Unicode code point ('263A' => 0x263a).
+
+  Args:
+    s: string to convert
+
+  Returns:
+    Unicode code point
+
+  Raises:
+    InputError: the string is not a valid Unicode value.
+  """
+
+  try:
+    v = int(s, 16)
+  except ValueError:
+    v = -1
+  if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
+    raise InputError("invalid Unicode value %s" % (s,))
+  return v
+
+
+def _URange(s):
+  """Converts string to Unicode range.
+
+    '0001..0003' => [1, 2, 3].
+    '0001' => [1].
+
+  Args:
+    s: string to convert
+
+  Returns:
+    Unicode range
+
+  Raises:
+    InputError: the string is not a valid Unicode range.
+  """
+  a = s.split("..")
+  if len(a) == 1:
+    return [_UInt(a[0])]
+  if len(a) == 2:
+    lo = _UInt(a[0])
+    hi = _UInt(a[1])
+    if lo < hi:
+      return range(lo, hi + 1)
+  raise InputError("invalid Unicode range %s" % (s,))
+
+
+def _UStr(v):
+  """Converts Unicode code point to hex string.
+
+    0x263a => '0x263A'.
+
+  Args:
+    v: code point to convert
+
+  Returns:
+    Unicode string
+
+  Raises:
+    InputError: the argument is not a valid Unicode value.
+  """
+  if v < 0 or v > _RUNE_MAX:
+    raise InputError("invalid Unicode value %s" % (v,))
+  return "0x%04X" % (v,)
+
+
+def _ParseContinue(s):
+  """Parses a Unicode continuation field.
+
+  These are of the form '<Name, First>' or '<Name, Last>'.
+  Instead of giving an explicit range in a single table entry,
+  some Unicode tables use two entries, one for the first
+  code value in the range and one for the last.
+  The first entry's description is '<Name, First>' instead of 'Name'
+  and the second is '<Name, Last>'.
+
+    '<Name, First>' => ('Name', 'First')
+    '<Name, Last>' => ('Name', 'Last')
+    'Anything else' => ('Anything else', None)
+
+  Args:
+    s: continuation field string
+
+  Returns:
+    pair: name and ('First', 'Last', or None)
+  """
+
+  match = re.match("<(.*), (First|Last)>", s)
+  if match is not None:
+    return match.groups()
+  return (s, None)
+
+
+def ReadUnicodeTable(filename, nfields, doline):
+  """Generic Unicode table text file reader.
+
+  The reader takes care of stripping out comments and also
+  parsing the two different ways that the Unicode tables specify
+  code ranges (using the .. notation and splitting the range across
+  multiple lines).
+
+  Each non-comment line in the table is expected to have the given
+  number of fields.  The first field is known to be the Unicode value
+  and the second field its description.
+
+  The reader calls doline(codes, fields) for each entry in the table.
+  If fn raises an exception, the reader prints that exception,
+  prefixed with the file name and line number, and continues
+  processing the file.  When done with the file, the reader re-raises
+  the first exception encountered during the file.
+
+  Arguments:
+    filename: the Unicode data file to read, or a file-like object.
+    nfields: the number of expected fields per line in that file.
+    doline: the function to call for each table entry.
+
+  Raises:
+    InputError: nfields is invalid (must be >= 2).
+  """
+
+  if nfields < 2:
+    raise InputError("invalid number of fields %d" % (nfields,))
+
+  if type(filename) == str:
+    if filename.startswith("http://"):
+      fil = urllib2.urlopen(filename)
+    else:
+      fil = open(filename, "r")
+  else:
+    fil = filename
+
+  first = None        # first code in multiline range
+  expect_last = None  # tag expected for "Last" line in multiline range
+  lineno = 0          # current line number
+  for line in fil:
+    lineno += 1
+    try:
+      # Chop # comments and white space; ignore empty lines.
+      sharp = line.find("#")
+      if sharp >= 0:
+        line = line[:sharp]
+      line = line.strip()
+      if not line:
+        continue
+
+      # Split fields on ";", chop more white space.
+      # Must have the expected number of fields.
+      fields = [s.strip() for s in line.split(";")]
+      if len(fields) != nfields:
+        raise InputError("wrong number of fields %d %d - %s" %
+                         (len(fields), nfields, line))
+
+      # The Unicode text files have two different ways
+      # to list a Unicode range.  Either the first field is
+      # itself a range (0000..FFFF), or the range is split
+      # across two lines, with the second field noting
+      # the continuation.
+      codes = _URange(fields[0])
+      (name, cont) = _ParseContinue(fields[1])
+
+      if expect_last is not None:
+        # If the last line gave the First code in a range,
+        # this one had better give the Last one.
+        if (len(codes) != 1 or codes[0] <= first or
+            cont != "Last" or name != expect_last):
+          raise InputError("expected Last line for %s" %
+                           (expect_last,))
+        codes = range(first, codes[0] + 1)
+        first = None
+        expect_last = None
+        fields[0] = "%04X..%04X" % (codes[0], codes[-1])
+        fields[1] = name
+      elif cont == "First":
+        # Otherwise, if this is the First code in a range,
+        # remember it and go to the next line.
+        if len(codes) != 1:
+          raise InputError("bad First line: range given")
+        expect_last = name
+        first = codes[0]
+        continue
+
+      doline(codes, fields)
+
+    except Exception, e:
+      print "%s:%d: %s" % (filename, lineno, e)
+      raise
+
+  if expect_last is not None:
+    raise InputError("expected Last line for %s; got EOF" %
+                     (expect_last,))
+
+
+def CaseGroups(unicode_dir=_UNICODE_DIR):
+  """Returns the Unicode code groups equivalent under case folding.
+
+  Reads CaseFolding.txt from unicode_dir and keeps only the entries
+  whose fold type is "C" or "S".
+
+  Each group is a sorted list of code points,
+  and the list of groups is sorted by first code point
+  in the group.
+
+  Args:
+    unicode_dir: Unicode data directory
+
+  Returns:
+    pair (togroup, groups): togroup is a dict mapping the folded
+    (lowercase) code point to its group, and groups is the sorted list
+    of those same group lists.
+  """
+
+  # Dict mapping lowercase code point to fold-equivalent group.
+  togroup = {}
+
+  def DoLine(codes, fields):
+    """Process single CaseFolding.txt line, updating togroup."""
+    (_, foldtype, lower, _) = fields
+    if foldtype not in ("C", "S"):
+      return
+    lower = _UInt(lower)
+    # A new group starts out holding the folded code point itself.
+    togroup.setdefault(lower, [lower]).extend(codes)
+
+  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)
+
+  groups = togroup.values()
+  # Sorting in place also sorts the lists reachable through togroup.
+  for g in groups:
+    g.sort()
+  groups.sort()
+  return togroup, groups
+
+
+def Scripts(unicode_dir=_UNICODE_DIR):
+  """Returns dict mapping script names to code lists.
+
+  Args:
+    unicode_dir: Unicode data directory
+
+  Returns:
+    dict mapping script names to code lists
+  """
+
+  scripts = {}
+
+  def DoLine(codes, fields):
+    """Process single Scripts.txt line, updating scripts."""
+    (_, name) = fields
+    scripts.setdefault(name, []).extend(codes)
+
+  ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
+  return scripts
+
+
+def Categories(unicode_dir=_UNICODE_DIR):
+  """Returns dict mapping category names to code lists.
+
+  Args:
+    unicode_dir: Unicode data directory
+
+  Returns:
+    dict mapping category names to code lists
+  """
+
+  categories = {}
+
+  def DoLine(codes, fields):
+    """Process single UnicodeData.txt line, updating categories."""
+    category = fields[2]
+    categories.setdefault(category, []).extend(codes)
+    # Add codes from Lu into L, etc.
+    if len(category) > 1:
+      short = category[0]
+      categories.setdefault(short, []).extend(codes)
+
+  ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
+  return categories
+
--- a/re2/re2/unicode_casefold.cc
+++ b/re2/re2/unicode_casefold.cc
@ -0,0 +1,469 @@
+
+// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
+// make_unicode_casefold.py >unicode_casefold.cc
+
+#include "re2/unicode_casefold.h"
+
+namespace re2 {
+
+
+// 1029 groups, 2079 pairs, 282 ranges
+CaseFold unicode_casefold[] = {
+	{ 65, 90, 32 },
+	{ 97, 106, -32 },
+	{ 107, 107, 8383 },
+	{ 108, 114, -32 },
+	{ 115, 115, 268 },
+	{ 116, 122, -32 },
+	{ 181, 181, 743 },
+	{ 192, 214, 32 },
+	{ 216, 222, 32 },
+	{ 223, 223, 7615 },
+	{ 224, 228, -32 },
+	{ 229, 229, 8262 },
+	{ 230, 246, -32 },
+	{ 248, 254, -32 },
+	{ 255, 255, 121 },
+	{ 256, 303, EvenOdd },
+	{ 306, 311, EvenOdd },
+	{ 313, 328, OddEven },
+	{ 330, 375, EvenOdd },
+	{ 376, 376, -121 },
+	{ 377, 382, OddEven },
+	{ 383, 383, -300 },
+	{ 384, 384, 195 },
+	{ 385, 385, 210 },
+	{ 386, 389, EvenOdd },
+	{ 390, 390, 206 },
+	{ 391, 392, OddEven },
+	{ 393, 394, 205 },
+	{ 395, 396, OddEven },
+	{ 398, 398, 79 },
+	{ 399, 399, 202 },
+	{ 400, 400, 203 },
+	{ 401, 402, OddEven },
+	{ 403, 403, 205 },
+	{ 404, 404, 207 },
+	{ 405, 405, 97 },
+	{ 406, 406, 211 },
+	{ 407, 407, 209 },
+	{ 408, 409, EvenOdd },
+	{ 410, 410, 163 },
+	{ 412, 412, 211 },
+	{ 413, 413, 213 },
+	{ 414, 414, 130 },
+	{ 415, 415, 214 },
+	{ 416, 421, EvenOdd },
+	{ 422, 422, 218 },
+	{ 423, 424, OddEven },
+	{ 425, 425, 218 },
+	{ 428, 429, EvenOdd },
+	{ 430, 430, 218 },
+	{ 431, 432, OddEven },
+	{ 433, 434, 217 },
+	{ 435, 438, OddEven },
+	{ 439, 439, 219 },
+	{ 440, 441, EvenOdd },
+	{ 444, 445, EvenOdd },
+	{ 447, 447, 56 },
+	{ 452, 452, EvenOdd },
+	{ 453, 453, OddEven },
+	{ 454, 454, -2 },
+	{ 455, 455, OddEven },
+	{ 456, 456, EvenOdd },
+	{ 457, 457, -2 },
+	{ 458, 458, EvenOdd },
+	{ 459, 459, OddEven },
+	{ 460, 460, -2 },
+	{ 461, 476, OddEven },
+	{ 477, 477, -79 },
+	{ 478, 495, EvenOdd },
+	{ 497, 497, OddEven },
+	{ 498, 498, EvenOdd },
+	{ 499, 499, -2 },
+	{ 500, 501, EvenOdd },
+	{ 502, 502, -97 },
+	{ 503, 503, -56 },
+	{ 504, 543, EvenOdd },
+	{ 544, 544, -130 },
+	{ 546, 563, EvenOdd },
+	{ 570, 570, 10795 },
+	{ 571, 572, OddEven },
+	{ 573, 573, -163 },
+	{ 574, 574, 10792 },
+	{ 575, 576, 10815 },
+	{ 577, 578, OddEven },
+	{ 579, 579, -195 },
+	{ 580, 580, 69 },
+	{ 581, 581, 71 },
+	{ 582, 591, EvenOdd },
+	{ 592, 592, 10783 },
+	{ 593, 593, 10780 },
+	{ 594, 594, 10782 },
+	{ 595, 595, -210 },
+	{ 596, 596, -206 },
+	{ 598, 599, -205 },
+	{ 601, 601, -202 },
+	{ 603, 603, -203 },
+	{ 608, 608, -205 },
+	{ 611, 611, -207 },
+	{ 613, 613, 42280 },
+	{ 616, 616, -209 },
+	{ 617, 617, -211 },
+	{ 619, 619, 10743 },
+	{ 623, 623, -211 },
+	{ 625, 625, 10749 },
+	{ 626, 626, -213 },
+	{ 629, 629, -214 },
+	{ 637, 637, 10727 },
+	{ 640, 640, -218 },
+	{ 643, 643, -218 },
+	{ 648, 648, -218 },
+	{ 649, 649, -69 },
+	{ 650, 651, -217 },
+	{ 652, 652, -71 },
+	{ 658, 658, -219 },
+	{ 837, 837, 84 },
+	{ 880, 883, EvenOdd },
+	{ 886, 887, EvenOdd },
+	{ 891, 893, 130 },
+	{ 902, 902, 38 },
+	{ 904, 906, 37 },
+	{ 908, 908, 64 },
+	{ 910, 911, 63 },
+	{ 913, 929, 32 },
+	{ 931, 931, 31 },
+	{ 932, 939, 32 },
+	{ 940, 940, -38 },
+	{ 941, 943, -37 },
+	{ 945, 945, -32 },
+	{ 946, 946, 30 },
+	{ 947, 948, -32 },
+	{ 949, 949, 64 },
+	{ 950, 951, -32 },
+	{ 952, 952, 25 },
+	{ 953, 953, 7173 },
+	{ 954, 954, 54 },
+	{ 955, 955, -32 },
+	{ 956, 956, -775 },
+	{ 957, 959, -32 },
+	{ 960, 960, 22 },
+	{ 961, 961, 48 },
+	{ 962, 962, EvenOdd },
+	{ 963, 965, -32 },
+	{ 966, 966, 15 },
+	{ 967, 968, -32 },
+	{ 969, 969, 7517 },
+	{ 970, 971, -32 },
+	{ 972, 972, -64 },
+	{ 973, 974, -63 },
+	{ 975, 975, 8 },
+	{ 976, 976, -62 },
+	{ 977, 977, 35 },
+	{ 981, 981, -47 },
+	{ 982, 982, -54 },
+	{ 983, 983, -8 },
+	{ 984, 1007, EvenOdd },
+	{ 1008, 1008, -86 },
+	{ 1009, 1009, -80 },
+	{ 1010, 1010, 7 },
+	{ 1012, 1012, -92 },
+	{ 1013, 1013, -96 },
+	{ 1015, 1016, OddEven },
+	{ 1017, 1017, -7 },
+	{ 1018, 1019, EvenOdd },
+	{ 1021, 1023, -130 },
+	{ 1024, 1039, 80 },
+	{ 1040, 1071, 32 },
+	{ 1072, 1103, -32 },
+	{ 1104, 1119, -80 },
+	{ 1120, 1153, EvenOdd },
+	{ 1162, 1215, EvenOdd },
+	{ 1216, 1216, 15 },
+	{ 1217, 1230, OddEven },
+	{ 1231, 1231, -15 },
+	{ 1232, 1319, EvenOdd },
+	{ 1329, 1366, 48 },
+	{ 1377, 1414, -48 },
+	{ 4256, 4293, 7264 },
+	{ 7545, 7545, 35332 },
+	{ 7549, 7549, 3814 },
+	{ 7680, 7776, EvenOdd },
+	{ 7777, 7777, 58 },
+	{ 7778, 7829, EvenOdd },
+	{ 7835, 7835, -59 },
+	{ 7838, 7838, -7615 },
+	{ 7840, 7935, EvenOdd },
+	{ 7936, 7943, 8 },
+	{ 7944, 7951, -8 },
+	{ 7952, 7957, 8 },
+	{ 7960, 7965, -8 },
+	{ 7968, 7975, 8 },
+	{ 7976, 7983, -8 },
+	{ 7984, 7991, 8 },
+	{ 7992, 7999, -8 },
+	{ 8000, 8005, 8 },
+	{ 8008, 8013, -8 },
+	{ 8017, 8017, 8 },
+	{ 8019, 8019, 8 },
+	{ 8021, 8021, 8 },
+	{ 8023, 8023, 8 },
+	{ 8025, 8025, -8 },
+	{ 8027, 8027, -8 },
+	{ 8029, 8029, -8 },
+	{ 8031, 8031, -8 },
+	{ 8032, 8039, 8 },
+	{ 8040, 8047, -8 },
+	{ 8048, 8049, 74 },
+	{ 8050, 8053, 86 },
+	{ 8054, 8055, 100 },
+	{ 8056, 8057, 128 },
+	{ 8058, 8059, 112 },
+	{ 8060, 8061, 126 },
+	{ 8064, 8071, 8 },
+	{ 8072, 8079, -8 },
+	{ 8080, 8087, 8 },
+	{ 8088, 8095, -8 },
+	{ 8096, 8103, 8 },
+	{ 8104, 8111, -8 },
+	{ 8112, 8113, 8 },
+	{ 8115, 8115, 9 },
+	{ 8120, 8121, -8 },
+	{ 8122, 8123, -74 },
+	{ 8124, 8124, -9 },
+	{ 8126, 8126, -7289 },
+	{ 8131, 8131, 9 },
+	{ 8136, 8139, -86 },
+	{ 8140, 8140, -9 },
+	{ 8144, 8145, 8 },
+	{ 8152, 8153, -8 },
+	{ 8154, 8155, -100 },
+	{ 8160, 8161, 8 },
+	{ 8165, 8165, 7 },
+	{ 8168, 8169, -8 },
+	{ 8170, 8171, -112 },
+	{ 8172, 8172, -7 },
+	{ 8179, 8179, 9 },
+	{ 8184, 8185, -128 },
+	{ 8186, 8187, -126 },
+	{ 8188, 8188, -9 },
+	{ 8486, 8486, -7549 },
+	{ 8490, 8490, -8415 },
+	{ 8491, 8491, -8294 },
+	{ 8498, 8498, 28 },
+	{ 8526, 8526, -28 },
+	{ 8544, 8559, 16 },
+	{ 8560, 8575, -16 },
+	{ 8579, 8580, OddEven },
+	{ 9398, 9423, 26 },
+	{ 9424, 9449, -26 },
+	{ 11264, 11310, 48 },
+	{ 11312, 11358, -48 },
+	{ 11360, 11361, EvenOdd },
+	{ 11362, 11362, -10743 },
+	{ 11363, 11363, -3814 },
+	{ 11364, 11364, -10727 },
+	{ 11365, 11365, -10795 },
+	{ 11366, 11366, -10792 },
+	{ 11367, 11372, OddEven },
+	{ 11373, 11373, -10780 },
+	{ 11374, 11374, -10749 },
+	{ 11375, 11375, -10783 },
+	{ 11376, 11376, -10782 },
+	{ 11378, 11379, EvenOdd },
+	{ 11381, 11382, OddEven },
+	{ 11390, 11391, -10815 },
+	{ 11392, 11491, EvenOdd },
+	{ 11499, 11502, OddEven },
+	{ 11520, 11557, -7264 },
+	{ 42560, 42605, EvenOdd },
+	{ 42624, 42647, EvenOdd },
+	{ 42786, 42799, EvenOdd },
+	{ 42802, 42863, EvenOdd },
+	{ 42873, 42876, OddEven },
+	{ 42877, 42877, -35332 },
+	{ 42878, 42887, EvenOdd },
+	{ 42891, 42892, OddEven },
+	{ 42893, 42893, -42280 },
+	{ 42896, 42897, EvenOdd },
+	{ 42912, 42921, EvenOdd },
+	{ 65313, 65338, 32 },
+	{ 65345, 65370, -32 },
+	{ 66560, 66599, 40 },
+	{ 66600, 66639, -40 },
+};
+int num_unicode_casefold = 282;
+
+// 1029 groups, 1050 pairs, 163 ranges
+CaseFold unicode_tolower[] = {
+	{ 65, 90, 32 },
+	{ 181, 181, 775 },
+	{ 192, 214, 32 },
+	{ 216, 222, 32 },
+	{ 256, 302, EvenOddSkip },
+	{ 306, 310, EvenOddSkip },
+	{ 313, 327, OddEvenSkip },
+	{ 330, 374, EvenOddSkip },
+	{ 376, 376, -121 },
+	{ 377, 381, OddEvenSkip },
+	{ 383, 383, -268 },
+	{ 385, 385, 210 },
+	{ 386, 388, EvenOddSkip },
+	{ 390, 390, 206 },
+	{ 391, 391, OddEven },
+	{ 393, 394, 205 },
+	{ 395, 395, OddEven },
+	{ 398, 398, 79 },
+	{ 399, 399, 202 },
+	{ 400, 400, 203 },
+	{ 401, 401, OddEven },
+	{ 403, 403, 205 },
+	{ 404, 404, 207 },
+	{ 406, 406, 211 },
+	{ 407, 407, 209 },
+	{ 408, 408, EvenOdd },
+	{ 412, 412, 211 },
+	{ 413, 413, 213 },
+	{ 415, 415, 214 },
+	{ 416, 420, EvenOddSkip },
+	{ 422, 422, 218 },
+	{ 423, 423, OddEven },
+	{ 425, 425, 218 },
+	{ 428, 428, EvenOdd },
+	{ 430, 430, 218 },
+	{ 431, 431, OddEven },
+	{ 433, 434, 217 },
+	{ 435, 437, OddEvenSkip },
+	{ 439, 439, 219 },
+	{ 440, 440, EvenOdd },
+	{ 444, 444, EvenOdd },
+	{ 452, 452, 2 },
+	{ 453, 453, OddEven },
+	{ 455, 455, 2 },
+	{ 456, 456, EvenOdd },
+	{ 458, 458, 2 },
+	{ 459, 475, OddEvenSkip },
+	{ 478, 494, EvenOddSkip },
+	{ 497, 497, 2 },
+	{ 498, 500, EvenOddSkip },
+	{ 502, 502, -97 },
+	{ 503, 503, -56 },
+	{ 504, 542, EvenOddSkip },
+	{ 544, 544, -130 },
+	{ 546, 562, EvenOddSkip },
+	{ 570, 570, 10795 },
+	{ 571, 571, OddEven },
+	{ 573, 573, -163 },
+	{ 574, 574, 10792 },
+	{ 577, 577, OddEven },
+	{ 579, 579, -195 },
+	{ 580, 580, 69 },
+	{ 581, 581, 71 },
+	{ 582, 590, EvenOddSkip },
+	{ 837, 837, 116 },
+	{ 880, 882, EvenOddSkip },
+	{ 886, 886, EvenOdd },
+	{ 902, 902, 38 },
+	{ 904, 906, 37 },
+	{ 908, 908, 64 },
+	{ 910, 911, 63 },
+	{ 913, 929, 32 },
+	{ 931, 939, 32 },
+	{ 962, 962, EvenOdd },
+	{ 975, 975, 8 },
+	{ 976, 976, -30 },
+	{ 977, 977, -25 },
+	{ 981, 981, -15 },
+	{ 982, 982, -22 },
+	{ 984, 1006, EvenOddSkip },
+	{ 1008, 1008, -54 },
+	{ 1009, 1009, -48 },
+	{ 1012, 1012, -60 },
+	{ 1013, 1013, -64 },
+	{ 1015, 1015, OddEven },
+	{ 1017, 1017, -7 },
+	{ 1018, 1018, EvenOdd },
+	{ 1021, 1023, -130 },
+	{ 1024, 1039, 80 },
+	{ 1040, 1071, 32 },
+	{ 1120, 1152, EvenOddSkip },
+	{ 1162, 1214, EvenOddSkip },
+	{ 1216, 1216, 15 },
+	{ 1217, 1229, OddEvenSkip },
+	{ 1232, 1318, EvenOddSkip },
+	{ 1329, 1366, 48 },
+	{ 4256, 4293, 7264 },
+	{ 7680, 7828, EvenOddSkip },
+	{ 7835, 7835, -58 },
+	{ 7838, 7838, -7615 },
+	{ 7840, 7934, EvenOddSkip },
+	{ 7944, 7951, -8 },
+	{ 7960, 7965, -8 },
+	{ 7976, 7983, -8 },
+	{ 7992, 7999, -8 },
+	{ 8008, 8013, -8 },
+	{ 8025, 8025, -8 },
+	{ 8027, 8027, -8 },
+	{ 8029, 8029, -8 },
+	{ 8031, 8031, -8 },
+	{ 8040, 8047, -8 },
+	{ 8072, 8079, -8 },
+	{ 8088, 8095, -8 },
+	{ 8104, 8111, -8 },
+	{ 8120, 8121, -8 },
+	{ 8122, 8123, -74 },
+	{ 8124, 8124, -9 },
+	{ 8126, 8126, -7173 },
+	{ 8136, 8139, -86 },
+	{ 8140, 8140, -9 },
+	{ 8152, 8153, -8 },
+	{ 8154, 8155, -100 },
+	{ 8168, 8169, -8 },
+	{ 8170, 8171, -112 },
+	{ 8172, 8172, -7 },
+	{ 8184, 8185, -128 },
+	{ 8186, 8187, -126 },
+	{ 8188, 8188, -9 },
+	{ 8486, 8486, -7517 },
+	{ 8490, 8490, -8383 },
+	{ 8491, 8491, -8262 },
+	{ 8498, 8498, 28 },
+	{ 8544, 8559, 16 },
+	{ 8579, 8579, OddEven },
+	{ 9398, 9423, 26 },
+	{ 11264, 11310, 48 },
+	{ 11360, 11360, EvenOdd },
+	{ 11362, 11362, -10743 },
+	{ 11363, 11363, -3814 },
+	{ 11364, 11364, -10727 },
+	{ 11367, 11371, OddEvenSkip },
+	{ 11373, 11373, -10780 },
+	{ 11374, 11374, -10749 },
+	{ 11375, 11375, -10783 },
+	{ 11376, 11376, -10782 },
+	{ 11378, 11378, EvenOdd },
+	{ 11381, 11381, OddEven },
+	{ 11390, 11391, -10815 },
+	{ 11392, 11490, EvenOddSkip },
+	{ 11499, 11501, OddEvenSkip },
+	{ 42560, 42604, EvenOddSkip },
+	{ 42624, 42646, EvenOddSkip },
+	{ 42786, 42798, EvenOddSkip },
+	{ 42802, 42862, EvenOddSkip },
+	{ 42873, 42875, OddEvenSkip },
+	{ 42877, 42877, -35332 },
+	{ 42878, 42886, EvenOddSkip },
+	{ 42891, 42891, OddEven },
+	{ 42893, 42893, -42280 },
+	{ 42896, 42896, EvenOdd },
+	{ 42912, 42920, EvenOddSkip },
+	{ 65313, 65338, 32 },
+	{ 66560, 66599, 40 },
+};
+int num_unicode_tolower = 163;
+
+
+
+} // namespace re2
+
+
--- a/re2/re2/unicode_casefold.h
+++ b/re2/re2/unicode_casefold.h
@ -0,0 +1,75 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Unicode case folding tables.
+
+// The Unicode case folding tables encode the mapping from one Unicode point
+// to the next largest Unicode point with equivalent folding.  The largest
+// point wraps back to the first.  For example, the tables map:
+//
+//     'A' -> 'a'
+//     'a' -> 'A'
+//
+//     'K' -> 'k'
+//     'k' -> 'K'  (Kelvin symbol)
+//     'K' -> 'K'
+//
+// Like everything Unicode, these tables are big.  If we represent the table
+// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
+// Most table entries look like the ones around them:
+// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
+// Instead of listing all the pairs explicitly, we make a list of ranges
+// and deltas, so that the table entries for 'A' through 'Z' can be represented
+// as a single entry { 'A', 'Z', +32 }.
+//
+// In addition to blocks that map to each other (A-Z mapping to a-z)
+// there are blocks of pairs that individually map to each other
+// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
+// For those, the special delta value EvenOdd marks even/odd pairs
+// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
+//
+// In this form, the table has 274 entries, about 3kB.  If we were to split
+// the table into one for 16-bit codes and an overflow table for larger ones,
+// we could get it down to about 1.5kB, but that's not worth the complexity.
+//
+// The grouped form also allows for efficient fold range calculations
+// rather than looping one character at a time.
+
+#ifndef RE2_UNICODE_CASEFOLD_H__
+#define RE2_UNICODE_CASEFOLD_H__
+
+#include "util/util.h"
+
+namespace re2 {
+
+// Special delta values.  A CaseFold whose delta is one of these describes
+// a run of neighbouring pairs instead of a fixed offset.
+enum {
+  // Even/odd pairs: an even rune maps to rune+1, an odd rune to rune-1.
+  EvenOdd = 1,
+  // Odd/even pairs: the mirror image of EvenOdd.
+  OddEven = -1,
+  // NOTE(review): the Skip variants are presumably the same pairings
+  // applied only to every other rune of the range; confirm in ApplyFold.
+  EvenOddSkip = 1 << 30,
+  OddEvenSkip = EvenOddSkip + 1,
+};
+
+// One entry of a fold table: every rune from lo through hi follows the
+// same rule, e.g. { 'A', 'Z', +32 }.
+struct CaseFold {
+  uint32 lo;    // first rune covered by this entry
+  uint32 hi;    // last rune covered by this entry
+  int32 delta;  // offset to apply, or one of EvenOdd, OddEven,
+                // EvenOddSkip, OddEvenSkip
+};
+
+// The case folding table described at the top of this file.
+// num_unicode_casefold is presumably its entry count.
+extern CaseFold unicode_casefold[];
+extern int num_unicode_casefold;
+
+// A second table in the same CaseFold format.
+// NOTE(review): judging by the name this maps runes to lower case;
+// confirm against the code that consumes it.
+extern CaseFold unicode_tolower[];
+extern int num_unicode_tolower;
+
+// Returns the CaseFold* in the tables that contains rune.
+// If rune is not in the tables, returns the first CaseFold* after rune.
+// If rune is larger than any value in the tables, returns NULL.
+// NOTE(review): the two unnamed parameters are presumably the table to
+// search and its number of entries; confirm against the definition.
+extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
+
+// Returns the result of applying the fold f to the rune r.
+extern Rune ApplyFold(CaseFold *f, Rune r);
+
+}  // namespace re2
+
+#endif  // RE2_UNICODE_CASEFOLD_H__
--- a/re2/re2/unicode_groups.cc
+++ b/re2/re2/unicode_groups.cc
--- a/re2/re2/unicode_groups.h
+++ b/re2/re2/unicode_groups.h
@ -0,0 +1,64 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Unicode character groups.
+
+// The codes get split into ranges of 16-bit codes
+// and ranges of 32-bit codes.  It would be simpler
+// to use only 32-bit ranges, but these tables are large
+// enough to warrant extra care.
+//
+// Using just 32-bit ranges gives 27 kB of data.
+// Adding 16-bit ranges gives 18 kB of data.
+// Adding an extra table of 16-bit singletons would reduce
+// to 16.5 kB of data but make the data harder to use;
+// we don't bother.
+
+#ifndef RE2_UNICODE_GROUPS_H__
+#define RE2_UNICODE_GROUPS_H__
+
+#include "util/util.h"
+
+namespace re2 {
+
+// A range of 16-bit codes running from lo to hi.
+// NOTE(review): presumably both ends are inclusive; confirm against the
+// generated tables.
+struct URange16
+{
+  uint16 lo;
+  uint16 hi;
+};
+
+// A range of 32-bit codes running from lo to hi; used for codes that do
+// not fit in a URange16.
+// NOTE(review): presumably both ends are inclusive; confirm against the
+// generated tables.
+struct URange32
+{
+  uint32 lo;
+  uint32 hi;
+};
+
+// A named character group, stored as a list of 16-bit ranges plus a list
+// of 32-bit ranges.
+struct UGroup
+{
+  const char *name;  // group name, e.g. "Nd" or "[:alpha:]"
+  int sign;  // +1 for [abc], -1 for [^abc]
+  URange16 *r16;  // the 16-bit ranges
+  int nr16;       // presumably the number of entries in r16
+  URange32 *r32;  // the 32-bit ranges
+  int nr32;       // presumably the number of entries in r32
+};
+
+// Each table below is paired with a num_* variable, presumably holding
+// the number of entries in that table.
+
+// Named by property or script name (e.g., "Nd", "N", "Han").
+// Negated groups are not included.
+extern UGroup unicode_groups[];
+extern int num_unicode_groups;
+
+// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
+// Negated groups are included.
+extern UGroup posix_groups[];
+extern int num_posix_groups;
+
+// Named by Perl name (e.g., "\\d", "\\D").
+// Negated groups are included.
+extern UGroup perl_groups[];
+extern int num_perl_groups;
+
+}  // namespace re2
+
+#endif  // RE2_UNICODE_GROUPS_H__
--- a/re2/re2/variadic_function.h
+++ b/re2/re2/variadic_function.h
@ -0,0 +1,346 @@
+// Copyright 2010 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_VARIADIC_FUNCTION_H_
+#define RE2_VARIADIC_FUNCTION_H_
+
+namespace re2 {
+
+// VariadicFunction2 is a function object that forwards a call taking two
+// fixed parameters (p0, p1) plus zero to 32 trailing "const Arg&"
+// arguments to Func.
+//
+// Every operator() overload collects the addresses of its trailing
+// arguments, in order, into a local array and returns
+// Func(p0, p1, args, count).  With no trailing arguments Func receives a
+// null array and a count of 0.
+//
+// The array is local to operator(), so it is gone once the call returns;
+// Func must not keep the pointer.
+template <typename Result, typename Param0, typename Param1, typename Arg,
+          Result (*Func)(Param0, Param1, const Arg* const [], int count)>
+class VariadicFunction2 {
+ public:
+  VariadicFunction2() {}
+
+  // No trailing arguments: null array, count 0.
+  Result operator()(Param0 p0, Param1 p1) const {
+    return Func(p0, p1, 0, 0);
+  }
+
+  // The overloads below differ only in how many trailing arguments they
+  // accept (1 through 32).
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
+    const Arg* const args[] = { &a0 };
+    return Func(p0, p1, args, 1);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
+    const Arg* const args[] = { &a0, &a1 };
+    return Func(p0, p1, args, 2);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2) const {
+    const Arg* const args[] = { &a0, &a1, &a2 };
+    return Func(p0, p1, args, 3);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3 };
+    return Func(p0, p1, args, 4);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
+    return Func(p0, p1, args, 5);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
+    return Func(p0, p1, args, 6);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
+    return Func(p0, p1, args, 7);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
+    return Func(p0, p1, args, 8);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
+    return Func(p0, p1, args, 9);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9 };
+    return Func(p0, p1, args, 10);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10 };
+    return Func(p0, p1, args, 11);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11 };
+    return Func(p0, p1, args, 12);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12 };
+    return Func(p0, p1, args, 13);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13 };
+    return Func(p0, p1, args, 14);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14 };
+    return Func(p0, p1, args, 15);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15 };
+    return Func(p0, p1, args, 16);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
+    return Func(p0, p1, args, 17);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
+    return Func(p0, p1, args, 18);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
+    return Func(p0, p1, args, 19);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
+    return Func(p0, p1, args, 20);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
+        &a20 };
+    return Func(p0, p1, args, 21);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21 };
+    return Func(p0, p1, args, 22);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22 };
+    return Func(p0, p1, args, 23);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23 };
+    return Func(p0, p1, args, 24);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24 };
+    return Func(p0, p1, args, 25);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25 };
+    return Func(p0, p1, args, 26);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
+      const Arg& a26) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25, &a26 };
+    return Func(p0, p1, args, 27);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
+      const Arg& a26, const Arg& a27) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25, &a26, &a27 };
+    return Func(p0, p1, args, 28);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
+      const Arg& a26, const Arg& a27, const Arg& a28) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
+    return Func(p0, p1, args, 29);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
+      const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
+    return Func(p0, p1, args, 30);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
+      const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
+      const Arg& a30) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
+    return Func(p0, p1, args, 31);
+  }
+
+  Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
+      const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
+      const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
+      const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
+      const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
+      const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
+      const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
+      const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
+      const Arg& a30, const Arg& a31) const {
+    const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
+        &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
+        &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
+    return Func(p0, p1, args, 32);
+  }
+};
+
+}  // namespace re2
+
+#endif  // RE2_VARIADIC_FUNCTION_H_
--- a/re2/re2/walker-inl.h
+++ b/re2/re2/walker-inl.h
@ -0,0 +1,244 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Helper class for traversing Regexps without recursion.
+// Clients should declare their own subclasses that override
+// the PreVisit and PostVisit methods, which are called before
+// and after visiting the subexpressions.
+
+// Not quite the Visitor pattern, because (among other things)
+// the Visitor pattern is recursive.
+
+#ifndef RE2_WALKER_INL_H__
+#define RE2_WALKER_INL_H__
+
+#include "re2/regexp.h"
+
+namespace re2 {
+
+template<typename T> struct WalkState;
+
+// Non-recursive traversal of a Regexp tree.  T is the value type that is
+// passed down to children and returned back up to parents.
+template<typename T> class Regexp::Walker {
+ public:
+  Walker();
+  virtual ~Walker();
+
+  // Virtual method called before visiting re's children.
+  // PreVisit passes ownership of its return value to its caller.
+  // The T that PreVisit returns will be passed to PostVisit as pre_arg
+  // and passed to the child PreVisits and PostVisits as parent_arg.
+  // At the top-most Regexp, parent_arg is the top_arg passed to Walk.
+  // If PreVisit sets *stop to true, the walk does not recurse
+  // into the children.  Instead it behaves as though the return
+  // value from PreVisit is the return value from PostVisit.
+  // The default PreVisit returns parent_arg.
+  virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
+
+  // Virtual method called after visiting re's children.
+  // The pre_arg is the T that PreVisit returned.
+  // The child_args is an array of the nchild_args Ts that the child
+  // PostVisits returned.
+  // PostVisit takes ownership of pre_arg.
+  // PostVisit takes ownership of the Ts
+  // in *child_args, but not the array itself.
+  // PostVisit passes ownership of its return value
+  // to its caller.
+  // The default PostVisit simply returns pre_arg.
+  virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
+                      T* child_args, int nchild_args);
+
+  // Virtual method called to copy a T,
+  // when Walk notices that more than one child is the same re.
+  // The default Copy returns arg itself.
+  virtual T Copy(T arg);
+
+  // Virtual method called to do a "quick visit" of the re,
+  // but not its children.  Only called once the visit budget
+  // has been used up and we're trying to abort the walk
+  // as quickly as possible.  Should return a value that
+  // makes sense for the parent PostVisits still to be run.
+  // This function is (hopefully) only called by
+  // WalkExponential, but must be implemented by all clients,
+  // just in case.
+  virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
+
+  // Walks over a regular expression.
+  // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
+  // Returns the T returned by PostVisit on re.
+  T Walk(Regexp* re, T top_arg);
+
+  // Like Walk, but doesn't use Copy.  This can lead to
+  // exponential runtimes on cross-linked Regexps like the
+  // ones generated by Simplify.  To help limit this,
+  // at most max_visits nodes will be visited and then
+  // the walk will be cut off early.
+  // If the walk *is* cut off early, ShortVisit(re)
+  // will be called on regexps that cannot be fully
+  // visited rather than calling PreVisit/PostVisit.
+  T WalkExponential(Regexp* re, T top_arg, int max_visits);
+
+  // Clears the stack.  Should never be necessary, since
+  // Walk always enters and exits with an empty stack.
+  // Logs DFATAL if stack is not already clear.
+  void Reset();
+
+  // Returns whether walk was cut off.
+  bool stopped_early() { return stopped_early_; }
+
+ private:
+  // Walk state for the entire traversal.
+  stack<WalkState<T> >* stack_;
+  // Set to true once the visit budget runs out during a walk.
+  bool stopped_early_;
+  // Remaining visit budget; decremented once for each node about to be
+  // pre-visited.
+  int max_visits_;
+
+  // Shared implementation of Walk and WalkExponential.  use_copy selects
+  // whether repeated adjacent children are handled with Copy.
+  T WalkInternal(Regexp* re, T top_arg, bool use_copy);
+
+  DISALLOW_EVIL_CONSTRUCTORS(Walker);
+};
+
+// Default PreVisit: hands parent_arg straight through and never writes to
+// *stop, so the caller's value of *stop is left as it was.
+template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
+                                                   T parent_arg,
+                                                   bool* stop) {
+  return parent_arg;
+}
+
+// Default PostVisit: returns pre_arg and ignores re, parent_arg and the
+// child results.
+template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
+                                                    T parent_arg,
+                                                    T pre_arg,
+                                                    T* child_args,
+                                                    int nchild_args) {
+  return pre_arg;
+}
+
+// Default Copy: returns arg itself (a plain copy of the T value).
+template<typename T> T Regexp::Walker<T>::Copy(T arg) {
+  return arg;
+}
+
+// State about a single level in the traversal.
+template<typename T> struct WalkState {
+  // Starts a level for re whose PreVisit has not run yet (n == -1).
+  //
+  // The constructor is named with the plain class name: spelling it as a
+  // template-id ("WalkState<T>(...)") is rejected by C++20 compilers.
+  // pre_arg and child_arg are value-initialized so that copying a freshly
+  // built WalkState (as pushing it onto the stack does) never reads an
+  // indeterminate T.
+  WalkState(Regexp* re, T parent)
+    : re(re),
+      n(-1),
+      parent_arg(parent),
+      pre_arg(),
+      child_arg(),
+      child_args(NULL) { }
+
+  Regexp* re;  // The regexp
+  int n;  // The index of the next child to process; -1 means need to PreVisit
+  T parent_arg;  // Accumulated arguments.
+  T pre_arg;  // Value returned by PreVisit for this level.
+  T child_arg;  // One-element buffer for child_args.
+  T* child_args;  // Results of the children visited so far, or NULL.
+};
+
+// Creates a walker with an empty traversal stack.  Every member is given
+// a defined value here; max_visits_ is overwritten by Walk and
+// WalkExponential before each traversal.
+template<typename T> Regexp::Walker<T>::Walker()
+  : stack_(new stack<WalkState<T> >),
+    stopped_early_(false),
+    max_visits_(0) {
+}
+
+// Drains any leftover traversal state through Reset(), then frees the
+// stack object allocated by the constructor.
+template<typename T> Regexp::Walker<T>::~Walker() {
+  Reset();
+  delete stack_;
+}
+
+// Clears the stack.  Should never be necessary, since
+// Walk always enters and exits with an empty stack.
+// Logs DFATAL if stack is not already clear.
+//
+// A leftover level's child_args is one of three things: NULL, the address
+// of that level's own child_arg buffer (regexp with exactly one child), or
+// an array obtained from new T[] (regexp with several children).  Only the
+// last kind is heap memory, and it must be released with delete[].
+template<typename T> void Regexp::Walker<T>::Reset() {
+  if (stack_ && stack_->size() > 0) {
+    LOG(DFATAL) << "Stack not empty.";
+    while (stack_->size() > 0) {
+      WalkState<T>& level = stack_->top();
+      // Never free the inline one-element buffer; delete[] on NULL is a
+      // no-op, so the remaining cases need no further test.
+      if (level.child_args != &level.child_arg)
+        delete[] level.child_args;
+      stack_->pop();
+    }
+  }
+}
+
+// Iterative traversal shared by Walk and WalkExponential.
+// Each stack entry is one level of the traversal; its field n records how
+// far that level has progressed (-1: not yet pre-visited, otherwise the
+// index of the next child).  Returns the value produced for the top-level
+// re, or top_arg if re is NULL.  The caller must have set max_visits_.
+template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
+                                                       bool use_copy) {
+  // Start from an empty stack (logs if a previous walk left state behind).
+  Reset();
+
+  if (re == NULL) {
+    LOG(DFATAL) << "Walk NULL";
+    return top_arg;
+  }
+
+  stack_->push(WalkState<T>(re, top_arg));
+
+  WalkState<T>* s;
+  for (;;) {
+    T t;  // Result of the level that finishes in this iteration.
+    s = &stack_->top();
+    // Shadows the parameter: from here on re is the regexp of the
+    // current level.
+    Regexp* re = s->re;
+    switch (s->n) {
+      case -1: {
+        // First time at this level.  If the budget is exhausted, produce
+        // a quick result without PreVisit/PostVisit or the children.
+        if (--max_visits_ < 0) {
+          stopped_early_ = true;
+          t = ShortVisit(re, s->parent_arg);
+          break;
+        }
+        bool stop = false;
+        s->pre_arg = PreVisit(re, s->parent_arg, &stop);
+        if (stop) {
+          // PreVisit asked not to descend: its value stands in for the
+          // PostVisit result.
+          t = s->pre_arg;
+          break;
+        }
+        s->n = 0;
+        // Pick storage for the child results: nothing for a leaf, the
+        // inline buffer for one child, a heap array for several.
+        s->child_args = NULL;
+        if (re->nsub_ == 1)
+          s->child_args = &s->child_arg;
+        else if (re->nsub_ > 1)
+          s->child_args = new T[re->nsub_];
+        // Fall through.
+      }
+      default: {
+        if (re->nsub_ > 0) {
+          Regexp** sub = re->sub();
+          if (s->n < re->nsub_) {
+            if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
+              // Same regexp as the previous child: reuse that result via
+              // Copy instead of walking it again.
+              s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
+              s->n++;
+            } else {
+              // Descend into the next child; it receives this level's
+              // pre_arg as its parent_arg.
+              stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
+            }
+            continue;
+          }
+        }
+
+        // All children are done (or there were none).
+        t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
+        // Only the several-children case owns heap storage.
+        if (re->nsub_ > 1)
+          delete[] s->child_args;
+        break;
+      }
+    }
+
+    // We've finished stack_->top().
+    // Update next guy down.
+    stack_->pop();
+    if (stack_->size() == 0)
+      return t;
+    s = &stack_->top();
+    // Record t as the result of the parent's current child.
+    // NOTE(review): a parent with children always has child_args set in
+    // case -1 above, so the else branch looks purely defensive.
+    if (s->child_args != NULL)
+      s->child_args[s->n] = t;
+    else
+      s->child_arg = t;
+    s->n++;
+  }
+}
+
+// Walks re with Copy enabled (use_copy = true) and a fixed visit budget.
+template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
+  // Without the exponential walking behavior,
+  // this budget should be more than enough for any
+  // regexp, and yet not enough to get us in trouble
+  // as far as CPU time.
+  max_visits_ = 1000000;
+  return WalkInternal(re, top_arg, true);
+}
+
+// Walks re with Copy disabled (use_copy = false), using the caller's
+// max_visits as the visit budget.
+template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
+                                                          int max_visits) {
+  max_visits_ = max_visits;
+  return WalkInternal(re, top_arg, false);
+}
+
+}  // namespace re2
+
+#endif  // RE2_WALKER_INL_H__
--- a/re2/runtests
+++ b/re2/runtests
@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+# Runs every test program named on the command line, one after another.
+# The output of each program goes to "<program>.log"; a PASS or FAIL line
+# is printed per program.  Exits 0 only if every program exited 0.
+#
+# All expansions of $i are quoted so that a path containing spaces or
+# glob characters is treated as one word.
+success=true
+for i
+do
+	printf "%-40s" "$i"
+	# The program runs inside a child sh and is passed to it as $0 rather
+	# than pasted into the command string.  The child's own stderr is
+	# discarded; presumably this hides the shell's report when a test
+	# crashes.
+	if sh -c '"$0" >"$0.log" 2>&1' "$i" 2>/dev/null
+	then
+		echo PASS
+	else
+		echo "FAIL; output in $i.log"
+		success=false
+	fi
+done
+
+if $success; then
+	echo 'ALL TESTS PASSED.'
+	exit 0
+fi
+echo 'TESTS FAILED.'
+exit 1
--- a/re2/testinstall.cc
+++ b/re2/testinstall.cc
@ -0,0 +1,13 @@
+#include <re2/re2.h>
+#include <stdio.h>
+
+using namespace re2;
+
+// Smoke test for an installed RE2: performs a single FullMatch and reports
+// the outcome on stdout and through the exit status (0 = pass, 2 = fail).
+int main(void) {
+	if(!RE2::FullMatch("axbyc", "a.*b.*c")) {
+		printf("FAIL\n");
+		return 2;
+	}
+	printf("PASS\n");
+	return 0;
+}
--- a/re2/util/arena.cc
+++ b/re2/util/arena.cc
@ -0,0 +1,168 @@
+// Copyright 2000 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/util.h"
+
+namespace re2 {
+
+// ----------------------------------------------------------------------
+// UnsafeArena::UnsafeArena()
+// UnsafeArena::~UnsafeArena()
+//    Destroying the arena automatically calls Reset()
+// ----------------------------------------------------------------------
+
+
+UnsafeArena::UnsafeArena(const size_t block_size)
+  : block_size_(block_size),
+    freestart_(NULL),                   // real value is assigned by Reset()
+    last_alloc_(NULL),
+    remaining_(0),
+    blocks_alloced_(1),
+    overflow_blocks_(NULL) {
+  assert(block_size > kDefaultAlignment);
+
+  // The first block is allocated eagerly; blocks_alloced_ already counts it.
+  AllocatedBlock& first = first_blocks_[0];
+  first.size = block_size_;
+  first.mem = reinterpret_cast<char*>(malloc(block_size_));
+
+  // Reset() points the allocation cursor at the (aligned) start of that block.
+  Reset();
+}
+
+UnsafeArena::~UnsafeArena() {
+  // FreeBlocks() releases everything except the first block and is
+  // expected to leave overflow_blocks_ NULL.
+  FreeBlocks();
+  assert(overflow_blocks_ == NULL);
+  // Whatever is still counted in blocks_alloced_ stays allocated by
+  // default, so it has to be released here.
+  AllocatedBlock* const end = first_blocks_ + blocks_alloced_;
+  for (AllocatedBlock* blk = first_blocks_; blk < end; ++blk)
+    free(blk->mem);
+}
+
+// ----------------------------------------------------------------------
+// UnsafeArena::Reset()
+//    Clears all the memory an arena is using.
+// ----------------------------------------------------------------------
+
+void UnsafeArena::Reset() {
+  // Drop every block but the first, then rewind the cursor onto it.
+  FreeBlocks();
+  last_alloc_ = NULL;
+  freestart_ = first_blocks_[0].mem;
+  remaining_ = first_blocks_[0].size;
+
+  // The first block is not known to be kDefaultAlignment-aligned, so
+  // skip forward to the next aligned address if necessary.
+  const int misalign = reinterpret_cast<uintptr_t>(freestart_) &
+                       (kDefaultAlignment-1);
+  if (misalign > 0) {
+    const int pad = kDefaultAlignment - misalign;
+    freestart_ += pad;
+    remaining_ -= pad;
+  }
+  freestart_when_empty_ = freestart_;
+  assert((reinterpret_cast<uintptr_t>(freestart_) & (kDefaultAlignment-1)) == 0);
+}
+
+// -------------------------------------------------------------
+// UnsafeArena::AllocNewBlock()
+//    Adds and returns an AllocatedBlock.
+//    The returned AllocatedBlock* is valid until the next call
+//    to AllocNewBlock or Reset.  (i.e. anything that might
+//    affect overflow_blocks_).
+// -------------------------------------------------------------
+
+UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
+  // Pick the bookkeeping slot that will describe the new block.
+  AllocatedBlock* slot = NULL;
+  if ( blocks_alloced_ >= arraysize(first_blocks_) ) {
+    // The fixed array is full: spill into the lazily created vector.
+    if (overflow_blocks_ == NULL)
+      overflow_blocks_ = new vector<AllocatedBlock>;
+    // Grow the vector by one element and use that new last element.
+    overflow_blocks_->resize(overflow_blocks_->size()+1);
+    slot = &overflow_blocks_->back();
+  } else {
+    // Still room among the pre-allocated slots.
+    slot = &first_blocks_[blocks_alloced_];
+    blocks_alloced_++;
+  }
+
+  // Allocate the memory itself and record its size.
+  slot->mem = reinterpret_cast<char*>(malloc(block_size));
+  slot->size = block_size;
+
+  return slot;
+}
+
+// ----------------------------------------------------------------------
+// UnsafeArena::GetMemoryFallback()
+//    We take memory out of our pool, aligned on the byte boundary
+//    requested.  If we don't have space in our current pool, we
+//    allocate a new block (wasting the remaining space in the
+//    current block) and give you that.  If your memory needs are
+//    too big for a single block, we make a special your-memory-only
+//    allocation -- this is equivalent to not using the arena at all.
+// ----------------------------------------------------------------------
+
+void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
+  // Returns size bytes aligned to align (a power of two), or NULL when
+  // size is 0.  Requests larger than a quarter of block_size_ get a
+  // dedicated block; everything else is carved out of the current block,
+  // moving on to a fresh block when the current one cannot hold it.
+  if (size == 0)
+    return NULL;             // stl/stl_alloc.h says this is okay
+
+  assert(align > 0 && 0 == (align & (align - 1)));  // must be power of 2
+
+  // If the object is more than a quarter of the block size, allocate
+  // it separately to avoid wasting too much space in leftover bytes
+  if (block_size_ == 0 || size > block_size_/4) {
+    // then it gets its own block in the arena; the block comes straight
+    // from AllocNewBlock(), so only the default alignment is assumed.
+    assert(align <= kDefaultAlignment);
+    // This block stays separate from the rest of the world; in particular
+    // we don't update last_alloc_ so you can't reclaim space on this block.
+    return AllocNewBlock(size)->mem;
+  }
+
+  const int overage =
+    (reinterpret_cast<uintptr_t>(freestart_) & (align-1));
+  if (overage) {
+    const size_t waste = align - overage;
+    if (waste < remaining_) {
+      freestart_ += waste;
+      remaining_ -= waste;
+    } else {
+      // The padding alone uses up the block.  Leave freestart_ where it
+      // is instead of advancing it past the end of the block: with
+      // remaining_ == 0 the check below always switches to a new block
+      // and reassigns freestart_.
+      remaining_ = 0;
+    }
+  }
+  if (size > remaining_) {
+    // Not enough room left: abandon the rest of the current block.
+    AllocatedBlock *block = AllocNewBlock(block_size_);
+    freestart_ = block->mem;
+    remaining_ = block->size;
+  }
+  remaining_ -= size;
+  last_alloc_ = freestart_;
+  freestart_ += size;
+  assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
+  return reinterpret_cast<void*>(last_alloc_);
+}
+
+// ----------------------------------------------------------------------
+// UnsafeArena::FreeBlocks()
+//    Unlike GetMemory(), which does actual work, ReturnMemory() is a
+//    no-op: we don't "free" memory until Reset() is called.  We do
+//    update some stats, though.  Note we do no checking that the
+//    pointer you pass in was actually allocated by us, or that it
+//    was allocated for the size you say, so be careful here!
+//       FreeBlocks() does the work for Reset(), actually freeing all
+//    memory allocated in one fell swoop.
+// ----------------------------------------------------------------------
+
+void UnsafeArena::FreeBlocks() {
+  // Release every fixed-array block after index 0 and clear its slot;
+  // the first block is kept allocated.
+  int idx = 1;
+  while ( idx < blocks_alloced_ ) {
+    AllocatedBlock& blk = first_blocks_[idx++];
+    free(blk.mem);
+    blk.mem = NULL;
+    blk.size = 0;
+  }
+  blocks_alloced_ = 1;
+
+  // Nothing more to do unless blocks spilled into the overflow vector.
+  if (overflow_blocks_ == NULL)
+    return;
+  for (size_t j = 0; j < overflow_blocks_->size(); ++j)
+    free((*overflow_blocks_)[j].mem);
+  delete overflow_blocks_;             // These should be used very rarely
+  overflow_blocks_ = NULL;
+}
+
+}  // namespace re2
--- a/re2/util/arena.h
+++ b/re2/util/arena.h
@ -0,0 +1,103 @@
+// Copyright 2000 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Sometimes it is necessary to allocate a large number of small
+// objects.  Doing this the usual way (malloc, new) is slow,
+// especially for multithreaded programs.  An UnsafeArena provides a
+// mark/release method of memory management: it asks for a large chunk
+// from the operating system and doles it out bit by bit as required.
+// Then you free all the memory at once by calling UnsafeArena::Reset().
+// The "Unsafe" refers to the fact that UnsafeArena is not safe to
+// call from multiple threads.
+//
+// The global operator new that can be used as follows:
+//
+//   #include "lib/arena-inl.h"
+//
+//   UnsafeArena arena(1000);
+//   Foo* foo = new (AllocateInArena, &arena) Foo;
+//
+
+#ifndef RE2_UTIL_ARENA_H_
+#define RE2_UTIL_ARENA_H_
+
+namespace re2 {
+
+// This class is thread-compatible.
+// Bump allocator: hands out memory from malloc'ed blocks and releases it
+// all at once in Reset() or the destructor.
+class UnsafeArena {
+ public:
+  // Allocates the first block (block_size bytes) immediately.  block_size
+  // must be greater than kDefaultAlignment (asserted in the constructor).
+  UnsafeArena(const size_t block_size);
+  // Frees every block, including the first one.
+  virtual ~UnsafeArena();
+
+  // Frees all blocks except the first and rewinds allocation to the
+  // (aligned) start of that first block.
+  void Reset();
+
+  // This should be the worst-case alignment for any type.  This is
+  // good for IA-32, SPARC version 7 (the last one I know), and
+  // supposedly Alpha.  i386 would be more time-efficient with a
+  // default alignment of 8, but ::operator new() uses alignment of 4,
+  // and an assertion in GetMemoryFallback() will fail if a request that
+  // needs its own block asks for a larger alignment.
+#ifdef __i386__
+  static const int kDefaultAlignment = 4;
+#else
+  static const int kDefaultAlignment = 8;
+#endif
+
+ private:
+  // Slow path of GetMemory(): handles size == 0 (returns NULL), alignment
+  // padding, oversized requests and running out of room in the block.
+  void* GetMemoryFallback(const size_t size, const int align);
+
+ public:
+  // Returns size bytes aligned to align.  The inline fast path only covers
+  // byte-aligned requests that fit strictly inside the current block;
+  // everything else goes to GetMemoryFallback().
+  void* GetMemory(const size_t size, const int align) {
+    if ( size > 0 && size < remaining_ && align == 1 ) {       // common case
+      last_alloc_ = freestart_;
+      freestart_ += size;
+      remaining_ -= size;
+      return reinterpret_cast<void*>(last_alloc_);
+    }
+    return GetMemoryFallback(size, align);
+  }
+
+ private:
+  // One malloc'ed chunk of memory and its size in bytes.
+  struct AllocatedBlock {
+    char *mem;
+    size_t size;
+  };
+
+  // The returned AllocatedBlock* is valid until the next call to AllocNewBlock
+  // or Reset (i.e. anything that might affect overflow_blocks_).
+  AllocatedBlock *AllocNewBlock(const size_t block_size);
+
+  // NOTE(review): no definition of IndexToBlock appears among the
+  // UnsafeArena members defined in arena.cc -- confirm it is defined or
+  // used elsewhere.
+  const AllocatedBlock *IndexToBlock(int index) const;
+
+  const size_t block_size_;  // size used for each regular block
+  char* freestart_;         // beginning of the free space in most recent block
+  char* freestart_when_empty_;  // beginning of the free space when we're empty
+  // Start of the most recent in-block allocation.  (Original note: "used to
+  // make sure ReturnBytes() is safe"; no ReturnBytes() is declared here.)
+  char* last_alloc_;         // used to make sure ReturnBytes() is safe
+  size_t remaining_;         // bytes left after freestart_ in the current block
+  // STL vector isn't as efficient as it could be, so we use an array at first
+  int blocks_alloced_;       // how many of the first_blocks_ have been alloced
+  AllocatedBlock first_blocks_[16];   // the length of this array is arbitrary
+  // if the first_blocks_ aren't enough, expand into overflow_blocks_.
+  // NULL until the first overflow block is needed.
+  vector<AllocatedBlock>* overflow_blocks_;
+
+  void FreeBlocks();         // Frees all except first block
+
+  DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
+};
+
+// Tag type that selects the arena form of operator new declared below.
+// Syntax: new (AllocateInArena, arena) MyClass;
+// where arena is a pointer to an UnsafeArena.
+enum AllocateInArenaType { AllocateInArena };
+
+}  // namespace re2
+
+// Placement form of operator new that takes its storage from an
+// UnsafeArena:  new (AllocateInArena, arena) T.
+inline void* operator new(size_t size,
+                          re2::AllocateInArenaType /* unused */,
+                          re2::UnsafeArena *arena) {
+  // The memory is requested with an alignment argument of 1.
+  char* const storage = reinterpret_cast<char*>(arena->GetMemory(size, 1));
+  return storage;
+}
+
+#endif  // RE2_UTIL_ARENA_H_
+
--- a/re2/util/atomicops.h
+++ b/re2/util/atomicops.h
@ -0,0 +1,79 @@
+// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_ATOMICOPS_H__
+#define RE2_UTIL_ATOMICOPS_H__
+
+// WriteMemoryBarrier(): one implementation is selected per target
+// architecture; unknown architectures fall back to a mutex lock/unlock.
+#if defined(__i386__)
+
+// 32-bit x86: issues an xchg against a local stack slot.
+// NOTE(review): unlike the other variants this asm statement declares no
+// outputs and no "memory" clobber even though xchg writes both operands --
+// confirm the constraints are what is intended.
+static inline void WriteMemoryBarrier() {
+  int x;
+  __asm__ __volatile__("xchgl (%0),%0"  // The lock prefix is implicit for xchg.
+                       :: "r" (&x));
+}
+
+#elif defined(__x86_64__)
+
+// 64-bit implementations of memory barrier can be simpler, because
+// "sfence" is guaranteed to exist.
+static inline void WriteMemoryBarrier() {
+  __asm__ __volatile__("sfence" : : : "memory");
+}
+
+#elif defined(__ppc__)
+
+// PowerPC: eieio instruction, with a "memory" clobber for the compiler.
+static inline void WriteMemoryBarrier() {
+  __asm__ __volatile__("eieio" : : : "memory");
+}
+
+#elif defined(__alpha__)
+
+// Alpha: wmb instruction, with a "memory" clobber for the compiler.
+static inline void WriteMemoryBarrier() {
+  __asm__ __volatile__("wmb" : : : "memory");
+}
+
+#else
+
+#include "util/mutex.h"
+
+// Portable fallback: lock and unlock a function-local mutex.
+static inline void WriteMemoryBarrier() {
+  // Slight overkill, but good enough:
+  // any mutex implementation must have
+  // a read barrier after the lock operation and
+  // a write barrier before the unlock operation.
+  //
+  // It may be worthwhile to write architecture-specific
+  // barriers for the common platforms, as above, but
+  // this is a correct fallback.
+  re2::Mutex mu;
+  re2::MutexLock l(&mu);
+}
+
+/*
+#error Need WriteMemoryBarrier for architecture.
+
+// Windows
+inline void WriteMemoryBarrier() {
+  LONG x;
+  ::InterlockedExchange(&x, 0);
+}
+*/
+
+#endif
+
+// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
+// use read barriers for the readers too.
+#if defined(__alpha__)
+
+// Alpha: issues the mb instruction, with a "memory" clobber for the compiler.
+static inline void MaybeReadMemoryBarrier() {
+  __asm__ __volatile__("mb" : : : "memory");
+}
+
+#else
+
+// Every other architecture: intentionally a no-op.
+static inline void MaybeReadMemoryBarrier() {}
+
+#endif // __alpha__
+
+#endif  // RE2_UTIL_ATOMICOPS_H__
--- a/re2/util/benchmark.cc
+++ b/re2/util/benchmark.cc
@ -0,0 +1,153 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/util.h"
+#include "util/flags.h"
+#include "util/benchmark.h"
+#include "re2/re2.h"
+
+// Defines the string flag test_tmpdir with default "/var/tmp".
+DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
+
+using testing::Benchmark;
+using namespace re2;
+
+// Registry of benchmarks: filled by Benchmark::Register(), walked by main().
+static Benchmark* benchmarks[10000];
+// Number of entries of benchmarks[] currently in use.
+static int nbenchmarks;
+
+// Adds this benchmark to the global registry and normalizes its size
+// range so that 1 <= lo <= hi.  Exits with status 2 when the registry is
+// already full instead of writing past the end of benchmarks[].
+void Benchmark::Register() {
+	const int kMaxBenchmarks = (int)(sizeof benchmarks / sizeof benchmarks[0]);
+	if(nbenchmarks >= kMaxBenchmarks) {
+		fprintf(stderr, "%s: too many benchmarks (limit %d)\n", name, kMaxBenchmarks);
+		exit(2);
+	}
+	benchmarks[nbenchmarks] = this;
+	if(lo < 1)
+		lo = 1;
+	if(hi < lo)
+		hi = lo;
+	nbenchmarks++;
+}
+
+// Returns the current gettimeofday() reading converted to nanoseconds,
+// or -1 when gettimeofday() reports an error.
+static int64 nsec() {
+	struct timeval now;
+	const int rc = gettimeofday(&now, 0);
+	if(rc < 0)
+		return -1;
+	// Seconds are widened to int64 before scaling; microseconds are
+	// scaled in their native type and then added.
+	const int64 from_sec = (int64)now.tv_sec*1000*1000*1000;
+	return from_sec + now.tv_usec*1000;
+}
+
+// Measurement state for the benchmark currently running; runN() resets it.
+static int64 bytes;  // value passed to SetBenchmarkBytesProcessed()
+static int64 ns;     // timed nanoseconds accumulated so far
+static int64 t0;     // nsec() reading when timing started; 0 while stopped
+static int64 items;  // value passed to SetBenchmarkItemsProcessed()
+
+// Records the number of bytes processed by the current run; RunBench()
+// uses it to print an MB/s figure when it is positive.
+void SetBenchmarkBytesProcessed(long long x) {
+	bytes = x;
+}
+
+// Pauses the timer: adds the time elapsed since timing started to ns and
+// marks the timer as stopped.  Does nothing if it is already stopped.
+void StopBenchmarkTiming() {
+	if(t0 == 0)
+		return;  // already stopped; t0 is 0 as required
+	ns += nsec() - t0;
+	t0 = 0;
+}
+
+// Resumes the timer by recording the current time in t0.  Does nothing
+// if the timer is already running (t0 != 0).
+void StartBenchmarkTiming() {
+	if(t0 != 0)
+		return;
+	t0 = nsec();
+}
+
+// Records the number of items processed by the current run in the
+// file-level counter `items`.
+void SetBenchmarkItemsProcessed(int n) {
+	items = n;
+}
+
+// Placeholder: currently does nothing.
+void BenchmarkMemoryUsage() {
+	// TODO(rsc): Implement.
+}
+
+// Always reports a single CPU; no detection is performed.
+int NumCPUs() {
+	return 1;
+}
+
+// Runs benchmark b once with iteration count n (and size siz for ranged
+// benchmarks), leaving the elapsed timed nanoseconds in ns.  Exits with
+// status 2 if b has neither callback set.
+static void runN(Benchmark *b, int n, int siz) {
+	// Start from a clean measurement with the timer running.
+	bytes = 0;
+	items = 0;
+	ns = 0;
+	t0 = nsec();
+
+	if(b->fn != 0) {
+		b->fn(n);
+	} else if(b->fnr != 0) {
+		b->fnr(n, siz);
+	} else {
+		fprintf(stderr, "%s: missing function\n", b->name);
+		exit(2);
+	}
+
+	// The benchmark may have stopped the timer itself (t0 == 0).
+	if(t0 == 0)
+		return;
+	ns += nsec() - t0;
+}
+
+// Rounds an iteration count up to a "nice" number.  With base the smallest
+// power of ten such that base*10 >= n, returns 2*base if n < 2*base,
+// 5*base if n < 5*base, and 10*base otherwise (so round(11) == 20,
+// round(20) == 50, round(100) == 100).
+//
+// Intermediate values are held in long long because base*10 and 5*base
+// exceed the range of int for n above 1e9, which RunBench() can pass
+// (up to 1.5e9).  A result that does not fit in int is clamped to the
+// largest int value.
+static int round(int n) {
+	long long base = 1;
+
+	while(base*10 < n)
+		base *= 10;
+
+	long long rounded;
+	if(n < 2*base)
+		rounded = 2*base;
+	else if(n < 5*base)
+		rounded = 5*base;
+	else
+		rounded = 10*base;
+
+	const long long kIntMax = (long long)(~0u >> 1);
+	if(rounded > kIntMax)
+		rounded = kIntMax;
+	return (int)rounded;
+}
+
+// Runs benchmark b at size siz, increasing the iteration count until a run
+// takes at least one second (or the count reaches 1e9), then prints one
+// result line: name[/size], iterations, ns/op and, if bytes were reported,
+// MB/s.  Only nthread == 1 is supported; other values return immediately.
+void RunBench(Benchmark* b, int nthread, int siz) {
+	int n, last;
+
+	// TODO(rsc): Threaded benchmarks.
+	if(nthread != 1)
+		return;
+
+	// run once in case it's expensive
+	n = 1;
+	runN(b, n, siz);
+	while(ns < (int)1e9 && n < (int)1e9) {
+		last = n;
+		// Estimate how many iterations fit in one second.
+		if(ns/n == 0)
+			n = 1e9;
+		else
+			n = 1e9 / (ns/n);
+
+		// Next count: 1.5x the estimate, but at most 100x and at least
+		// one more than the previous count.  Computed in long long
+		// because 100*last does not fit in int once last exceeds
+		// roughly 2.1e7; the final value is at most 1.5e9 and fits.
+		long long grown = (long long)n + n/2;
+		const long long cap = 100LL * last;
+		if(grown > cap)
+			grown = cap;
+		if(grown < (long long)last + 1)
+			grown = (long long)last + 1;
+		n = round((int)grown);
+		runN(b, n, siz);
+	}
+
+	char mb[100];
+	char suf[100];
+	mb[0] = '\0';
+	suf[0] = '\0';
+	if(ns > 0 && bytes > 0)
+		snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
+	// Ranged benchmarks get a /size suffix, scaled to K or M when possible.
+	if(b->fnr || b->lo != b->hi) {
+		if(siz >= (1<<20))
+			snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
+		else if(siz >= (1<<10))
+			snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
+		else
+			snprintf(suf, sizeof suf, "/%d", siz);
+	}
+	printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
+	fflush(stdout);
+}
+
+// Returns 1 if the benchmark called name should run: either no patterns
+// were given (argc == 1) or name partially matches one of argv[1..].
+// Returns 0 otherwise.
+static int match(const char* name, int argc, const char** argv) {
+	bool selected = (argc == 1);
+	// Stop at the first pattern that matches.
+	for(int i = 1; !selected && i < argc; i++)
+		selected = RE2::PartialMatch(name, argv[i]);
+	return selected ? 1 : 0;
+}
+
+// Runs every registered benchmark selected by the command-line patterns,
+// once per thread count in [threadlo, threadhi] and per size, doubling
+// the size from max(lo, 1) up to max(hi, 1).
+int main(int argc, const char** argv) {
+	for(int idx = 0; idx < nbenchmarks; idx++) {
+		Benchmark* bench = benchmarks[idx];
+		if(!match(bench->name, argc, argv))
+			continue;
+		for(int threads = bench->threadlo; threads <= bench->threadhi; threads++) {
+			for(int size = max(bench->lo, 1); size <= max(bench->hi, 1); size <<= 1)
+				RunBench(bench, threads, size);
+		}
+	}
+	return 0;
+}
+
--- a/re2/util/benchmark.h
+++ b/re2/util/benchmark.h
@ -0,0 +1,41 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_BENCHMARK_H__
+#define RE2_UTIL_BENCHMARK_H__
+
+namespace testing {
+// Description of one benchmark.  Constructing a Benchmark registers it.
+// Exactly one of fn / fnr is set by the constructors.
+struct Benchmark {
+  const char* name;        // name printed in the results
+  void (*fn)(int);         // plain benchmark: fn(iterations)
+  void (*fnr)(int, int);   // ranged benchmark: fnr(iterations, size)
+  int lo;                  // smallest size for ranged benchmarks
+  int hi;                  // largest size for ranged benchmarks
+  int threadlo;            // first thread count to run with
+  int threadhi;            // last thread count to run with
+
+  void Register();
+
+  // Plain benchmark.
+  Benchmark(const char* name, void (*f)(int)) {
+    Clear(name);
+    fn = f;
+    Register();
+  }
+
+  // Ranged benchmark run with sizes from l to h.
+  Benchmark(const char* name, void (*f)(int, int), int l, int h) {
+    Clear(name);
+    fnr = f;
+    lo = l;
+    hi = h;
+    Register();
+  }
+
+  // Stores the name and zeroes every other field.
+  void Clear(const char* n) {
+    name = n;
+    fn = 0;
+    fnr = 0;
+    lo = 0;
+    hi = 0;
+    threadlo = 0;
+    threadhi = 0;
+  }
+
+  // Sets the thread-count range and returns this so the call can be
+  // chained onto the BENCHMARK macros.  The range defaults to [0, 0].
+  Benchmark* ThreadRange(int lo, int hi) {
+    threadlo = lo;
+    threadhi = hi;
+    return this;
+  }
+};
+}  // namespace testing
+
+// Hooks a benchmark function may call while it runs (defined in
+// benchmark.cc).
+void SetBenchmarkBytesProcessed(long long);  // bytes handled, for MB/s output
+void StopBenchmarkTiming();                  // pause the timer
+void StartBenchmarkTiming();                 // resume the timer
+void BenchmarkMemoryUsage();                 // currently a no-op
+void SetBenchmarkItemsProcessed(int);        // items handled
+
+// Currently always returns 1.
+int NumCPUs();
+
+// Defines a global pointer _benchmark_<f> to a heap-allocated Benchmark
+// named #f that wraps f; the Benchmark constructor registers it.
+#define BENCHMARK(f) \
+	::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
+
+// Like BENCHMARK, but for a function taking (iterations, size) that is to
+// be run over the size range [lo, hi].
+#define BENCHMARK_RANGE(f, lo, hi) \
+	::testing::Benchmark* _benchmark_##f = \
+	(new ::testing::Benchmark(#f, f, lo, hi))
+
+#endif  // RE2_UTIL_BENCHMARK_H__
--- a/re2/util/flags.h
+++ b/re2/util/flags.h
@ -0,0 +1,27 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Simplified version of Google's command line flags.
+// Does not support parsing the command line.
+// If you want to do that, see
+// http://code.google.com/p/google-gflags
+
+#ifndef RE2_UTIL_FLAGS_H__
+#define RE2_UTIL_FLAGS_H__
+
+// Defines the variable re2::FLAGS_<name> of the given type, initialized to
+// deflt.  desc does not appear in the expansion.
+#define DEFINE_flag(type, name, deflt, desc) \
+	namespace re2 { type FLAGS_##name = deflt; }
+
+// Declares (extern) the variable re2::FLAGS_<name> that a matching
+// DEFINE_flag defines elsewhere.
+#define DECLARE_flag(type, name) \
+	namespace re2 { extern type FLAGS_##name; }
+
+// Typed convenience wrappers around DEFINE_flag for bool, int32 and string.
+#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
+#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
+#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
+
+// Typed convenience wrappers around DECLARE_flag for the same three types.
+#define DECLARE_bool(name) DECLARE_flag(bool, name)
+#define DECLARE_int32(name) DECLARE_flag(int32, name)
+#define DECLARE_string(name) DECLARE_flag(string, name)
+
+#endif  // RE2_UTIL_FLAGS_H__
--- a/re2/util/hash.cc
+++ b/re2/util/hash.cc
@ -0,0 +1,231 @@
+// Modified by Russ Cox to add "namespace re2".
+// Also threw away all but hashword and hashword2.
+// http://burtleburtle.net/bob/c/lookup3.c
+
+/*
+-------------------------------------------------------------------------------
+lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+These are functions for producing 32-bit hashes for hash table lookup.
+hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() 
+are externally useful functions.  Routines to test the hash are included 
+if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+the public domain.  It has no warranty.
+
+You probably want to use hashlittle().  hashlittle() and hashbig()
+hash byte arrays.  hashlittle() is faster than hashbig() on
+little-endian machines.  Intel and AMD are little-endian machines.
+On second thought, you probably want hashlittle2(), which is identical to
+hashlittle() except it returns two 32-bit hashes for the price of one.  
+You could implement hashbig2() if you wanted but I haven't bothered here.
+
+If you want to find a hash of, say, exactly 7 integers, do
+  a = i1;  b = i2;  c = i3;
+  mix(a,b,c);
+  a += i4; b += i5; c += i6;
+  mix(a,b,c);
+  a += i7;
+  final(a,b,c);
+then use c as the hash value.  If you have a variable length array of
+4-byte integers to hash, use hashword().  If you have a byte array (like
+a character string), use hashlittle().  If you have several byte arrays, or
+a mix of things, see the comments above hashlittle().  
+
+Why is this so big?  I read 12 bytes at a time into 3 4-byte integers, 
+then mix those integers.  This is fast (you can do a lot more thorough
+mixing with 12*3 instructions on 3 integers than you can with 3 instructions
+on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+-------------------------------------------------------------------------------
+*/
+
+#include "util/util.h"
+
+// Rotates x left by k bits, treating x as a 32-bit quantity.  Both
+// arguments are evaluated twice.
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or 
+  all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose 
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+// Statement macro: updates a, b and c in place, so all three arguments
+// must be modifiable lvalues.  Each is evaluated several times.
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or 
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+// Statement macro: updates a, b and c in place, so all three arguments
+// must be modifiable lvalues.  Each is evaluated several times.
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+namespace re2 {
+
+/*
+--------------------------------------------------------------------
+ This works on all machines.  To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hashword() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes.  hashlittle() is more complicated than hashword() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+--------------------------------------------------------------------
+*/
+uint32 hashword(
+const uint32 *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32        initval)         /* the previous hash, or an arbitrary value */
+{
+  /* All three lanes start from the same seed, derived from the length
+     and the caller's initial value. */
+  uint32_t c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+  uint32_t b = c;
+  uint32_t a = b;
+
+  /* Consume the key three words at a time while more than three remain. */
+  for (; length > 3; length -= 3, k += 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+  }
+
+  /* Between zero and three words are left.  Add whichever exist and run
+     the final mix; with nothing left the lanes are returned untouched. */
+  if (length != 0)
+  {
+    if (length == 3) c += k[2];
+    if (length >= 2) b += k[1];
+    a += k[0];
+    final(a,b,c);
+  }
+
+  /* c is the hash value. */
+  return c;
+}
+
+
+/*
+--------------------------------------------------------------------
+hashword2() -- same as hashword(), but take two seeds and return two
+32-bit values.  pc and pb must both be nonnull, and *pc and *pb must
+both be initialized with seeds.  If you pass in (*pb)==0, the output 
+(*pc) will be the same as the return value from hashword().
+--------------------------------------------------------------------
+*/
+void hashword2 (
+const uint32 *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32       *pc,                      /* IN: seed OUT: primary hash value */
+uint32       *pb)               /* IN: more seed OUT: secondary hash value */
+{
+  /* Seed the three lanes from the length and *pc, then fold the second
+     seed *pb into c only. */
+  const uint32_t seed = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
+  uint32_t a = seed;
+  uint32_t b = seed;
+  uint32_t c = seed;
+  c += *pb;
+
+  /* Mix in full groups of three words while more than three remain. */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    k += 3;
+    length -= 3;
+  }
+
+  /* Zero to three words remain.  Add the ones that exist, highest index
+     first, and run the final mix only if there was at least one. */
+  const size_t tail = length;
+  if (tail >= 3) c += k[2];
+  if (tail >= 2) b += k[1];
+  if (tail >= 1)
+  {
+    a += k[0];
+    final(a,b,c);
+  }
+
+  /* Report both results through the seed pointers. */
+  *pc = c;
+  *pb = b;
+}
+
+}  // namespace re2
--- a/re2/util/logging.h
+++ b/re2/util/logging.h
@ -0,0 +1,78 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Simplified version of Google's logging.
+
+#ifndef RE2_UTIL_LOGGING_H__
+#define RE2_UTIL_LOGGING_H__
+
+#include <unistd.h>  /* for write */
+#include <sstream>
+
+// Debug-only checking.
+// Each of these expands to a plain assert(), so they follow assert()'s
+// NDEBUG behavior and, unlike CHECK, cannot take a streamed message.
+#define DCHECK(condition) assert(condition)
+#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
+#define DCHECK_NE(val1, val2) assert((val1) != (val2))
+#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
+#define DCHECK_LT(val1, val2) assert((val1) < (val2))
+#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
+#define DCHECK_GT(val1, val2) assert((val1) > (val2))
+
+// Always-on checking
+// CHECK(x) does nothing when x is true; otherwise it builds a
+// LogMessageFatal starting with "Check failed: <x>" to which more text can
+// be appended with <<.  The if/else form keeps a following `else` in user
+// code from binding to the macro's `if`.
+#define CHECK(x)	if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
+// Comparison forms; each operand is parenthesized and the comparison is
+// passed to CHECK.
+#define CHECK_LT(x, y)	CHECK((x) < (y))
+#define CHECK_GT(x, y)	CHECK((x) > (y))
+#define CHECK_LE(x, y)	CHECK((x) <= (y))
+#define CHECK_GE(x, y)	CHECK((x) >= (y))
+#define CHECK_EQ(x, y)	CHECK((x) == (y))
+#define CHECK_NE(x, y)	CHECK((x) != (y))
+
+// Severity selectors used by LOG().  INFO, ERROR and WARNING all create
+// the same LogMessage; FATAL and QFATAL create a LogMessageFatal.
+#define LOG_INFO LogMessage(__FILE__, __LINE__)
+#define LOG_ERROR LOG_INFO
+#define LOG_WARNING LOG_INFO
+#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
+#define LOG_QFATAL LOG_FATAL
+
+// VLOG(x) logs at INFO severity only when x <= 0; for x > 0 the streamed
+// expression is skipped.
+#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
+
+// DEBUG_MODE is 0 when NDEBUG is defined and 1 otherwise.  LOG_DFATAL is
+// fatal only when NDEBUG is not defined; with NDEBUG it is LOG_ERROR.
+#ifdef NDEBUG
+#define DEBUG_MODE 0
+#define LOG_DFATAL LOG_ERROR
+#else
+#define DEBUG_MODE 1
+#define LOG_DFATAL LOG_FATAL
+#endif
+
+// LOG(severity) << ...: pastes the severity onto LOG_ to select one of the
+// macros above and returns that message's stream.
+#define LOG(severity) LOG_ ## severity.stream()
+
+// Collects one log line in memory and writes it to file descriptor 2,
+// either when Flush() is called or, failing that, on destruction.
+class LogMessage {
+ public:
+  // Starts the line with a "file:line: " prefix.
+  LogMessage(const char* file, int line) : flushed_(false) {
+    stream() << file << ":" << line << ": ";
+  }
+  // Appends a newline and writes the accumulated text to fd 2.
+  void Flush() {
+    stream() << "\n";
+    string s = str_.str();
+    if(write(2, s.data(), s.size()) < 0) {}  // shut up gcc
+    flushed_ = true;
+  }
+  // Writes the message unless Flush() has already done so.
+  ~LogMessage() {
+    if (!flushed_)
+      Flush();
+  }
+  // Stream the caller appends the message text to.
+  ostream& stream() { return str_; }
+
+ private:
+  bool flushed_;  // true once Flush() has written the message
+  std::ostringstream str_;
+  DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
+};
+
+// A LogMessage that aborts the process once the message has been written.
+class LogMessageFatal : public LogMessage {
+ public:
+  LogMessageFatal(const char* file, int line)
+    : LogMessage(file, line) { }
+  ~LogMessageFatal() {
+    // The base-class destructor only runs after this body returns, and
+    // abort() never returns, so the message has to be written explicitly
+    // here; otherwise it would be lost.
+    Flush();
+    abort();
+  }
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
+};
+
+#endif  // RE2_UTIL_LOGGING_H__
--- a/Show More
+++ b/Show More