Import Upstream version 0.14

This commit is contained in:
zhangyichun 2022-09-29 15:22:54 +08:00
commit f03dd0cdde
143 changed files with 47281 additions and 0 deletions

63
Changes Normal file
View File

@ -0,0 +1,63 @@
0.14 2021-01-31
- Lexical $_ has been removed (Perl 5.24) (PR #1, thanks Tim Heaney)
- Expose the NamedCapturingGroups method (PR #2, thanks rouzier)
- Fix build on macOS by defaulting RE2 to use C++11 ("tr1/unordered_set" isn't provided by the clang C++ library
anymore and it's 2021)
- Link to GitHub issue tracker as CPAN RT is going away.
0.13 2015-01-18
- Unbreak Windows build
- Up minimum perl to 5.12
0.12 2015-01-17
- Fix for `"" =~ {}` crashing
- Fix for building on perl >= 5.20 (RT #95144, thanks Tony C. for the patch)
- Fix build with -Werror=format-security (RT #96338)
0.11 2012-07-29
- Support named capture groups
- Support perl >= 5.17.1; add a nulled out op_comp to engine struct
0.10 2012-07-24
- Add missing compat-rx.h file
0.09 2012-04-01 (Brought to you from the 2012 QA Hackathon in Paris)
- Thread destruction fixes
0.08 2011-04-18
- Add files I forgot to add
0.07_01 2011-04-16
- Use cophh API
- Support -strict mode
0.07 2011-04-11
- RT #67192: Fix /s support
- Attempt to compile with -O3 as RE2 does
- Fix leak in possible_match_range
- Fix compilation on gcc 4.6 (RE2 issue 35)
0.06 2011-04-02
- RT #67153: Fix interpolation of RE2 into RE2
(qr// stringification included the x flag which RE2 doesn't support)
0.05 2011-02-06
- Allow setting of RE2's max_mem to control memory bound
- Improve documentation
0.04 2011-01-29
- Remove various UNIXisms from RE2, now builds under Win32/Strawberry
(still needs gmake installed, which comes with Strawberry)
0.03 2011-01-23
- Pass more options from MakeMaker to RE2, should now work on x86_64 again
- Run RE2's own test suite as part of build if we can
0.02 2011-01-22
- Use ExtUtils::CppGuess and try to find GNU make
0.01 2011-01-16
- Fixes for //g, captures, generally many things
0.01_01 2010-07-25
- Initial dev. version

143
MANIFEST Normal file
View File

@ -0,0 +1,143 @@
Changes
compat-cophh.h
compat-rx.h
lib/re/engine/RE2.pm
Makefile.PL
MANIFEST This list of files
MANIFEST.SKIP
ppport.h
RE2.xs
re2/.hgignore
re2/AUTHORS
re2/CONTRIBUTORS
re2/libre2.symbols
re2/libre2.symbols.darwin
re2/LICENSE
re2/Makefile
re2/re2/bitstate.cc
re2/re2/compile.cc
re2/re2/dfa.cc
re2/re2/filtered_re2.cc
re2/re2/filtered_re2.h
re2/re2/make_perl_groups.pl
re2/re2/make_unicode_casefold.py
re2/re2/make_unicode_groups.py
re2/re2/Makefile
re2/re2/mimics_pcre.cc
re2/re2/nfa.cc
re2/re2/onepass.cc
re2/re2/parse.cc
re2/re2/perl_groups.cc
re2/re2/prefilter.cc
re2/re2/prefilter.h
re2/re2/prefilter_tree.cc
re2/re2/prefilter_tree.h
re2/re2/prog.cc
re2/re2/prog.h
re2/re2/re2.cc
re2/re2/re2.h
re2/re2/regexp.cc
re2/re2/regexp.h
re2/re2/set.cc
re2/re2/set.h
re2/re2/simplify.cc
re2/re2/stringpiece.h
re2/re2/testing/backtrack.cc
re2/re2/testing/charclass_test.cc
re2/re2/testing/compile_test.cc
re2/re2/testing/dfa_test.cc
re2/re2/testing/dump.cc
re2/re2/testing/exhaustive1_test.cc
re2/re2/testing/exhaustive2_test.cc
re2/re2/testing/exhaustive3_test.cc
re2/re2/testing/exhaustive_test.cc
re2/re2/testing/exhaustive_tester.cc
re2/re2/testing/exhaustive_tester.h
re2/re2/testing/filtered_re2_test.cc
re2/re2/testing/mimics_pcre_test.cc
re2/re2/testing/null_walker.cc
re2/re2/testing/parse_test.cc
re2/re2/testing/possible_match_test.cc
re2/re2/testing/random_test.cc
re2/re2/testing/re2_arg_test.cc
re2/re2/testing/re2_test.cc
re2/re2/testing/regexp_benchmark.cc
re2/re2/testing/regexp_generator.cc
re2/re2/testing/regexp_generator.h
re2/re2/testing/regexp_test.cc
re2/re2/testing/required_prefix_test.cc
re2/re2/testing/search_test.cc
re2/re2/testing/set_test.cc
re2/re2/testing/simplify_test.cc
re2/re2/testing/string_generator.cc
re2/re2/testing/string_generator.h
re2/re2/testing/string_generator_test.cc
re2/re2/testing/tester.cc
re2/re2/testing/tester.h
re2/re2/testing/unicode_test.py
re2/re2/tostring.cc
re2/re2/unicode.py
re2/re2/unicode_casefold.cc
re2/re2/unicode_casefold.h
re2/re2/unicode_groups.cc
re2/re2/unicode_groups.h
re2/re2/variadic_function.h
re2/re2/walker-inl.h
re2/README
re2/runtests
re2/testinstall.cc
re2/util/arena.cc
re2/util/arena.h
re2/util/atomicops.h
re2/util/benchmark.cc
re2/util/benchmark.h
re2/util/flags.h
re2/util/hash.cc
re2/util/logging.h
re2/util/mutex.h
re2/util/pcre.cc
re2/util/pcre.h
re2/util/random.cc
re2/util/random.h
re2/util/rune.cc
re2/util/sparse_array.h
re2/util/sparse_array_test.cc
re2/util/sparse_set.h
re2/util/stringpiece.cc
re2/util/stringprintf.cc
re2/util/strutil.cc
re2/util/test.cc
re2/util/test.h
re2/util/thread.cc
re2/util/thread.h
re2/util/utf.h
re2/util/util.h
re2/util/valgrind.cc
re2/util/valgrind.h
re2_xs.cc
re2_xs.h
README
t/00.compile.t
t/00.re2-tests.t
t/01.basic.t
t/02.chars.t
t/03.modifiers.t
t/04.multiline.t
t/05.url.t
t/06.matchrange.t
t/07.utf8.t
t/08.pos.t
t/09.mem.t
t/10.options.t
t/ree-pcre/capture.t
t/ree-pcre/import.t
t/ree-pcre/match.t
t/ree-pcre/qr.t
t/ree-pcre/s.t
t/ree-pcre/split-null.t
t/ree-pcre/split.t
t/ree-pcre/subexp.t
t/ree-pcre/unimport.t
TODO
META.yml Module YAML meta-data (added by MakeMaker)
META.json Module JSON meta-data (added by MakeMaker)

46
MANIFEST.SKIP Normal file
View File

@ -0,0 +1,46 @@
#!start included /Users/dgl/.perl5/lib/perl5/ExtUtils/MANIFEST.SKIP
# Avoid version control files.
\bRCS\b
\bCVS\b
\bSCCS\b
,v$
\B\.svn\b
\B\.git\b
\B\.gitignore\b
\b_darcs\b
# Avoid Makemaker generated and utility files.
\bMANIFEST\.bak
^Makefile$
\bblib/
\bMakeMaker-\d
\bpm_to_blib\.ts$
\bpm_to_blib$
\bblibdirs\.ts$ # 6.18 through 6.25 generated this
# Avoid Module::Build generated and utility files.
\bBuild$
\b_build/
# Avoid temp and backup files.
~$
\.old$
\#$
\b\.#
\.bak$
# Avoid Devel::Cover files.
\bcover_db\b
#!end included /Users/dgl/.perl5/lib/perl5/ExtUtils/MANIFEST.SKIP
.*\.o$
.*\.c$
.*\.bs$
.*\.gz$
.*\.tar$
.*\.so$
MYMETA.yml
^misc/
^re2/obj/

43
META.json Normal file
View File

@ -0,0 +1,43 @@
{
"abstract" : "RE2 regex engine",
"author" : [
"David Leadbeater <dgl@dgl.cx>"
],
"dynamic_config" : 1,
"generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010",
"license" : [
"perl_5"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
"version" : 2
},
"name" : "re-engine-RE2",
"no_index" : {
"directory" : [
"t",
"inc"
]
},
"prereqs" : {
"build" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"configure" : {
"requires" : {
"ExtUtils::CppGuess" : "0",
"Test::More" : "0.88"
}
}
},
"release_status" : "stable",
"resources" : {
"bugtracker" : {
"web" : "https://github.com/dgl/re-engine-RE2/issues"
}
},
"version" : "0.14",
"x_serialization_backend" : "JSON::PP version 2.97001"
}

24
META.yml Normal file
View File

@ -0,0 +1,24 @@
---
abstract: 'RE2 regex engine'
author:
- 'David Leadbeater <dgl@dgl.cx>'
build_requires:
ExtUtils::MakeMaker: '0'
configure_requires:
ExtUtils::CppGuess: '0'
Test::More: '0.88'
dynamic_config: 1
generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010'
license: perl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: '1.4'
name: re-engine-RE2
no_index:
directory:
- t
- inc
resources:
bugtracker: https://github.com/dgl/re-engine-RE2/issues
version: '0.14'
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'

132
Makefile.PL Normal file
View File

@ -0,0 +1,132 @@
use 5.012;
use strict;
use warnings;
use Config;
use ExtUtils::MakeMaker;
use ExtUtils::CppGuess;
# TODO: Optionally use system libre2, via ExtUtils::Liblist?
my @objects = qw(RE2.o re2_xs.o re2/obj/libre2.a);
my $guess = ExtUtils::CppGuess->new;
my %opt = (
NAME => 're::engine::RE2',
AUTHOR => 'David Leadbeater <dgl@dgl.cx>',
VERSION_FROM => 'lib/re/engine/RE2.pm',
ABSTRACT_FROM => 'lib/re/engine/RE2.pm',
LICENSE => 'perl',
INC => '-Ire2',
PMLIBDIRS => ["lib"],
OBJECT => join(" ", @objects),
test => {TESTS => 't/*.t t/ree-pcre/*.t'},
CONFIGURE_REQUIRES => {
"ExtUtils::CppGuess" => 0,
"Test::More" => 0.88,
},
$guess->makemaker_options
);
if(eval { ExtUtils::MakeMaker->VERSION(6.46) }) {
$opt{META_MERGE} = {
'meta-spec' => { version => 2 },
resources => {
repository => 'https://github.com/dgl/re-engine-RE2',
bugtracker => {
web => 'https://github.com/dgl/re-engine-RE2/issues',
}
}
}
}
# If the user didn't explicitly provide optimisation settings, we'll try to do
# it ourselves, but only for gcc.
my $cc = (map +(/^CC=(.*)/i), @ARGV)[0] || $Config{cc};
if(!grep(/^OPTIMIZE=/i, @ARGV)
and my $gcc_version = gcc_version($cc)) {
say "Compiling on gcc $gcc_version";
my $optimize = $Config{optimize};
if($gcc_version) {
$optimize =~ s/-O[s0-2]/-O3/ and say "Optimize level set to -O3";
}
# Attempt to work out if we have a gcc that is likely to support -flto.
# This is probably a lot of work for a minimal gain, but it's worth a try.
if($gcc_version >= 4.5) {
my $try_optimize = "$optimize -flto";
# Try to use this flag
if(gcc_try(cc => $cc, %opt, OPTIMIZE => $try_optimize)) {
$optimize = $try_optimize;
}
# gcc 4.9 needs this otherwise it gets rid of nearly everything in libre2.a.
$try_optimize = "$optimize -ffat-lto-objects";
if(gcc_try(cc => $cc, %opt, OPTIMIZE => $try_optimize)) {
$optimize = $try_optimize;
}
}
say "OPTIMIZE is now: $optimize";
$opt{OPTIMIZE} = $optimize;
}
if(defined $Config{usethreads} && $Config{usethreads} eq 'define') {
if(defined $Config{i_pthread} && $Config{i_pthread} eq 'define') {
$opt{DEFINE} = "-DHAVE_PTHREAD -pthread";
} else {
# For now this allows compilation under Win32/Strawberry, but might cause weird crashes on thread
# destruction...
$opt{DEFINE} = "-DNO_THREADS";
}
} else {
$opt{DEFINE} = "-DNO_THREADS";
}
# This is a bit hacky, RE2 makefile needs GNU make, for now we'll try to find
# it, ideally should rewrite the RE2 makefile to not need GNU make.
our $MAKE;
for my $make(qw(make gmake)) {
if(qx{$make --version 2>&1} =~ /GNU Make/i) {
$MAKE = $make;
last;
}
}
if(!$MAKE) {
die "RE2 currently needs GNU Make, please install gmake.\n";
}
WriteMakefile(%opt);
sub gcc_version {
my($cc) = @_;
my $gcc_out = qx{$cc -v 2>&1};
# Just the first two digits
$gcc_out =~ /gcc version (\d+\.\d+)/ ? $1 : 0;
}
# This is highly gcc and unix specific, but that's where I care about
# optimising this anyway.
sub gcc_try {
my(%opts) = @_;
system "$opts{cc} $opts{CCFLAGS} $opts{OPTIMIZE} -c -o /dev/null /dev/null >/dev/null 2>&1";
not $?;
}
sub MY::postamble {
return <<MAKE_FRAG;
RE2_FLAGS = CC="\$(CC)" CXXFLAGS="\$(CCFLAGS) \$(CCCDLFLAGS) \$(OPTIMIZE) \$(DEFINE) -DUSE_CXX0X" LDFLAGS="\$(OTHERLDFLAGS) \$(LDLOADLIBS)"
re2/obj/libre2.a: re2/Makefile
$MAKE -C re2 obj/libre2.a \$(RE2_FLAGS)
re2-tests:
$MAKE -C re2 static-test \$(RE2_FLAGS) LDFLAGS="\$(OTHERLDFLAGS) \$(LDLOADLIBS) -lm -lpthread"
MAKE_FRAG
}

52
RE2.xs Normal file
View File

@ -0,0 +1,52 @@
#include "re2_xs.h"
#include "ppport.h"
MODULE = re::engine::RE2 PACKAGE = re::engine::RE2
PROTOTYPES: ENABLE
void
ENGINE(...)
PROTOTYPE:
PPCODE:
XPUSHs(sv_2mortal(newSViv(PTR2IV(&re2_engine))));
# Use a typemap for this maybe, especially if we add more methods like it!
void
possible_match_range(SV *self, STRLEN len = 10)
PROTOTYPE:
PPCODE:
REGEXP* rx;
SV *possible_min, *possible_max;
if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
croak("qr// reference to a re::engine::RE2 instance required");
rx = SvRX(self);
RE2_possible_match_range(aTHX_ rx, len, &possible_min, &possible_max);
mXPUSHs(possible_min);
mXPUSHs(possible_max);
HV*
named_captures(SV *self)
PROTOTYPE:
CODE:
REGEXP* rx;
if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
croak("qr// reference to a re::engine::RE2 instance required");
rx = SvRX(self);
RETVAL = RE2_named_captures(aTHX_ rx);
OUTPUT:
RETVAL
int
number_of_capture_groups(SV *self)
PROTOTYPE:
CODE:
REGEXP* rx;
if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
croak("qr// reference to a re::engine::RE2 instance required");
rx = SvRX(self);
RETVAL = RE2_number_of_capture_groups(aTHX_ rx);
OUTPUT:
RETVAL

145
README Normal file
View File

@ -0,0 +1,145 @@
NAME
re::engine::RE2 - RE2 regex engine
SYNOPSIS
use re::engine::RE2;
if ("Hello, world" =~ /Hello, (world)/) {
print "Greetings, $1!";
}
DESCRIPTION
This module replaces perl's regex engine in a given lexical scope with
RE2.
RE2 is a primarily DFA based regexp engine from Google that is very fast
at matching large amounts of text. However it does not support look
behind and some other Perl regular expression features. See
http://code.google.com/p/re2 for more information.
Fallback to normal Perl regexp is implemented by this module. If RE2 is
unable to compile a regexp it will use Perl instead, therefore features
not implemented by RE2 don't suddenly stop working, they will just use
Perl's regexp implementation.
METHODS
To access extra functionality of RE2 methods can be called on a compiled
regular expression (i.e. a "qr//").
* "possible_match_range([length = 10])"
Returns an array of two strings: where the expression will start
matching and just after where it will finish matching. See RE2's
documentation on PossibleMatchRange for further details.
Example:
my($min, $max) = qr/^(a|b)/->possible_match_range;
is $min, 'a';
is $max, 'c';'
PERFORMANCE
Performance is really the primary reason for using RE2, so here's some
benchmarks. Like any benchmark take them with a pinch of salt.
Simple matching
my $foo = "foo bar baz";
$foo =~ /foo/;
$foo =~ /foox/;
On this very simple match RE2 is actually slower:
Rate re2 re
re2 674634/s -- -76%
re 2765739/s 310% --
URL matching
Matching "m{([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^
@]+)}" against a several KB file:
Rate re re2
re 35.2/s -- -99%
re2 2511/s 7037% --
Many alternatives
Matching a string against a regexp with 17,576 alternatives ("aaa ..
zzz").
This uses trie matching on Perl (obviously RE2 does similar by default).
$ perl misc/altern.pl
Rate re re2
re 52631/s -- -91%
re2 554938/s 954% --
NOTES
* No support for "m//x"
The "/x" modifier is not supported. (There's no particular reason
for this, just RE2 itself doesn't support it). Fallback to Perl
regexp will happen automatically if "//x" is used.
* "re2/dfa.cc:447: DFA out of memory: prog size xxx mem yyy"
If you attempt to compile a really large regular expression you may
get this error. RE2 has an internal limit on memory consumption for
the DFA state tables. By default this is 8 MiB.
If you need to increase this size then use the max_mem parameter:
use re::engine::RE2 -max_mem => 8<<23; # 64MiB
* How do I tell if RE2 will be used?
See if your regexp is matching quickly or slowly ;).
Alternatively normal OO concepts apply and you may examine the
object returned by "qr//":
use re::engine::RE2;
ok qr/foo/->isa("re::engine::RE2");
# Perl Regexp used instead
ok not qr/(?<=foo)bar/->isa("re::engine::RE2");
BUGS
Known issues:
* Unicode handling
Currently the Unicode handling of re::engine::RE2 does not fully
match Perl's behaviour.
The UTF-8 flag of the regexp currently determines how the string is
matched. This is obviously broken, so will be fixed at some point.
* Final newline matching differs to Perl
"\n" =~ /$/
The above is true in Perl, false in RE2. To work around the issue
you can write "\n?\z" when you mean Perl's "$".
Please report bugs or provide patches at
<https://github.com/dgl/re-engine-RE2>.
AUTHORS
David Leadbeater <dgl[at]dgl[dot]cx>
COPYRIGHT
Copyright 2010 David Leadbeater.
Based on re::engine::PCRE:
Copyright 2007 Ævar Arnfjörð Bjarmason.
The original version was copyright 2006 Audrey Tang <cpan@audreyt.org>
and Yves Orton.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
(However the bundled copy of RE2 has a different copyright owner and is
under a BSD-like license, see re2/LICENSE.)

40
TODO Normal file
View File

@ -0,0 +1,40 @@
# -*- mode: org -*-
* Fix UTF-8 support
This turns out to be harder than I was thinking. The first step is to compile
two versions of the regexp, one for matching UTF-8 and one for matching
Latin1 (maybe on demand).
RE2 won't accept \x{...} escapes that are greater than the current character
set. I was hoping it would be possible to give a string containing these to
RE2 then let RE2 realise part of it won't match (e.g. (?:foo|\x{1234}) will
still match foo, even if the input string isn't UTF-8).
(I'm only talking about \x{...}; this is the only case I have to
care about, \p{...} *are* accepted by RE2 regardless. Due to Perl's
behaviour we can't have raw UTF-8 in the string if the UTF-8 flag
isn't on.)
The approach for now will probably be to replace \x{nnn} in strings (where
nnn>0xFF) with something that won't match (maybe [^\x00-\xff]), but allows
the other branches to match.
** Think about supporting perl 5.14's unicode regexp flags
At least at the top level, implementing within RE2 would be silly.
RE2 doesn't have all the behaviours perl does (i.e. /a is implied
for \d, etc.). Might just be a case of documenting what RE2 does,
once UTF-8 is working to some extent. An alternative could be to
make things explicit (e.g. you need to say "no feature
'unicode_strings'" if you happen to have enabled them to use RE2).
* Switch to dzil
* Support more options
** never_nl could be useful for cpangrep optimisations
* Support RE2::Set functionality
i.e. a Regexp::RE2::Set class that can have RE2 regexps added into it
then either a match method or maybe overload ~~?
* Improve tests
** See if t/re/re_tests from Perl can be used.
** Improve performance comparisons
See maybe https://github.com/axiak/pyre2/blob/master/tests/performance.py
* Support /x (probably needs RE2 changes to do properly)
* Both Perl and RE2 store the stringification of the regexp, can we avoid this?

10
compat-cophh.h Normal file
View File

@ -0,0 +1,10 @@
/* Compatibility for bits of the cophh API which was added in 5.13.7.
* This uses refcounted_he_* functions that are not part of the public perl
* API, therefore won't work on platforms with strict linkers (Windows, AIX).
*/
#if PERL_VERSION < 13 || (PERL_VERSION == 13 && PERL_SUBVERSION < 7)
#define cophh_fetch_pvs(cophh, key, flags) \
Perl_refcounted_he_fetch(aTHX_ cophh, NULL, key, sizeof(key) - 1, 0, flags)
#endif

7
compat-rx.h Normal file
View File

@ -0,0 +1,7 @@
/* Compatibility for RX_* macros added around 5.10.1. */
#ifndef RX_WRAPPED
#define RX_WRAPPED(prog) ((prog)->wrapped)
#define RX_WRAPLEN(prog) ((prog)->wraplen)
#endif

270
lib/re/engine/RE2.pm Executable file
View File

@ -0,0 +1,270 @@
package re::engine::RE2;
use 5.012;
BEGIN {
$re::engine::RE2::VERSION = "0.14";
}
use XSLoader ();
# All engines should subclass the core Regexp package
our @ISA = 'Regexp';
BEGIN
{
XSLoader::load __PACKAGE__, $re::engine::RE2::VERSION;
}
sub import
{
my $class = shift;
$^H{regcomp} = ENGINE;
if (@_) {
my %args = @_;
if (exists $args{"-max_mem"}) {
$^H{__PACKAGE__ . "::max-mem"} = $args{"-max_mem"};
}
if (exists $args{"-strict"}) {
$^H{__PACKAGE__ . "::strict"} = $args{"-strict"};
}
if (exists $args{"-longest_match"}) {
$^H{__PACKAGE__ . "::longest-match"} = $args{"-longest_match"};
}
if (exists $args{"-never_nl"}) {
$^H{__PACKAGE__ . "::never-nl"} = $args{"-never_nl"};
}
}
}
sub unimport
{
delete $^H{regcomp}
if $^H{regcomp} == ENGINE;
}
1;
__END__
=encoding utf8
=head1 NAME
re::engine::RE2 - RE2 regex engine
=head1 SYNOPSIS
use re::engine::RE2;
if ("Hello, world" =~ /Hello, (world)/) {
print "Greetings, $1!";
}
=head1 DESCRIPTION
This module replaces perl's regex engine in a given lexical scope with RE2.
RE2 is a primarily DFA based regexp engine from Google that is very fast at
matching large amounts of text. However it does not support look behind and
some other Perl regular expression features. See
L<RE2's website|http://code.google.com/p/re2> for more information.
Fallback to normal Perl regexp is implemented by this module. If RE2 is unable
to compile a regexp it will use Perl instead, therefore features not
implemented by RE2 don't suddenly stop working, they will just use Perl's
regexp implementation.
=head1 METHODS
To access extra functionality of RE2 methods can be called on a compiled
regular expression (i.e. a C<qr//>).
=over 4
=item * C<possible_match_range([length = 10])>
Returns an array of two strings: where the expression will start matching and
just after where it will finish matching. See RE2's documentation on
PossibleMatchRange for further details.
Example:
my($min, $max) = qr/^(a|b)/->possible_match_range;
is $min, 'a';
is $max, 'c';'
=item * C<named_captures()>
Returns a hash of the name captures and index.
Example:
my $named_captures = qr/(?P<a>\w+) (?P<d>\w+)/->named_captures;
is $named_captures->{a}, 1;
is $named_captures->{d}, 2;
=item * C<number_of_capture_groups()>
Return number of capture groups
Example:
my $captures = qr/(Hello), (world)/->number_of_capture_groups;
is $captures, 2;
=back
=head1 PRAGMA OPTIONS
Various options can be set by providing options to the C<use> line. These will
be pragma scoped.
=over 4
=item * C<< -max_mem => 1<<24 >>
Configure RE2's memory limit.
=item * C<< -strict => 1 >>
Be strict, i.e. don't allow regexps that are not supported by RE2.
=item * C<< -longest_match => 1 >>
Match on the longest match in alternations. For example with this option set
matching C<"abc"> against C<(a|abc)> will match C<"abc">, without depending on
order.
=item * C<< -never_nl => 1 >>
Never match a newline (C<"\n">) even if the provided regexp contains it.
=back
=head1 PERFORMANCE
Performance is really the primary reason for using RE2, so here's some
benchmarks. Like any benchmark take them with a pinch of salt.
=head2 Simple matching
my $foo = "foo bar baz";
$foo =~ /foo/;
$foo =~ /foox/;
On this very simple match RE2 is actually slower:
Rate re2 re
re2 674634/s -- -76%
re 2765739/s 310% --
=head2 URL matching
Matching C<m{([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^
@]+)}> against a several KB file:
Rate re re2
re 35.2/s -- -99%
re2 2511/s 7037% --
=head2 Many alternatives
Matching a string against a regexp with 17,576 alternatives (C<aaa .. zzz>).
This uses trie matching on Perl (obviously RE2 does similar by default).
$ perl misc/altern.pl
Rate re re2
re 52631/s -- -91%
re2 554938/s 954% --
=head1 NOTES
=over 4
=item * No support for C<m//x>
The C</x> modifier is not supported. (There's no particular reason for this,
just RE2 itself doesn't support it). Fallback to Perl regexp will happen
automatically if C<//x> is used.
=item * "re2/dfa.cc:447: DFA out of memory: prog size xxx mem yyy"
If you attempt to compile a really large regular expression you may get this
error. RE2 has an internal limit on memory consumption for the DFA state
tables. By default this is 8 MiB.
If you need to increase this size then use the max_mem parameter:
use re::engine::RE2 -max_mem => 8<<23; # 64MiB
=item * How do I tell if RE2 will be used?
See if your regexp is matching quickly or slowly ;).
Alternatively normal OO concepts apply and you may examine the object returned
by C<qr//>:
use re::engine::RE2;
ok qr/foo/->isa("re::engine::RE2");
# Perl Regexp used instead
ok not qr/(?<=foo)bar/->isa("re::engine::RE2");
If you wish to force RE2, use the C<-strict> option.
=back
=head1 BUGS
Known issues:
=over 4
=item * Unicode handling
Currently the Unicode handling of re::engine::RE2 does not fully match Perl's
behaviour.
The UTF-8 flag of the regexp currently determines how the string is matched.
This is obviously broken, so will be fixed at some point.
=item * Final newline matching differs to Perl
"\n" =~ /$/
The above is true in Perl, false in RE2. To work around the issue you can write
C<\n?\z> when you mean Perl's C<$>.
=back
Please report bugs or provide patches at <https://github.com/dgl/re-engine-RE2>.
=head1 AUTHORS
David Leadbeater E<lt>dgl[at]dgl[dot]cxE<gt>
=head1 COPYRIGHT
Copyright 2010 David Leadbeater.
Based on L<re::engine::PCRE>:
Copyright 2007 E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason.
The original version was copyright 2006 Audrey Tang
E<lt>cpan@audreyt.orgE<gt> and Yves Orton.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
(However the bundled copy of RE2 has a different copyright owner and is under a
BSD-like license, see F<re2/LICENSE>.)
=cut

7063
ppport.h Normal file

File diff suppressed because it is too large Load Diff

7
re2/.hgignore Normal file
View File

@ -0,0 +1,7 @@
syntax:glob
*.pyc
*.orig
core
syntax:regexp
^obj/

12
re2/AUTHORS Normal file
View File

@ -0,0 +1,12 @@
# This is the official list of RE2 authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Google Inc.
Stefano Rivera <stefano.rivera@gmail.com>

33
re2/CONTRIBUTORS Normal file
View File

@ -0,0 +1,33 @@
# This is the official list of people who can contribute
# (and typically have contributed) code to the RE2 repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Rob Pike <r@google.com>
Russ Cox <rsc@swtch.com>
Sanjay Ghemawat <sanjay@google.com>
Stefano Rivera <stefano.rivera@gmail.com>
Srinivasan Venkatachary <vsri@google.com>

27
re2/LICENSE Normal file
View File

@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

287
re2/Makefile Normal file
View File

@ -0,0 +1,287 @@
# Copyright 2009 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
all: obj/libre2.a obj/so/libre2.so
# to build against PCRE for testing or benchmarking,
# uncomment the next two lines
# CCPCRE=-I/usr/local/include -DUSEPCRE
# LDPCRE=-L/usr/local/lib -lpcre
#CC=g++
#CXXFLAGS=-Wall -O3 -g -pthread # can override
RE2_CXXFLAGS=-Wno-sign-compare -c -I. $(CCPCRE) # required
#LDFLAGS=-pthread
AR=ar
ARFLAGS=rsc
NM=nm
NMFLAGS=-p
# Variables mandated by GNU, the arbiter of all good taste on the internet.
# http://www.gnu.org/prep/standards/standards.html
prefix=/usr/local
exec_prefix=$(prefix)
bindir=$(exec_prefix)/bin
includedir=$(prefix)/include
libdir=$(exec_prefix)/lib
INSTALL=install
INSTALL_PROGRAM=$(INSTALL)
INSTALL_DATA=$(INSTALL) -m 644
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
SONAME=0
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
# access for Unicode data), uncomment the following line:
# REBUILD_TABLES=1
#ifeq ($(shell uname),Darwin)
#MAKE_SHARED_LIBRARY=g++ -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
#else
#MAKE_SHARED_LIBRARY=g++ -shared -Wl,-soname,libre2.so.0,--version-script=libre2.symbols $(LDFLAGS)
#endif
INSTALL_HFILES=\
re2/re2.h\
re2/set.h\
re2/stringpiece.h\
re2/variadic_function.h\
HFILES=\
util/arena.h\
util/atomicops.h\
util/benchmark.h\
util/flags.h\
util/logging.h\
util/mutex.h\
util/pcre.h\
util/random.h\
util/sparse_array.h\
util/sparse_set.h\
util/test.h\
util/utf.h\
util/util.h\
util/valgrind.h\
re2/filtered_re2.h\
re2/prefilter.h\
re2/prefilter_tree.h\
re2/prog.h\
re2/re2.h\
re2/regexp.h\
re2/set.h\
re2/stringpiece.h\
re2/testing/exhaustive_tester.h\
re2/testing/regexp_generator.h\
re2/testing/string_generator.h\
re2/testing/tester.h\
re2/unicode_casefold.h\
re2/unicode_groups.h\
re2/variadic_function.h\
re2/walker-inl.h\
OFILES=\
obj/util/arena.o\
obj/util/hash.o\
obj/util/rune.o\
obj/util/stringpiece.o\
obj/util/stringprintf.o\
obj/util/strutil.o\
obj/util/valgrind.o\
obj/re2/bitstate.o\
obj/re2/compile.o\
obj/re2/dfa.o\
obj/re2/filtered_re2.o\
obj/re2/mimics_pcre.o\
obj/re2/nfa.o\
obj/re2/onepass.o\
obj/re2/parse.o\
obj/re2/perl_groups.o\
obj/re2/prefilter.o\
obj/re2/prefilter_tree.o\
obj/re2/prog.o\
obj/re2/re2.o\
obj/re2/regexp.o\
obj/re2/set.o\
obj/re2/simplify.o\
obj/re2/tostring.o\
obj/re2/unicode_casefold.o\
obj/re2/unicode_groups.o\
TESTOFILES=\
obj/util/pcre.o\
obj/util/random.o\
obj/util/thread.o\
obj/re2/testing/backtrack.o\
obj/re2/testing/dump.o\
obj/re2/testing/exhaustive_tester.o\
obj/re2/testing/null_walker.o\
obj/re2/testing/regexp_generator.o\
obj/re2/testing/string_generator.o\
obj/re2/testing/tester.o\
TESTS=\
obj/test/charclass_test\
obj/test/compile_test\
obj/test/filtered_re2_test\
obj/test/mimics_pcre_test\
obj/test/parse_test\
obj/test/possible_match_test\
obj/test/re2_test\
obj/test/re2_arg_test\
obj/test/regexp_test\
obj/test/required_prefix_test\
obj/test/search_test\
obj/test/set_test\
obj/test/simplify_test\
obj/test/string_generator_test\
BIGTESTS=\
obj/test/dfa_test\
obj/test/exhaustive1_test\
obj/test/exhaustive2_test\
obj/test/exhaustive3_test\
obj/test/exhaustive_test\
obj/test/random_test\
SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES))
STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
obj:
mkdir $@
obj/re2: obj
cd obj && mkdir re2 || echo Okay
obj/util: obj
cd obj && mkdir util || echo Okay
obj/test: obj
cd obj && mkdir test || echo Okay
obj/re2/testing: obj/re2
cd obj/re2 && mkdir testing || echo Okay
obj/%.o: obj/re2 obj/re2/testing obj/util %.cc $(HFILES)
$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
obj/dbg/%.o: obj/dbg %.cc $(HFILES)
$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
obj/so/%.o: obj/so %.cc $(HFILES)
$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
obj/%.o: obj %.c $(HFILES)
$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
obj/dbg/%.o: obj/dbg %.c $(HFILES)
$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) $*.c
obj/so/%.o: obj/so %.c $(HFILES)
$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
obj/libre2.a: $(OFILES)
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
obj/dbg/libre2.a: obj/dbg $(DOFILES)
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
obj/so/libre2.so: obj/so $(SOFILES)
$(MAKE_SHARED_LIBRARY) -o $@.0 $(SOFILES)
ln -sf libre2.so.0 $@
obj/test/%: obj/test obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
$(CC) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
obj/dbg/test/%: obj/dbg/test obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
$(CC) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
$(CC) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
obj/test/regexp_benchmark: obj/test obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
$(CC) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
ifdef REBUILD_TABLES
re2/perl_groups.cc: re2/make_perl_groups.pl
perl $< > $@
re2/unicode_%.cc: re2/make_unicode_%.py
python $< > $@
endif
distclean: clean
rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
clean:
rm -rf obj
rm -f re2/*.pyc
testofiles: $(TESTOFILES)
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
debug-test: $(DTESTS)
@echo
@echo Running debug binary tests.
@echo
@./runtests $(DTESTS)
static-test: $(TESTS)
@echo
@echo Running static binary tests.
@echo
@./runtests $(TESTS)
shared-test: $(STESTS)
@echo
@echo Running dynamic binary tests.
@echo
@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS)
debug-bigtest: $(DTESTS) $(DBIGTESTS)
@./runtests $(DTESTS) $(DBIGTESTS)
static-bigtest: $(TESTS) $(BIGTESTS)
@./runtests $(TESTS) $(BIGTESTS)
shared-bigtest: $(STESTS) $(SBIGTESTS)
@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS)
benchmark: obj/test/regexp_benchmark
install: obj/libre2.a obj/so/libre2.so.0
mkdir -p $(DESTDIR)$(includedir)/re2
$(INSTALL_DATA) $(DESTDIR)$(INSTALL_HFILES) $(includedir)/re2
$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
$(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so
testinstall:
@mkdir -p obj
cp testinstall.cc obj
(cd obj && g++ -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
benchlog: obj/test/regexp_benchmark
(echo '==BENCHMARK==' `hostname` `date`; \
(uname -a; g++ --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
echo; \
./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
# Keep gmake from deleting intermediate files it creates.
# This makes repeated builds faster and preserves debug info on OS X.
.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
obj/dbg/libre2.a obj/so/libre2.a \
obj/test/% obj/so/test/% obj/dbg/test/%

19
re2/README Normal file
View File

@ -0,0 +1,19 @@
This is the source code repository for RE2, a regular expression library.
For documentation about how to install and use RE2,
visit http://code.google.com/p/re2/.
The short version is:
make
make test
make install
make testinstall
Unless otherwise noted, the RE2 source files are distributed
under the BSD-style license found in the LICENSE file.
RE2's native language is C++.
An Inferno wrapper is at http://code.google.com/p/inferno-re2/.
A Python wrapper is at http://github.com/facebook/pyre2/.
A Ruby wrapper is at http://github.com/axic/rre2/.

15
re2/libre2.symbols Normal file
View File

@ -0,0 +1,15 @@
{
global:
# re2::RE2*
_ZN3re23RE2*;
_ZNK3re23RE2*;
# re2::StringPiece*
_ZN3re211StringPiece*;
_ZNK3re211StringPiece*;
# operator==(re2::StringPiece const&, re2::StringPiece const&)
_ZeqRKN3re211StringPieceES2_;
# operator<<(std::ostream&, re2::StringPiece const&)
_ZlsRSoRKN3re211StringPieceE;
local:
*;
};

11
re2/libre2.symbols.darwin Normal file
View File

@ -0,0 +1,11 @@
# Linker doesn't like these unmangled:
# re2::RE2*
__ZN3re23RE2*
__ZNK3re23RE2*
# re2::StringPiece*
__ZN3re211StringPiece*
__ZNK3re211StringPiece*
# operator==(re2::StringPiece const&, re2::StringPiece const&)
__ZeqRKN3re211StringPieceES2_
# operator<<(std::ostream&, re2::StringPiece const&)
__ZlsRSoRKN3re211StringPieceE

1
re2/re2/Makefile Normal file
View File

@ -0,0 +1 @@

378
re2/re2/bitstate.cc Normal file
View File

@ -0,0 +1,378 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Like
// testing/backtrack.cc, it allocates a bit vector with (length of
// text) * (length of prog) bits, to make sure it never explores the
// same (character position, instruction) state multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
struct Job {
int id;
int arg;
const char* p;
};
class BitState {
public:
explicit BitState(Prog* prog);
~BitState();
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
inline bool ShouldVisit(int id, const char* p);
void Push(int id, const char* p, int arg);
bool GrowStack();
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char** cap_; // capture registers
int ncap_;
static const int VisitedBits = 32;
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
Job *job_; // stack of text positions to explore
int njob_;
int maxjob_;
};
BitState::BitState(Prog* prog)
: prog_(prog),
anchored_(false),
longest_(false),
endmatch_(false),
submatch_(NULL),
nsubmatch_(0),
cap_(NULL),
ncap_(0),
visited_(NULL),
nvisited_(0),
job_(NULL),
njob_(0),
maxjob_(0) {
}
BitState::~BitState() {
delete[] visited_;
delete[] job_;
delete[] cap_;
}
// Should the search visit the pair ip, p?
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
uint n = id * (text_.size() + 1) + (p - text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
return true;
}
// Grow the stack.
bool BitState::GrowStack() {
// VLOG(0) << "Reallocate.";
maxjob_ *= 2;
Job* newjob = new Job[maxjob_];
memmove(newjob, job_, njob_*sizeof job_[0]);
delete[] job_;
job_ = newjob;
if (njob_ >= maxjob_) {
LOG(DFATAL) << "Job stack overflow.";
return false;
}
return true;
}
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p, int arg) {
if (njob_ >= maxjob_) {
if (!GrowStack())
return;
}
int op = prog_->inst(id)->opcode();
if (op == kInstFail)
return;
// Only check ShouldVisit when arg == 0.
// When arg > 0, we are continuing a previous visit.
if (arg == 0 && !ShouldVisit(id, p))
return;
Job* j = &job_[njob_++];
j->id = id;
j->p = p;
j->arg = arg;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
const char* end = text_.end();
njob_ = 0;
Push(id0, p0, 0);
while (njob_ > 0) {
// Pop job off stack.
--njob_;
int id = job_[njob_].id;
const char* p = job_[njob_].p;
int arg = job_[njob_].arg;
// Optimization: rather than push and pop,
// code that is going to Push and continue
// the loop simply updates ip, p, and arg
// and jumps to CheckAndLoop. We have to
// do the ShouldVisit check that Push
// would have, but we avoid the stack
// manipulation.
if (0) {
CheckAndLoop:
if (!ShouldVisit(id, p))
continue;
}
// Visit ip, p.
// VLOG(0) << "Job: " << ip->id() << " "
// << (p - text_.begin()) << " " << arg;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
case kInstFail:
default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
return false;
case kInstAlt:
// Cannot just
// Push(ip->out1(), p, 0);
// Push(ip->out(), p, 0);
// If, during the processing of ip->out(), we encounter
// ip->out1() via another path, we want to process it then.
// Pushing it here will inhibit that. Instead, re-push
// ip with arg==1 as a reminder to push ip->out1() later.
switch (arg) {
case 0:
Push(id, p, 1); // come back when we're done
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); try ip->out1().
arg = 0;
id = ip->out1();
goto CheckAndLoop;
}
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
continue;
case kInstAltMatch:
// One opcode is byte range; the other leads to match.
if (ip->greedy(prog_)) {
// out1 is the match
Push(ip->out1(), p, 0);
id = ip->out1();
p = end;
goto CheckAndLoop;
}
// out is the match - non-greedy
Push(ip->out(), end, 0);
id = ip->out();
goto CheckAndLoop;
case kInstByteRange: {
int c = -1;
if (p < end)
c = *p & 0xFF;
if (ip->Matches(c)) {
id = ip->out();
p++;
goto CheckAndLoop;
}
continue;
}
case kInstCapture:
switch (arg) {
case 0:
if (0 <= ip->cap() && ip->cap() < ncap_) {
// Capture p to register, but save old value.
Push(id, cap_[ip->cap()], 1); // come back when we're done
cap_[ip->cap()] = p;
}
// Continue on.
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); restore the old value.
cap_[ip->cap()] = p;
continue;
}
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
continue;
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
continue;
id = ip->out();
goto CheckAndLoop;
case kInstNop:
id = ip->out();
goto CheckAndLoop;
case kInstMatch: {
if (endmatch_ && p != text_.end())
continue;
// VLOG(0) << "Found match.";
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if (nsubmatch_ == 0)
return true;
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
matched = true;
cap_[1] = p;
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].end())) {
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
}
// If going for first match, we're done.
if (!longest_)
return true;
// If we used the entire text, no longer match is possible.
if (p == text_.end())
return true;
// Otherwise, continue on in hope of a longer match.
continue;
}
}
}
return matched;
}
// Search text (within context) for prog_.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
// Search parameters.
text_ = text;
context_ = context;
if (context_.begin() == NULL)
context_ = text;
if (prog_->anchor_start() && context_.begin() != text.begin())
return false;
if (prog_->anchor_end() && context_.end() != text.end())
return false;
anchored_ = anchored || prog_->anchor_start();
longest_ = longest || prog_->anchor_end();
endmatch_ = prog_->anchor_end();
submatch_ = submatch;
nsubmatch_ = nsubmatch;
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = NULL;
// Allocate scratch space.
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
visited_ = new uint32[nvisited_];
memset(visited_, 0, nvisited_*sizeof visited_[0]);
// VLOG(0) << "nvisited_ = " << nvisited_;
ncap_ = 2*nsubmatch;
if (ncap_ < 2)
ncap_ = 2;
cap_ = new const char*[ncap_];
memset(cap_, 0, ncap_*sizeof cap_[0]);
maxjob_ = 256;
job_ = new Job[maxjob_];
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.begin();
return TrySearch(prog_->start(), text.begin());
}
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
for (const char* p = text.begin(); p <= text.end(); p++) {
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;
}
return false;
}
// Bit-state search.
bool Prog::SearchBitState(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
BitState b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

1138
re2/re2/compile.cc Normal file

File diff suppressed because it is too large Load Diff

2086
re2/re2/dfa.cc Normal file

File diff suppressed because it is too large Load Diff

100
re2/re2/filtered_re2.cc Normal file
View File

@ -0,0 +1,100 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string>
#include "util/util.h"
#include "re2/filtered_re2.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace re2 {
FilteredRE2::FilteredRE2()
: compiled_(false),
prefilter_tree_(new PrefilterTree()) {
}
FilteredRE2::~FilteredRE2() {
for (int i = 0; i < re2_vec_.size(); i++)
delete re2_vec_[i];
delete prefilter_tree_;
}
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
const RE2::Options& options, int* id) {
RE2* re = new RE2(pattern, options);
RE2::ErrorCode code = re->error_code();
if (!re->ok()) {
LOG(ERROR) << "Couldn't compile regular expression, skipping: "
<< re << " due to error " << re->error();
delete re;
} else {
*id = re2_vec_.size();
re2_vec_.push_back(re);
}
return code;
}
void FilteredRE2::Compile(vector<string>* atoms) {
if (compiled_ || re2_vec_.size() == 0) {
LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
return;
}
for (int i = 0; i < re2_vec_.size(); i++) {
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
prefilter_tree_->Add(prefilter);
}
atoms->clear();
prefilter_tree_->Compile(atoms);
compiled_ = true;
}
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (int i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return i;
return -1;
}
int FilteredRE2::FirstMatch(const StringPiece& text,
const vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile";
return -1;
}
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (int i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
return regexps[i];
return -1;
}
bool FilteredRE2::AllMatches(
const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const {
matching_regexps->clear();
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (int i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
matching_regexps->push_back(regexps[i]);
return !matching_regexps->empty();
}
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace re2

101
re2/re2/filtered_re2.h Normal file
View File

@ -0,0 +1,101 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
class PrefilterTree;
class FilteredRE2 {
public:
FilteredRE2();
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(vector<string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matched_regexps.
bool AllMatches(const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const;
// The number of regexps added.
int NumRegexps() const { return re2_vec_.size(); }
private:
// Get the individual RE2 objects. Useful for testing.
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
FilteredRE2(const FilteredRE2&);
void operator=(const FilteredRE2&);
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_

110
re2/re2/make_perl_groups.pl Executable file
View File

@ -0,0 +1,110 @@
#!/usr/bin/perl
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
@posixclasses = (
"[:alnum:]",
"[:alpha:]",
"[:ascii:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:word:]",
"[:xdigit:]",
);
@perlclasses = (
"\\d",
"\\s",
"\\w",
);
sub ComputeClass($) {
my @ranges;
my ($class) = @_;
my $regexp = "[$class]";
my $start = -1;
for (my $i=0; $i<=129; $i++) {
if ($i == 129) { $i = 256; }
if ($i <= 128 && chr($i) =~ $regexp) {
if ($start < 0) {
$start = $i;
}
} else {
if ($start >= 0) {
push @ranges, [$start, $i-1];
}
$start = -1;
}
}
return @ranges;
}
sub PrintClass($$@) {
my ($cname, $name, @ranges) = @_;
print "static URange16 code${cname}[] = { /* $name */\n";
for (my $i=0; $i<@ranges; $i++) {
my @a = @{$ranges[$i]};
printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1];
}
print "};\n";
my $n = @ranges;
my $escname = $name;
$escname =~ s/\\/\\\\/g;
$negname = $escname;
if ($negname =~ /:/) {
$negname =~ s/:/:^/;
} else {
$negname =~ y/a-z/A-Z/;
}
return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
}
my $gen = 0;
sub PrintClasses($@) {
my ($cname, @classes) = @_;
my @entries;
foreach my $cl (@classes) {
my @ranges = ComputeClass($cl);
push @entries, PrintClass(++$gen, $cl, @ranges);
}
print "UGroup ${cname}_groups[] = {\n";
foreach my $e (@entries) {
print "\t$e,\n";
}
print "};\n";
my $count = @entries;
print "int num_${cname}_groups = $count;\n";
}
print <<EOF;
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
EOF
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);
print <<EOF;
} // namespace re2
EOF

146
re2/re2/make_unicode_casefold.py Executable file
View File

@ -0,0 +1,146 @@
#!/usr/bin/python
# coding=utf-8
#
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# See unicode_casefold.h for description of case folding tables.
"""Generate C++ table for Unicode case folding."""
import unicode, sys
_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
def _Delta(a, b):
"""Compute the delta for b - a. Even/odd and odd/even
are handled specially, as described above."""
if a+1 == b:
if a%2 == 0:
return 'EvenOdd'
else:
return 'OddEven'
if a == b+1:
if a%2 == 0:
return 'OddEven'
else:
return 'EvenOdd'
return b - a
def _AddDelta(a, delta):
"""Return a + delta, handling EvenOdd and OddEven specially."""
if type(delta) == int:
return a+delta
if delta == 'EvenOdd':
if a%2 == 0:
return a+1
else:
return a-1
if delta == 'OddEven':
if a%2 == 1:
return a+1
else:
return a-1
print >>sys.stderr, "Bad Delta: ", delta
raise "Bad Delta"
def _MakeRanges(pairs):
"""Turn a list like [(65,97), (66, 98), ..., (90,122)]
into [(65, 90, +32)]."""
ranges = []
last = -100
def evenodd(last, a, b, r):
if a != last+1 or b != _AddDelta(a, r[2]):
return False
r[1] = a
return True
def evenoddpair(last, a, b, r):
if a != last+2:
return False
delta = r[2]
d = delta
if type(delta) is not str:
return False
if delta.endswith('Skip'):
d = delta[:-4]
else:
delta = d + 'Skip'
if b != _AddDelta(a, d):
return False
r[1] = a
r[2] = delta
return True
for a, b in pairs:
if ranges and evenodd(last, a, b, ranges[-1]):
pass
elif ranges and evenoddpair(last, a, b, ranges[-1]):
pass
else:
ranges.append([a, a, _Delta(a, b)])
last = a
return ranges
# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
# case-folding group, so it is important that this bound be small.
# The current tables have no group bigger than 4.
# If there are ever groups bigger than 10 or so, it will be
# time to rework the code in parse.cc.
MaxCasefoldGroup = 4
def main():
lowergroups, casegroups = unicode.CaseGroups()
foldpairs = []
seen = {}
for c in casegroups:
if len(c) > MaxCasefoldGroup:
raise unicode.Error("casefold group too long: %s" % (c,))
for i in range(len(c)):
if c[i-1] in seen:
raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
seen[c[i-1]] = True
foldpairs.append([c[i-1], c[i]])
lowerpairs = []
for lower, group in lowergroups.iteritems():
for g in group:
if g != lower:
lowerpairs.append([g, lower])
def printpairs(name, foldpairs):
foldpairs.sort()
foldranges = _MakeRanges(foldpairs)
print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
print "CaseFold unicode_%s[] = {" % (name,)
for lo, hi, delta in foldranges:
print "\t{ %d, %d, %s }," % (lo, hi, delta)
print "};"
print "int num_unicode_%s = %d;" % (name, len(foldranges),)
print ""
print _header
printpairs("casefold", foldpairs)
printpairs("tolower", lowerpairs)
print _trailer
if __name__ == '__main__':
main()

111
re2/re2/make_unicode_groups.py Executable file
View File

@ -0,0 +1,111 @@
#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Generate C++ tables for Unicode Script and Category groups."""
import sys
import unicode
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
n16 = 0
n32 = 0
def MakeRanges(codes):
"""Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
ranges = []
last = -100
for c in codes:
if c == last+1:
ranges[-1][1] = c
else:
ranges.append([c, c])
last = c
return ranges
def PrintRanges(type, name, ranges):
"""Print the ranges as an array of type named name."""
print "static %s %s[] = {" % (type, name,)
for lo, hi in ranges:
print "\t{ %d, %d }," % (lo, hi)
print "};"
# def PrintCodes(type, name, codes):
# """Print the codes as an array of type named name."""
# print "static %s %s[] = {" % (type, name,)
# for c in codes:
# print "\t%d," % (c,)
# print "};"
def PrintGroup(name, codes):
"""Print the data structures for the group of codes.
Return a UGroup literal for the group."""
# See unicode_groups.h for a description of the data structure.
# Split codes into 16-bit ranges and 32-bit ranges.
range16 = MakeRanges([c for c in codes if c < 65536])
range32 = MakeRanges([c for c in codes if c >= 65536])
# Pull singleton ranges out of range16.
# code16 = [lo for lo, hi in range16 if lo == hi]
# range16 = [[lo, hi] for lo, hi in range16 if lo != hi]
global n16
global n32
n16 += len(range16)
n32 += len(range32)
ugroup = "{ \"%s\", +1" % (name,)
# if len(code16) > 0:
# PrintCodes("uint16", name+"_code16", code16)
# ugroup += ", %s_code16, %d" % (name, len(code16))
# else:
# ugroup += ", 0, 0"
if len(range16) > 0:
PrintRanges("URange16", name+"_range16", range16)
ugroup += ", %s_range16, %d" % (name, len(range16))
else:
ugroup += ", 0, 0"
if len(range32) > 0:
PrintRanges("URange32", name+"_range32", range32)
ugroup += ", %s_range32, %d" % (name, len(range32))
else:
ugroup += ", 0, 0"
ugroup += " }"
return ugroup
def main():
print _header
ugroups = []
for name, codes in unicode.Categories().iteritems():
ugroups.append(PrintGroup(name, codes))
for name, codes in unicode.Scripts().iteritems():
ugroups.append(PrintGroup(name, codes))
print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
print "UGroup unicode_groups[] = {";
ugroups.sort()
for ug in ugroups:
print "\t%s," % (ug,)
print "};"
print "int num_unicode_groups = %d;" % (len(ugroups),)
print _trailer
if __name__ == '__main__':
main()

185
re2/re2/mimics_pcre.cc Normal file
View File

@ -0,0 +1,185 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker class to compute whether library handles a regexp
// exactly as PCRE would. See comment at top for conditions.
class PCREWalker : public Regexp::Walker<bool> {
public:
PCREWalker() {}
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
};
// Called after visiting each of re's children and accumulating
// the return values in child_args. So child_args contains whether
// this library mimics PCRE for those subexpressions.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
// If children failed, so do we.
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
// Otherwise look for other reasons to fail.
switch (re->op()) {
// Look for repeated empty string.
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
if (CanBeEmptyString(re->sub()[0]))
return false;
break;
case kRegexpRepeat:
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
return false;
break;
// Look for \v
case kRegexpLiteral:
if (re->rune() == '\v')
return false;
break;
// Look for $ in single-line mode.
case kRegexpEndText:
case kRegexpEmptyMatch:
if (re->parse_flags() & Regexp::WasDollar)
return false;
break;
// Look for ^ in multi-line mode.
case kRegexpBeginLine:
// No condition: in single-line mode ^ becomes kRegexpBeginText.
return false;
default:
break;
}
// Not proven guilty.
return true;
}
// Returns whether this regexp's behavior will mimic PCRE's exactly.
bool Regexp::MimicsPCRE() {
PCREWalker w;
return w.Walk(this, true);
}
// Walker class to compute whether a Regexp can match an empty string.
// It is okay to overestimate. For example, \b\B cannot match an empty
// string, because \b and \B are mutually exclusive, but this isn't
// that smart and will say it can. Spurious empty strings
// will reduce the number of regexps we sanity check against PCRE,
// but they won't break anything.
class EmptyStringWalker : public Regexp::Walker<bool> {
public:
EmptyStringWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
private:
DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string). Returns whether this clause can match an
// empty string.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch: // never empty
case kRegexpLiteral:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpCharClass:
case kRegexpLiteralString:
return false;
case kRegexpEmptyMatch: // always empty
case kRegexpBeginLine: // always empty, when they match
case kRegexpEndLine:
case kRegexpNoWordBoundary:
case kRegexpWordBoundary:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpStar: // can always be empty
case kRegexpQuest:
case kRegexpHaveMatch:
return true;
case kRegexpConcat: // can be empty if all children can
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
return true;
case kRegexpAlternate: // can be empty if any child can
for (int i = 0; i < nchild_args; i++)
if (child_args[i])
return true;
return false;
case kRegexpPlus: // can be empty if the child can
case kRegexpCapture:
return child_args[0];
case kRegexpRepeat: // can be empty if child can or is x{0}
return child_args[0] || re->min() == 0;
}
return false;
}
// Returns whether re can match an empty string.
static bool CanBeEmptyString(Regexp* re) {
EmptyStringWalker w;
return w.Walk(re, true);
}
} // namespace re2

709
re2/re2/nfa.cc Normal file
View File

@ -0,0 +1,709 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
namespace re2 {
class NFA {
public:
NFA(Prog* prog);
~NFA();
// Searches for a matching string.
// * If anchored is true, only considers matches starting at offset.
// Otherwise finds lefmost match at or after offset.
// * If longest is true, returns the longest match starting
// at the chosen start point. Otherwise returns the so-called
// left-biased match, the one traditional backtracking engines
// (like Perl and PCRE) find.
// Records submatch boundaries in submatch[1..nsubmatch-1].
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
static const int Debug = 0;
private:
struct Thread {
union {
int id;
Thread* next; // when on free list
};
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
int j;
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
AddState()
: id(0), j(-1), cap_j(NULL) {}
explicit AddState(int id)
: id(id), j(-1), cap_j(NULL) {}
AddState(int id, const char* cap_j, int j)
: id(id), j(j), cap_j(cap_j) {}
};
// Threadq is a list of threads. The list is sorted by the order
// in which Perl would explore that particular state -- the earlier
// choices appear earlier in the list.
typedef SparseArray<Thread*> Threadq;
inline Thread* AllocThread();
inline void FreeThread(Thread*);
// Add r (or its children, following unlabeled arrows)
// to the workqueue q with associated capture info.
void AddToThreadq(Threadq* q, int id, int flag,
const char* p, const char** capture);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// p is position of the next byte (the one after c)
// in the input string, used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Returns text version of capture information, for debugging.
string FormatCapture(const char** capture);
inline void CopyCapture(const char** dst, const char** src);
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text being matched (for FormatSubmatch)
const char* etext_; // end of text being matched (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
const char** match_; // best match so far
bool matched_; // any match so far?
AddState* astack_; // pre-allocated for AddToThreadq
int nastack_;
int first_byte_; // required first byte for match, or -1 if none
Thread* free_threads_; // free list
DISALLOW_EVIL_CONSTRUCTORS(NFA);
};
NFA::NFA(Prog* prog) {
prog_ = prog;
start_ = prog->start();
ncapture_ = 0;
longest_ = false;
endmatch_ = false;
btext_ = NULL;
etext_ = NULL;
q0_.resize(prog_->size());
q1_.resize(prog_->size());
nastack_ = 2*prog_->size();
astack_ = new AddState[nastack_];
match_ = NULL;
matched_ = false;
free_threads_ = NULL;
first_byte_ = ComputeFirstByte();
}
NFA::~NFA() {
delete[] match_;
delete[] astack_;
Thread* next;
for (Thread* t = free_threads_; t; t = next) {
next = t->next;
delete[] t->capture;
delete t;
}
}
void NFA::FreeThread(Thread *t) {
if (t == NULL)
return;
t->next = free_threads_;
free_threads_ = t;
}
NFA::Thread* NFA::AllocThread() {
Thread* t = free_threads_;
if (t == NULL) {
t = new Thread;
t->capture = new const char*[ncapture_];
return t;
}
free_threads_ = t->next;
return t;
}
void NFA::CopyCapture(const char** dst, const char** src) {
for (int i = 0; i < ncapture_; i+=2) {
dst[i] = src[i];
dst[i+1] = src[i+1];
}
}
// Follows all empty arrows from r and enqueues all the states reached.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// The pointer p is the current input position, and m is the
// current set of match boundaries.
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
const char* p, const char** capture) {
if (id0 == 0)
return;
// Astack_ is pre-allocated to avoid resize operations.
// It has room for 2*prog_->size() entries, which is enough:
// Each inst in prog can be processed at most once,
// pushing at most two entries on stk.
int nstk = 0;
AddState* stk = astack_;
stk[nstk++] = AddState(id0);
while (nstk > 0) {
DCHECK_LE(nstk, nastack_);
const AddState& a = stk[--nstk];
if (a.j >= 0)
capture[a.j] = a.cap_j;
int id = a.id;
if (id == 0)
continue;
if (q->has_index(id)) {
if (Debug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
continue;
}
// Create entry in q no matter what. We might fill it in below,
// or we might not. Even if not, it is necessary to have it,
// so that we don't revisit r during the recursion.
q->set_new(id, NULL);
Thread** tp = &q->find(id)->second;
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
// fall through
case kInstAlt:
// Explore alternatives.
stk[nstk++] = AddState(ip->out1());
stk[nstk++] = AddState(ip->out());
break;
case kInstNop:
// Continue on.
stk[nstk++] = AddState(ip->out());
break;
case kInstCapture:
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore capture[j]
// once we finish exploring this possibility.
stk[nstk++] = AddState(0, capture[j], j);
// Record capture.
capture[j] = p;
}
stk[nstk++] = AddState(ip->out());
break;
case kInstMatch:
case kInstByteRange:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
if (Debug)
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
break;
case kInstEmptyWidth:
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
stk[nstk++] = AddState(ip->out());
break;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates match as new, better matches are found.
// p is position of the byte c in the input string,
// used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
FreeThread(t);
continue;
}
}
int id = t->id;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
// Should only see the values handled below.
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
break;
case kInstByteRange:
if (ip->Matches(c))
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
break;
case kInstAltMatch:
if (i != runq->begin())
break;
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture((const char**)match_, t->capture);
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
matched_ = true;
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch:
if (endmatch_ && p != etext_)
break;
const char* old = t->capture[1]; // previous end pointer
t->capture[1] = p;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
CopyCapture((const char**)match_, t->capture);
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture((const char**)match_, t->capture);
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
t->capture[0] = old;
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
matched_ = true;
return 0;
}
t->capture[0] = old;
matched_ = true;
break;
}
FreeThread(t);
}
runq->clear();
return 0;
}
string NFA::FormatCapture(const char** capture) {
string s;
for (int i = 0; i < ncapture_; i+=2) {
if (capture[i] == NULL)
StringAppendF(&s, "(?,?)");
else if (capture[i+1] == NULL)
StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
else
StringAppendF(&s, "(%d,%d)",
(int)(capture[i] - btext_),
(int)(capture[i+1] - btext_));
}
return s;
}
// Returns whether haystack contains needle's memory.
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
return haystack.begin() <= needle.begin() &&
haystack.end() >= needle.end();
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (!StringPieceContains(context, text)) {
LOG(FATAL) << "Bad args: context does not contain text "
<< reinterpret_cast<const void*>(context.begin())
<< "+" << context.size() << " "
<< reinterpret_cast<const void*>(text.begin())
<< "+" << text.size();
return false;
}
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
memset(match_, 0, ncapture_*sizeof match_[0]);
// For debugging prints.
btext_ = context.begin();
if (Debug) {
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.as_string().c_str(), context.as_string().c_str(), anchored,
longest);
}
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
const char* bp = context.begin();
int c = -1;
int wasword = 0;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
wasword = Prog::IsWordChar(c);
}
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
// Check for empty-width specials.
int flag = 0;
// ^ and \A
if (p == context.begin())
flag |= kEmptyBeginText | kEmptyBeginLine;
else if (p <= context.end() && p[-1] == '\n')
flag |= kEmptyBeginLine;
// $ and \z
if (p == context.end())
flag |= kEmptyEndText | kEmptyEndLine;
else if (p < context.end() && p[0] == '\n')
flag |= kEmptyEndLine;
// \b and \B
int isword = 0;
if (p < context.end())
isword = Prog::IsWordChar(p[0] & 0xFF);
if (isword != wasword)
flag |= kEmptyWordBoundary;
else
flag |= kEmptyNonWordBoundary;
if (Debug) {
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", t->id,
FormatCapture((const char**)t->capture).c_str());
}
fprintf(stderr, "\n");
}
// Process previous character (waited until now to avoid
// repeating the flag computation above).
// This is a no-op the first time around the loop, because
// runq is empty.
int id = Step(runq, nextq, c, flag, p-1);
DCHECK_EQ(runq->size(), 0);
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
case kInstEmptyWidth:
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
break;
}
id = ip->out();
continue;
}
break;
}
break;
}
if (p > text.end())
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
} else {
isword = Prog::IsWordChar(p[0] & 0xFF);
}
flag = Prog::EmptyFlags(context, p);
}
// Steal match storage (cleared but unused as of yet)
// temporarily to hold match boundaries for new thread.
match_[0] = p;
AddToThreadq(runq, start_, flag, p, match_);
match_[0] = NULL;
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (Debug)
fprintf(stderr, "dead\n");
break;
}
if (p == text.end())
c = 0;
else
c = *p & 0xFF;
wasword = isword;
// Will run step(runq, nextq, c, ...) on next iteration. See above.
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
FreeThread(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
if (Debug)
fprintf(stderr, "match (%d,%d)\n",
static_cast<int>(match_[0] - btext_),
static_cast<int>(match_[1] - btext_));
return true;
}
VLOG(1) << "No matches found";
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int NFA::ComputeFirstByte() {
if (start_ == 0)
return -1;
int b = -1; // first byte, not yet computed
typedef SparseSet Workq;
Workq q(prog_->size());
q.insert(start_);
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAlt:
case kInstAltMatch:
// Explore alternatives.
if (ip->out())
q.insert(ip->out());
if (ip->out1())
q.insert(ip->out1());
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (NFA::Debug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

614
re2/re2/onepass.cc Normal file
View File

@ -0,0 +1,614 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <string.h>
#include <map>
#include "util/util.h"
#include "util/arena.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
static const int Debug = 0;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// whether a set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA (aka DFA) - just an array of actions.
struct OneState;
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
struct OneState {
uint32 matchcond; // conditions to match right now.
uint32 action[1];
};
// The uint32 conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
static const int kRealCapShift = kEmptyShift + 1;
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
static const int kCapShift = kRealCapShift - 2;
static const int kMaxCap = kRealMaxCap + 2;
static const uint32 kMatchWins = 1 << kEmptyShift;
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Check, at compile time, that prog.h agrees with math above.
// This function is never called.
void OnePass_Checks() {
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
kEmptyShift_disagrees_with_kEmptyAllFlags);
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
kMaxCap_disagrees_with_kMaxOnePassCapture);
}
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
uint32 satisfied = Prog::EmptyFlags(context, p);
if (cond & kEmptyAllFlags & ~satisfied)
return false;
return true;
}
// Apply the capture bits in cond, saving p to the appropriate
// locations in cap[].
static void ApplyCaptures(uint32 cond, const char* p,
const char** cap, int ncap) {
for (int i = 2; i < ncap; i++)
if (cond & (1 << kCapShift << i))
cap[i] = p;
}
// Compute a node pointer.
// Basically (OneState*)(nodes + statesize*nodeindex)
// but the version with the C++ casts overflows 80 characters (and is ugly).
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
int nodeindex) {
return reinterpret_cast<OneState*>(
const_cast<uint8*>(nodes + statesize*nodeindex));
}
bool Prog::SearchOnePass(const StringPiece& text,
const StringPiece& const_context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (anchor != kAnchored && kind != kFullMatch) {
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
return false;
}
// Make sure we have at least cap[1],
// because we use it to tell if we matched.
int ncap = 2*nmatch;
if (ncap < 2)
ncap = 2;
const char* cap[kMaxCap];
for (int i = 0; i < ncap; i++)
cap[i] = NULL;
const char* matchcap[kMaxCap];
for (int i = 0; i < ncap; i++)
matchcap[i] = NULL;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (anchor_start() && context.begin() != text.begin())
return false;
if (anchor_end() && context.end() != text.end())
return false;
if (anchor_end())
kind = kFullMatch;
// State and act are marked volatile to
// keep the compiler from re-ordering the
// memory accesses walking over the NFA.
// This is worth about 5%.
volatile OneState* state = onepass_start_;
volatile uint8* nodes = onepass_nodes_;
volatile uint32 statesize = onepass_statesize_;
uint8* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
const char* p;
bool matched = false;
matchcap[0] = bp;
cap[0] = bp;
uint32 nextmatchcond = state->matchcond;
for (p = bp; p < ep; p++) {
int c = bytemap[*p & 0xFF];
uint32 matchcond = nextmatchcond;
uint32 cond = state->action[c];
// Determine whether we can reach act->next.
// If so, advance state and nextmatchcond.
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32 nextindex = cond >> kIndexShift;
state = IndexToNode(nodes, statesize, nextindex);
nextmatchcond = state->matchcond;
} else {
state = NULL;
nextmatchcond = kImpossible;
}
// This code section is carefully tuned.
// The goto sequence is about 10% faster than the
// obvious rewrite as a large if statement in the
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
// Saving the match capture registers is expensive.
// Is this intermediate match worth thinking about?
// Not if we want a full match.
if (kind == kFullMatch)
goto skipmatch;
// Not if it's impossible.
if (matchcond == kImpossible)
goto skipmatch;
// Not if the possible match is beaten by the certain
// match at the next byte. When this test is useless
// (e.g., HTTPPartialMatchRE2) it slows the loop by
// about 10%, but when it avoids work (e.g., DotMatchRE2),
// it cuts the loop execution by about 45%.
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
goto skipmatch;
// Finally, the match conditions must be satisfied.
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
for (int i = 2; i < 2*nmatch; i++)
matchcap[i] = cap[i];
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, matchcap, ncap);
matchcap[1] = p;
matched = true;
// If we're in longest match mode, we have to keep
// going and see if we find a longer match.
// In first match mode, we can stop if the match
// takes priority over the next state for this input byte.
// That bit is per-input byte and thus in cond, not matchcond.
if (kind == kFirstMatch && (cond & kMatchWins))
goto done;
}
skipmatch:
if (state == NULL)
goto done;
if ((cond & kCapMask) && nmatch > 1)
ApplyCaptures(cond, p, cap, ncap);
}
// Look for match at end of input.
{
uint32 matchcond = state->matchcond;
if (matchcond != kImpossible &&
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, cap, ncap);
for (int i = 2; i < ncap; i++)
matchcap[i] = cap[i];
matchcap[1] = p;
matched = true;
}
}
done:
if (!matched)
return false;
for (int i = 0; i < nmatch; i++)
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
return true;
}
// Analysis to determine whether a given regexp program is one-pass.
// If ip is not on workq, adds ip to work queue and returns true.
// If ip is already on work queue, does nothing and returns false.
// If ip is NULL, does nothing and returns true (pretends to add it).
typedef SparseSet Instq;
static bool AddQ(Instq *q, int id) {
if (id == 0)
return true;
if (q->contains(id))
return false;
q->insert(id);
return true;
}
struct InstCond {
int id;
uint32 cond;
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByte instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_start_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + byte_inst_count_;
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int size = this->size();
InstCond *stack = new InstCond[size];
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
uint8* nodes = new uint8[maxnodes*statesize];
uint8* nodep = nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
nodep += statesize;
int nalloc = 1;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes, statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
switch (ip->opcode()) {
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
// Fall through.
case kInstAlt:
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
goto fail;
stack[nstack].id = ip->out1();
stack[nstack++].cond = cond;
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstByteRange: {
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (Debug)
LOG(ERROR)
<< StringPrintf("Not OnePass: hit node limit %d > %d",
nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
nodep += statesize;
nodebyid[ip->out()] = nextindex;
nalloc++;
AddQ(&tovisit, ip->out());
}
if (matched)
cond |= kMatchWins;
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in byte class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
if (ip->foldcase()) {
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
}
break;
}
case kInstCapture:
if (ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
goto QueueEmpty;
case kInstEmptyWidth:
cond |= ip->empty();
goto QueueEmpty;
case kInstNop:
QueueEmpty:
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
" %d -> %d\n",
*it, ip->out());
}
goto fail;
}
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstMatch:
if (matched) {
// (3) is violated
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
" from %d\n", *it);
}
goto fail;
}
matched = true;
node->matchcond = cond;
break;
case kInstFail:
break;
}
}
}
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
string dump = "prog dump:\n" + Dump() + "node dump\n";
map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
StringAppendF(&dump, "byte ranges:\n");
int i = 0;
for (int b = 0; b < bytemap_range_; b++) {
int lo = i;
while (bytemap_[i] == b)
i++;
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
}
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes, statesize, nodeindex);
string s;
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << dump;
}
// Overallocated earlier; cut down to actual size.
nodep = new uint8[nalloc*statesize];
memmove(nodep, nodes, nalloc*statesize);
delete[] nodes;
nodes = nodep;
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
onepass_nodes_ = nodes;
onepass_statesize_ = statesize;
dfa_mem_ -= nalloc*statesize;
delete[] stack;
delete[] nodebyid;
return true;
fail:
delete[] stack;
delete[] nodebyid;
delete[] nodes;
return false;
}
} // namespace re2

2202
re2/re2/parse.cc Normal file

File diff suppressed because it is too large Load Diff

119
re2/re2/perl_groups.cc Normal file
View File

@ -0,0 +1,119 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
static URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
};
int num_perl_groups = 6;
static URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
};
int num_posix_groups = 28;
} // namespace re2

671
re2/re2/prefilter.cc Normal file
View File

@ -0,0 +1,671 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
static const int Trace = false;
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
static int alloc_id = 100000; // Used for debugging.
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
op_ = op;
subs_ = NULL;
if (op_ == AND || op_ == OR)
subs_ = new vector<Prefilter*>;
alloc_id_ = alloc_id++;
VLOG(10) << "alloc_id: " << alloc_id_;
}
// Destroys a Prefilter.
Prefilter::~Prefilter() {
VLOG(10) << "Deleted: " << alloc_id_;
if (subs_) {
for (int i = 0; i < subs_->size(); i++)
delete (*subs_)[i];
delete subs_;
subs_ = NULL;
}
}
// Simplify if the node is an empty Or or And.
Prefilter* Prefilter::Simplify() {
if (op_ != AND && op_ != OR) {
return this;
}
// Nothing left in the AND/OR.
if (subs_->size() == 0) {
if (op_ == AND)
op_ = ALL; // AND of nothing is true
else
op_ = NONE; // OR of nothing is false
return this;
}
// Just one subnode: throw away wrapper.
if (subs_->size() == 1) {
Prefilter* a = (*subs_)[0];
subs_->clear();
delete this;
return a->Simplify();
}
return this;
}
// Combines two Prefilters together to create an "op" (AND or OR).
// The passed Prefilters will be part of the returned Prefilter or deleted.
// Does lots of work to avoid creating unnecessarily complicated structures.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
// If a, b can be rewritten as op, do so.
a = a->Simplify();
b = b->Simplify();
// Canonicalize: a->op <= b->op.
if (a->op() > b->op()) {
Prefilter* t = a;
a = b;
b = t;
}
// Trivial cases.
// ALL AND b = b
// NONE OR b = b
// ALL OR b = ALL
// NONE AND b = NONE
// Don't need to look at b, because of canonicalization above.
// ALL and NONE are smallest opcodes.
if (a->op() == ALL || a->op() == NONE) {
if ((a->op() == ALL && op == AND) ||
(a->op() == NONE && op == OR)) {
delete a;
return b;
} else {
delete b;
return a;
}
}
// If a and b match op, merge their contents.
if (a->op() == op && b->op() == op) {
for (int i = 0; i < b->subs()->size(); i++) {
Prefilter* bb = (*b->subs())[i];
a->subs()->push_back(bb);
}
b->subs()->clear();
delete b;
return a;
}
// If a already has the same op as the op that is under construction
// add in b (similarly if b already has the same op, add in a).
if (b->op() == op) {
Prefilter* t = a;
a = b;
b = t;
}
if (a->op() == op) {
a->subs()->push_back(b);
return a;
}
// Otherwise just return the op.
Prefilter* c = new Prefilter(op);
c->subs()->push_back(a);
c->subs()->push_back(b);
return c;
}
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
return AndOr(AND, a, b);
}
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
static void SimplifyStringSet(set<string> *ss) {
// Now make sure that the strings aren't redundant. For example, if
// we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so delete "abc". This
// is because, when we are performing a string search to filter
// regexps, matching ab will already allow this regexp to be a
// candidate for match, so further matching abc is redundant.
for (SSIter i = ss->begin(); i != ss->end(); ++i) {
SSIter j = i;
++j;
while (j != ss->end()) {
// Increment j early so that we can erase the element it points to.
SSIter old_j = j;
++j;
if (old_j->find(*i) != string::npos)
ss->erase(old_j);
}
}
}
Prefilter* Prefilter::OrStrings(set<string>* ss) {
SimplifyStringSet(ss);
Prefilter* or_prefilter = NULL;
if (!ss->empty()) {
or_prefilter = new Prefilter(NONE);
for (SSIter i = ss->begin(); i != ss->end(); ++i)
or_prefilter = Or(or_prefilter, FromString(*i));
}
return or_prefilter;
}
static Rune ToLowerRune(Rune r) {
if (r < Runeself) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
}
Prefilter* Prefilter::FromString(const string& str) {
Prefilter* m = new Prefilter(Prefilter::ATOM);
m->atom_ = str;
return m;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
class Prefilter::Info {
public:
Info();
~Info();
// More constructors. They delete their Info* arguments.
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyChar();
static Info* CClass(CharClass* cc);
static Info* Literal(Rune r);
static Info* AnyMatch();
// Format Info as a string.
string ToString();
// Caller takes ownership of the Prefilter.
Prefilter* TakeMatch();
set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
set<string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
Prefilter* match_;
};
Prefilter::Info::Info()
: is_exact_(false),
match_(NULL) {
}
Prefilter::Info::~Info() {
delete match_;
}
Prefilter* Prefilter::Info::TakeMatch() {
if (is_exact_) {
match_ = Prefilter::OrStrings(&exact_);
is_exact_ = false;
}
Prefilter* m = match_;
match_ = NULL;
return m;
}
// Format a Info in string form.
string Prefilter::Info::ToString() {
if (this == NULL) {
// Sometimes when iterating on children of a node,
// some children might have NULL Info. Adding
// the check here for NULL to take care of cases where
// the caller is not checking.
return "";
}
if (is_exact_) {
int n = 0;
string s;
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
if (n++ > 0)
s += ",";
s += *i;
}
return s;
}
if (match_)
return match_->DebugString();
return "";
}
// Add the strings from src to dst.
static void CopyIn(const set<string>& src, set<string>* dst) {
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
dst->insert(*i);
}
// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const set<string>& a,
const set<string>& b,
set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
// Concats a and b. Requires that both are exact sets.
// Forms an exact set that is a crossproduct of a and b.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
if (a == NULL)
return b;
DCHECK(a->is_exact_);
DCHECK(b && b->is_exact_);
Info *ab = new Info();
CrossProduct(a->exact_, b->exact_, &ab->exact_);
ab->is_exact_ = true;
delete a;
delete b;
return ab;
}
// Constructs an inexact Info for ab given a and b.
// Used only when a or b is not exact or when the
// exact cross product is likely to be too big.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
if (a == NULL)
return b;
if (b == NULL)
return a;
Info *ab = new Info();
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
delete a;
delete b;
return ab;
}
// Constructs Info for a|b given a and b.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
Info *ab = new Info();
if (a->is_exact_ && b->is_exact_) {
CopyIn(a->exact_, &ab->exact_);
CopyIn(b->exact_, &ab->exact_);
ab->is_exact_ = true;
} else {
// Either a or b has is_exact_ = false. If the other
// one has is_exact_ = true, we move it to match_ and
// then create a OR of a,b. The resulting Info has
// is_exact_ = false.
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
}
delete a;
delete b;
return ab;
}
// Constructs Info for a? given a.
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
Info *ab = new Info();
ab->is_exact_ = false;
ab->match_ = new Prefilter(ALL);
delete a;
return ab;
}
// Constructs Info for a* given a.
// Same as a? -- not much to do.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
return Quest(a);
}
// Constructs Info for a+ given a. If a was exact set, it isn't
// anymore.
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
Info *ab = new Info();
ab->match_ = a->TakeMatch();
ab->is_exact_ = false;
delete a;
return ab;
}
static string RuneToString(Rune r) {
char buf[UTFmax];
int n = runetochar(buf, &r);
return string(buf, n);
}
// Constructs Info for literal rune.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToString(ToLowerRune(r)));
info->is_exact_ = true;
return info;
}
// Constructs Info for dot (any character).
Prefilter::Info* Prefilter::Info::AnyChar() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
// Constructs Prefilter::Info for no possible match.
Prefilter::Info* Prefilter::Info::NoMatch() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(NONE);
return info;
}
// Constructs Prefilter::Info for any possible match.
// This Prefilter::Info is valid for any regular expression,
// since it makes no assertions whatsoever about the
// strings being matched.
Prefilter::Info* Prefilter::Info::AnyMatch() {
Prefilter::Info *info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
// Constructs Prefilter::Info for just the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
Prefilter::Info* info = new Prefilter::Info();
info->is_exact_ = true;
info->exact_.insert("");
return info;
}
// Constructs Prefilter::Info for a character class.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
if (Trace) {
VLOG(0) << "CharClassInfo:";
for (CCIter i = cc->begin(); i != cc->end(); ++i)
VLOG(0) << " " << i->lo << "-" << i->hi;
}
// If the class is too large, it's okay to overestimate.
if (cc->size() > 10)
return AnyChar();
Prefilter::Info *a = new Prefilter::Info();
for (CCIter i = cc->begin(); i != cc->end(); ++i)
for (Rune r = i->lo; r <= i->hi; r++)
a->exact_.insert(RuneToString(ToLowerRune(r)));
a->is_exact_ = true;
if (Trace) {
VLOG(0) << " = " << a->ToString();
}
return a;
}
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker() {}
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
private:
DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
if (Trace) {
LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
}
Prefilter::Info::Walker w;
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
if (w.stopped_early()) {
delete info;
return NULL;
}
return info;
}
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
Regexp* re, Prefilter::Info* parent_arg) {
return AnyMatch();
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
default:
case kRegexpRepeat:
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
info = Literal(re->rune());
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++)
info = Concat(info, Literal(re->runes()[i]));
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
VLOG(10) << "Alt: " << info->ToString();
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
// Claim nothing, except that it's not empty.
info = AnyChar();
break;
case kRegexpCharClass:
info = CClass(re->cc());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
if (Trace) {
VLOG(0) << "BuildInfo " << re->ToString()
<< ": " << info->ToString();
}
return info;
}
Prefilter* Prefilter::FromRegexp(Regexp* re) {
if (re == NULL)
return NULL;
Regexp* simple = re->Simplify();
Prefilter::Info *info = BuildInfo(simple);
simple->Decref();
if (info == NULL)
return NULL;
Prefilter* m = info->TakeMatch();
delete info;
return m;
}
string Prefilter::DebugString() const {
if (this == NULL)
return "<nil>";
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
return StringPrintf("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
return atom_;
case ALL:
return "";
case AND: {
string s = "";
for (int i = 0; i < subs_->size(); i++) {
if (i > 0)
s += " ";
s += (*subs_)[i]->DebugString();
}
return s;
}
case OR: {
string s = "(";
for (int i = 0; i < subs_->size(); i++) {
if (i > 0)
s += "|";
s += (*subs_)[i]->DebugString();
}
s += ")";
return s;
}
}
}
Prefilter* Prefilter::FromRE2(const RE2* re2) {
if (re2 == NULL)
return NULL;
Regexp* regexp = re2->Regexp();
if (regexp == NULL)
return NULL;
return FromRegexp(regexp);
}
} // namespace re2

105
re2/re2/prefilter.h Normal file
View File

@ -0,0 +1,105 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
#include "util/util.h"
namespace re2 {
class RE2;
class Regexp;
class Prefilter {
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
public:
enum Op {
ALL = 0, // Everything matches
NONE, // Nothing matches
ATOM, // The string atom() must match
AND, // All in subs() must match
OR, // One of subs() must match
};
explicit Prefilter(Op op);
~Prefilter();
Op op() { return op_; }
const string& atom() const { return atom_; }
void set_unique_id(int id) { unique_id_ = id; }
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
vector<Prefilter*>* subs() {
CHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
// cannot be formed.
static Prefilter* FromRE2(const RE2* re2);
// Returns a readable debug string of the prefilter.
string DebugString() const;
private:
class Info;
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
// Combines two prefilters together to create an OR. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* Or(Prefilter* a, Prefilter* b);
// Generalized And/Or
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
static Prefilter* FromRegexp(Regexp* a);
static Prefilter* FromString(const string& str);
static Prefilter* OrStrings(set<string>* ss);
static Info* BuildInfo(Regexp* re);
Prefilter* Simplify();
// Kind of Prefilter.
Op op_;
// Sub-matches for AND or OR Prefilter.
vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
string atom_;
// If different prefilters have the same string atom, or if they are
// structurally the same (e.g., OR of same atom strings) they are
// considered the same unique nodes. This is the id for each unique
// node. This field is populated with a unique id for every node,
// and -1 for duplicate nodes.
int unique_id_;
// Used for debugging, helps in tracking memory leaks.
int alloc_id_;
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
};
} // namespace re2
#endif // RE2_PREFILTER_H_

398
re2/re2/prefilter_tree.cc Normal file
View File

@ -0,0 +1,398 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
#include "re2/re2.h"
DEFINE_int32(filtered_re2_min_atom_len,
3,
"Strings less than this length are not stored as atoms");
namespace re2 {
PrefilterTree::PrefilterTree()
: compiled_(false) {
}
PrefilterTree::~PrefilterTree() {
for (int i = 0; i < prefilter_vec_.size(); i++)
delete prefilter_vec_[i];
for (int i = 0; i < entries_.size(); i++)
delete entries_[i].parents;
}
// Functions used for adding and Compiling prefilters to the
// PrefilterTree.
static bool KeepPart(Prefilter* prefilter, int level) {
if (prefilter == NULL)
return false;
switch (prefilter->op()) {
default:
LOG(DFATAL) << "Unexpected op in KeepPart: "
<< prefilter->op();
return false;
case Prefilter::ALL:
return false;
case Prefilter::ATOM:
return prefilter->atom().size() >=
FLAGS_filtered_re2_min_atom_len;
case Prefilter::AND: {
int j = 0;
vector<Prefilter*>* subs = prefilter->subs();
for (int i = 0; i < subs->size(); i++)
if (KeepPart((*subs)[i], level + 1))
(*subs)[j++] = (*subs)[i];
else
delete (*subs)[i];
subs->resize(j);
return j > 0;
}
case Prefilter::OR:
for (int i = 0; i < prefilter->subs()->size(); i++)
if (!KeepPart((*prefilter->subs())[i], level + 1))
return false;
return true;
}
}
void PrefilterTree::Add(Prefilter *f) {
if (compiled_) {
LOG(DFATAL) << "Add after Compile.";
return;
}
if (f != NULL && !KeepPart(f, 0)) {
delete f;
f = NULL;
}
prefilter_vec_.push_back(f);
}
void PrefilterTree::Compile(vector<string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile after Compile.";
return;
}
// We do this check to support some legacy uses of
// PrefilterTree that call Compile before adding any regexps,
// and expect Compile not to have effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
AssignUniqueIds(atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
// Note that getting rid of a prefilter node simply means they are
// no longer necessary for their parent to trigger; that is, we do
// not miss out on any regexps triggering by getting rid of a
// prefilter node.
for (int i = 0; i < entries_.size(); i++) {
IntMap* parents = entries_[i].parents;
if (parents->size() > 8) {
// This one triggers too many things. If all the parents are AND
// nodes and have other things guarding them, then get rid of
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
have_other_guard = have_other_guard &&
(entries_[it->index()].propagate_up_at_count > 1);
if (have_other_guard) {
for (IntMap::iterator it = parents->begin();
it != parents->end(); ++it)
entries_[it->index()].propagate_up_at_count -= 1;
parents->clear(); // Forget the parents
}
}
}
PrintDebugInfo();
}
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
string node_string = NodeString(node);
map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
if (iter == node_map_.end())
return NULL;
return (*iter).second;
}
static string Itoa(int n) {
char buf[100];
snprintf(buf, sizeof buf, "%d", n);
return string(buf);
}
string PrefilterTree::NodeString(Prefilter* node) const {
// Adding the operation disambiguates AND/OR/atom nodes.
string s = Itoa(node->op()) + ":";
if (node->op() == Prefilter::ATOM) {
s += node->atom();
} else {
for (int i = 0; i < node->subs()->size() ; i++) {
if (i > 0)
s += ',';
s += Itoa((*node->subs())[i]->unique_id());
}
}
return s;
}
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (int i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(i);
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
for (int i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const vector<Prefilter*>& subs = *f->subs();
for (int j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
int unique_id = 0;
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
node_map_[NodeString(node)] = node;
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(node_map_.size());
// Create parent IntMap for the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new IntMap(node_map_.size());
}
// Fill the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
IntMap uniq_child(node_map_.size());
for (int j = 0; j < prefilter->subs()->size() ; j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
if (!uniq_child.has_index(child_id))
uniq_child.set_new(child_id, 1);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (!child_entry->parents->has_index(prefilter->unique_id()))
child_entry->parents->set_new(prefilter->unique_id(), 1);
}
entry->propagate_up_at_count =
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (int i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(i);
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const vector<int>& matched_atoms,
vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(WARNING) << "Compile() not called";
for (int i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(i);
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(prefilter_vec_.size());
vector<int> matched_atom_ids;
for (int j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(entries_.size());
IntMap work(entries_.size());
for (int i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
VLOG(10) << "Processing: " << it->index();
// Record regexps triggered.
for (int i = 0; i < entry.regexps.size(); i++) {
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
regexps->set(entry.regexps[i], 1);
}
int c;
// Pass trigger up to parents.
for (IntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->index();
const Entry& parent = entries_[j];
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
VLOG(10) << "Triggering: " << j;
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
}
void PrefilterTree::PrintDebugInfo() {
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
VLOG(10) << "#Unique Nodes: " << entries_.size();
for (int i = 0; i < entries_.size(); ++i) {
IntMap* parents = entries_[i].parents;
const vector<int>& regexps = entries_[i].regexps;
VLOG(10) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
VLOG(10) << it->index();
}
VLOG(10) << "Map:";
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
iter != node_map_.end(); ++iter)
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
string PrefilterTree::DebugNodeString(Prefilter* node) const {
string node_string = "";
if (node->op() == Prefilter::ATOM) {
DCHECK(!node->atom().empty());
node_string += node->atom();
} else {
// Adding the operation disambiguates AND and OR nodes.
node_string += node->op() == Prefilter::AND ? "AND" : "OR";
node_string += "(";
for (int i = 0; i < node->subs()->size() ; i++) {
if (i > 0)
node_string += ',';
node_string += Itoa((*node->subs())[i]->unique_id());
node_string += ":";
node_string += DebugNodeString((*node->subs())[i]);
}
node_string += ")";
}
return node_string;
}
} // namespace re2

130
re2/re2/prefilter_tree.h Normal file
View File

@ -0,0 +1,130 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added tp PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
//
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
#include "util/util.h"
#include "util/sparse_array.h"
namespace re2 {
typedef SparseArray<int> IntMap;
class Prefilter;
class PrefilterTree {
public:
PrefilterTree();
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// The Compile returns a vector of string in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// and passed to RegexpsGivenStrings below.
void Compile(vector<string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
IntMap* parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
vector<int> regexps;
};
private:
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(vector<string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node has already been assigned unique ids.
string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo();
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
vector<Entry> entries_;
// Map node string to canonical Prefilter node.
map<string, Prefilter*> node_map_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
vector<int> unfiltered_;
// vector of Prefilter for all regexps.
vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
};
} // namespace
#endif // RE2_PREFILTER_TREE_H_

341
re2/re2/prog.cc Normal file
View File

@ -0,0 +1,341 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstAlt);
out1_ = out1;
}
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstByteRange);
lo_ = lo & 0xFF;
hi_ = hi & 0xFF;
foldcase_ = foldcase;
}
void Prog::Inst::InitCapture(int cap, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstCapture);
cap_ = cap;
}
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstEmptyWidth);
empty_ = empty;
}
void Prog::Inst::InitMatch(int32 id) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstMatch);
match_id_ = id;
}
void Prog::Inst::InitNop(uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstNop);
}
void Prog::Inst::InitFail() {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstFail);
}
string Prog::Inst::Dump() {
switch (opcode()) {
default:
return StringPrintf("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
return StringPrintf("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
return StringPrintf("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] -> %d",
foldcase_ ? "/i" : "",
lo_, hi_, out());
case kInstCapture:
return StringPrintf("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
return StringPrintf("emptywidth %#x -> %d",
static_cast<int>(empty_), out());
case kInstMatch:
return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
case kInstFail:
return StringPrintf("fail");
}
}
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
byte_inst_count_(0),
bytemap_range_(0),
flags_(0),
onepass_statesize_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
delete_dfa_(NULL),
unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
}
Prog::~Prog() {
if (delete_dfa_) {
if (dfa_first_)
delete_dfa_(dfa_first_);
if (dfa_longest_)
delete_dfa_(dfa_longest_);
}
delete[] onepass_nodes_;
delete[] inst_;
delete[] unbytemap_;
}
typedef SparseSet Workq;
static inline void AddToQueue(Workq* q, int id) {
if (id != 0)
q->insert(id);
}
static string ProgToString(Prog* prog, Workq* q) {
string s;
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
AddToQueue(q, ip->out());
if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
AddToQueue(q, ip->out1());
}
return s;
}
string Prog::Dump() {
string map;
if (false) { // Debugging
int lo = 0;
StringAppendF(&map, "byte map:\n");
for (int i = 0; i < bytemap_range_; i++) {
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
lo = unbytemap_[i] + 1;
}
StringAppendF(&map, "\n");
}
Workq q(size_);
AddToQueue(&q, start_);
return map + ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
for (;;) {
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
return false;
case kInstAlt:
case kInstAltMatch:
case kInstByteRange:
case kInstFail:
case kInstEmptyWidth:
return false;
case kInstCapture:
case kInstNop:
ip = prog->inst(ip->out());
break;
case kInstMatch:
return true;
}
}
}
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
// ^ and \A
if (p == text.begin())
flags |= kEmptyBeginText | kEmptyBeginLine;
else if (p[-1] == '\n')
flags |= kEmptyBeginLine;
// $ and \z
if (p == text.end())
flags |= kEmptyEndText | kEmptyEndLine;
else if (p < text.end() && p[0] == '\n')
flags |= kEmptyEndLine;
// \b and \B
if (p == text.begin() && p == text.end()) {
// no word boundary here
} else if (p == text.begin()) {
if (IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
} else if (p == text.end()) {
if (IsWordChar(p[-1]))
flags |= kEmptyWordBoundary;
} else {
if (IsWordChar(p[-1]) != IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
}
if (!(flags & kEmptyWordBoundary))
flags |= kEmptyNonWordBoundary;
return flags;
}
void Prog::MarkByteRange(int lo, int hi) {
CHECK_GE(lo, 0);
CHECK_GE(hi, 0);
CHECK_LE(lo, 255);
CHECK_LE(hi, 255);
if (lo > 0)
byterange_.Set(lo - 1);
byterange_.Set(hi);
}
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for prog_.
// Ranges of bytes that are treated as indistinguishable
// by the regexp program are mapped to a single byte class.
// The vector prog_->byterange() marks the end of each
// such range.
const Bitmap<256>& v = byterange();
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
if ((i&31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
}
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
for (int i = 0; i < 256; i++)
unbytemap_[bytemap_[i]] = i;
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
bytemap_[i] = i;
unbytemap_[i] = i;
}
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
}
} // namespace re2

376
re2/re2/prog.h Normal file
View File

@ -0,0 +1,376 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#ifndef RE2_PROG_H__
#define RE2_PROG_H__
#include "util/util.h"
#include "re2/re2.h"
namespace re2 {
// Simple fixed-size bitmap.
template<int Bits>
class Bitmap {
public:
Bitmap() { Reset(); }
int Size() { return Bits; }
void Reset() {
for (int i = 0; i < Words; i++)
w_[i] = 0;
}
bool Get(int k) const {
return w_[k >> WordLog] & (1<<(k & 31));
}
void Set(int k) {
w_[k >> WordLog] |= 1<<(k & 31);
}
void Clear(int k) {
w_[k >> WordLog] &= ~(1<<(k & 31));
}
uint32 Word(int i) const {
return w_[i];
}
private:
static const int WordLog = 5;
static const int Words = (Bits+31)/32;
uint32 w_[Words];
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
};
// Opcodes for Inst
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
kInstCapture, // capturing parenthesis number cap_
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
};
// Bit flags for empty-width specials
enum EmptyOp {
kEmptyBeginLine = 1<<0, // ^ - beginning of line
kEmptyEndLine = 1<<1, // $ - end of line
kEmptyBeginText = 1<<2, // \A - beginning of text
kEmptyEndText = 1<<3, // \z - end of text
kEmptyWordBoundary = 1<<4, // \b - word boundary
kEmptyNonWordBoundary = 1<<5, // \B - not \b
kEmptyAllFlags = (1<<6)-1,
};
class Regexp;
class DFA;
struct OneState;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
class Inst {
public:
Inst() : out_opcode_(0), out1_(0) { }
// Constructors per opcode
void InitAlt(uint32 out, uint32 out1);
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
void InitCapture(int cap, uint32 out);
void InitEmptyWidth(EmptyOp empty, uint32 out);
void InitMatch(int id);
void InitNop(uint32 out);
void InitFail();
// Getters
int id(Prog* p) { return this - p->inst_; }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int out() { return out_opcode_>>3; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
bool greedy(Prog *p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange;
}
// Does this inst (an kInstByteRange) match c?
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase_ && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}
// Returns string representation for debugging.
string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_, and PatchList steals another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<3) | opcode;
}
void set_out(int out) {
out_opcode_ = (out<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<3) | opcode;
}
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
union { // additional instruction arguments:
uint32 out1_; // opcode == kInstAlt
// alternate next instruction
int32 cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32 match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8 lo_; // byte range is lo_-hi_ inclusive
uint8 hi_; //
uint8 foldcase_; // convert A-Z to a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
DISALLOW_EVIL_CONSTRUCTORS(Inst);
};
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
Inst *inst(int id) { return &inst_[id]; }
int start() { return start_; }
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int64 size() { return size_; }
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
int64 byte_inst_count() { return byte_inst_count_; }
const Bitmap<256>& byterange() { return byterange_; }
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8* bytemap() { return bytemap_; }
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
// Record that at some point in the prog, the bytes in the range
// lo-hi (inclusive) are treated as different from bytes outside the range.
// Tracking this lets the DFA collapse commonly-treated byte ranges
// when recording state pointers, greatly reducing its memory footprint.
void MarkByteRange(int lo, int hi);
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32 EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
// This is not right for Unicode, but:
// - it's hard to get right in a byte-at-a-time matching world
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8 c) {
return ('A' <= c && c <= 'Z') ||
('a' <= c && c <= 'z') ||
('0' <= c && c <= '9') ||
c == '_';
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match0, bool* failed,
vector<int>* matches);
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work. This function is useful only
// for testing purposes. Returns number of states.
int BuildEntireDFA(MatchKind kind);
// Compute byte map.
void ComputeByteMap();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the program size and the text size.
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int byte_inst_count_; // number of kInstByteRange instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int flags_; // regexp parse flags
int onepass_statesize_; // byte size of each OneState* node
Inst* inst_; // pointer to instruction array
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
void (*delete_dfa_)(DFA* dfa);
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
// commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
uint8* onepass_nodes_; // data for OnePass nodes
OneState* onepass_start_; // start node for OnePass program
DISALLOW_EVIL_CONSTRUCTORS(Prog);
};
} // namespace re2
#endif // RE2_PROG_H__

1182
re2/re2/re2.cc Normal file

File diff suppressed because it is too large Load Diff

837
re2/re2/re2.h Normal file
View File

@ -0,0 +1,837 @@
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H
#define RE2_RE2_H
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
// \d, \w, \s, ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the re2 library and hence supports
// its syntax for regular expressions, which is similar to Perl's with
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See http://code.google.com/p/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(RE2::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!RE2::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, the pattern and input text are interpreted as UTF-8.
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
//
// Example:
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
// than PCRE), as do matches without substring extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(RE2::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PRE-COMPILED REGULAR EXPRESSIONS
//
// RE2 makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "RE2"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// RE2 pattern("h.*o");
// while (ReadLine(&str)) {
// if (RE2::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// RE2::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// USING VARIABLE NUMBER OF ARGUMENTS
//
// The above operations require you to know the number of arguments
// when you write the code. This is not always possible or easy (for
// example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments are determined at run time.
//
// Example:
// const RE2::Arg* args[10];
// int n;
// // ... populate args with pointers to RE2::Arg values ...
// // ... set n to the number of RE2::Arg objects ...
// bool match = RE2::FullMatchN(input, pattern, args, n);
//
// The last statement is equivalent to
//
// bool match = RE2::FullMatch(input, pattern,
// *args[0], *args[1], ..., *args[n - 1]);
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
// will leave 64 in a, b, c, and d.
#include <stdint.h>
#include <map>
#include <string>
#include "re2/stringpiece.h"
#include "re2/variadic_function.h"
namespace re2 {
using std::string;
using std::map;
class Mutex;
class Prog;
class Regexp;
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
class RE2 {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
class Options;
// Defined in set.h.
class Set;
enum ErrorCode {
NoError = 0,
// Unexpected error
ErrorInternal,
// Parse errors
ErrorBadEscape, // bad escape sequence
ErrorBadCharClass, // bad character class
ErrorBadCharRange, // bad character class range
ErrorMissingBracket, // missing closing ]
ErrorMissingParen, // missing closing )
ErrorTrailingBackslash, // trailing \ at end of regexp
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
ErrorRepeatSize, // bad repetition argument
ErrorRepeatOp, // bad repetition operator
ErrorBadPerlOp, // bad perl operator
ErrorBadUTF8, // invalid UTF-8 in regexp
ErrorBadNamedCapture, // bad named capture group
ErrorPatternTooLarge, // pattern too large (compile failed)
};
// Predefined common options.
// If you need more complicated things, instantiate
// an Option class, change the settings, and pass it to the
// RE2 constructor.
static const Options DefaultOptions;
static const Options Latin1; // treat input as Latin-1 (default UTF-8)
static const Options POSIX; // POSIX syntax, leftmost-longest match
static const Options Quiet; // do not log about regexp parse errors
// Need to have the const char* and const string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
// Otherwise the StringPiece form would be sufficient.
#ifndef SWIG
RE2(const char* pattern);
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& option);
~RE2();
// Returns whether RE2 was created properly.
bool ok() const { return error_code() == NoError; }
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const { return pattern_; }
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// If RE2 could not be created properly, returns an error code.
// Else returns RE2::NoError (== 0).
ErrorCode error_code() const { return error_code_; }
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
const string& error_arg() const { return error_arg_; }
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
re2::Regexp* Regexp() const { return entire_regexp_; }
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(RE2::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
//
// string s = "yabba dabba doo";
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
// Replacements are not subject to re-matching.
//
// Because GlobalReplace only replaces non-overlapping matches,
// replacing "ana" within "banana" makes only one replacement, not two.
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
static bool Extract(const StringPiece &text,
const RE2& pattern,
const StringPiece &rewrite,
string *out);
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
// Generic matching interface
// Type of match.
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH, // Anchor at start and end
};
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const map<string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const map<int, string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
// and stopping the search at offset endpos.
// Returns true if match found, false if not.
// On a successful match, fills in match[] (up to nmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
//
// Don't ask for more match information than you will use:
// runs much faster with nmatch == 1 than nmatch > 1, and
// runs even faster if nmatch == 0.
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, match[i] == NULL.
bool Match(const StringPiece& text,
int startpos,
int endpos,
Anchor anchor,
StringPiece *match,
int nmatch) const;
// Check that the given rewrite string is suitable for use with this
// regular expression. It checks that:
// * The regular expression has enough parenthesized subexpressions
// to satisfy all of the \N tokens in rewrite
// * The rewrite string doesn't have any syntax errors. E.g.,
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Constructor options
class Options {
public:
// The options are (defaults in parentheses):
//
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
// posix_syntax (false) restrict regexps to POSIX egrep syntax
// longest_match (false) search for longest match, not first match
// log_errors (true) log syntax and execution errors to ERROR
// max_mem (see below) approx. max memory footprint of RE2
// literal (false) interpret string as literal, not regexp
// never_nl (false) never match \n, even if it is in regexp
// case_sensitive (true) match is case-sensitive (regexp can override
// with (?i) unless in posix_syntax mode)
//
// The following options are only consulted when posix_syntax == true.
// (When posix_syntax == false these features are always enabled and
// cannot be turned off.)
// perl_classes (false) allow Perl's \d \s \w \D \S \W
// word_boundary (false) allow Perl's \b \B (word boundary and not)
// one_line (false) ^ and $ only match beginning and end of text
//
// The max_mem option controls how much memory can be used
// to hold the compiled form of the regexp (the Prog) and
// its cached DFA graphs. Code Search placed limits on the number
// of Prog instructions and DFA states: 10,000 for both.
// In RE2, those limits would translate to about 240 KB per Prog
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
// better job of keeping them small than Code Search did).
// Each RE2 has two Progs (one forward, one reverse), and each Prog
// can have two DFAs (one first match, one longest match).
// That makes 4 DFAs:
//
// forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
// if opt.longest_match() == false
// forward, longest-match - used for all ANCHOR_BOTH searches,
// and the other two kinds if
// opt.longest_match() == true
// reverse, first-match - never used
// reverse, longest-match - used as second phase for unanchored searches
//
// The RE2 memory budget is statically divided between the two
// Progs and then the DFAs: two thirds to the forward Prog
// and one third to the reverse Prog. The forward Prog gives half
// of what it has left over to each of its DFAs. The reverse Prog
// gives it all to its longest-match DFA.
//
// Once a DFA fills its budget, it flushes its cache and starts over.
// If this happens too often, RE2 falls back on the NFA implementation.
// For now, make the default budget something close to Code Search.
static const int kDefaultMaxMem = 8<<20;
enum Encoding {
EncodingUTF8 = 1,
EncodingLatin1
};
Options() :
encoding_(EncodingUTF8),
posix_syntax_(false),
longest_match_(false),
log_errors_(true),
max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
case_sensitive_(true),
perl_classes_(false),
word_boundary_(false),
one_line_(false) {
}
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
// Legacy interface to encoding.
// TODO(rsc): Remove once clients have been converted.
bool utf8() const { return encoding_ == EncodingUTF8; }
void set_utf8(bool b) {
if (b) {
encoding_ = EncodingUTF8;
} else {
encoding_ = EncodingLatin1;
}
}
bool posix_syntax() const { return posix_syntax_; }
void set_posix_syntax(bool b) { posix_syntax_ = b; }
bool longest_match() const { return longest_match_; }
void set_longest_match(bool b) { longest_match_ = b; }
bool log_errors() const { return log_errors_; }
void set_log_errors(bool b) { log_errors_ = b; }
int max_mem() const { return max_mem_; }
void set_max_mem(int m) { max_mem_ = m; }
bool literal() const { return literal_; }
void set_literal(bool b) { literal_ = b; }
bool never_nl() const { return never_nl_; }
void set_never_nl(bool b) { never_nl_ = b; }
bool case_sensitive() const { return case_sensitive_; }
void set_case_sensitive(bool b) { case_sensitive_ = b; }
bool perl_classes() const { return perl_classes_; }
void set_perl_classes(bool b) { perl_classes_ = b; }
bool word_boundary() const { return word_boundary_; }
void set_word_boundary(bool b) { word_boundary_ = b; }
bool one_line() const { return one_line_; }
void set_one_line(bool b) { one_line_ = b; }
void Copy(const Options& src) {
encoding_ = src.encoding_;
posix_syntax_ = src.posix_syntax_;
longest_match_ = src.longest_match_;
log_errors_ = src.log_errors_;
max_mem_ = src.max_mem_;
literal_ = src.literal_;
never_nl_ = src.never_nl_;
case_sensitive_ = src.case_sensitive_;
perl_classes_ = src.perl_classes_;
word_boundary_ = src.word_boundary_;
one_line_ = src.one_line_;
}
int ParseFlags() const;
private:
// Private constructor for defining constants like RE2::Latin1.
friend class RE2;
Options(Encoding encoding,
bool posix_syntax,
bool longest_match,
bool log_errors) :
encoding_(encoding),
posix_syntax_(posix_syntax),
longest_match_(longest_match),
log_errors_(log_errors),
max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
case_sensitive_(true),
perl_classes_(false),
word_boundary_(false),
one_line_(false) {
}
Encoding encoding_;
bool posix_syntax_;
bool longest_match_;
bool log_errors_;
int64_t max_mem_;
bool literal_;
bool never_nl_;
bool case_sensitive_;
bool perl_classes_;
bool word_boundary_;
bool one_line_;
//DISALLOW_EVIL_CONSTRUCTORS(Options);
Options(const Options&);
void operator=(const Options&);
};
// Returns the options set in the constructor.
const Options& options() const { return options_; };
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
private:
void Init(const StringPiece& pattern, const Options& options);
bool Rewrite(string *out,
const StringPiece &rewrite,
const StringPiece* vec,
int veclen) const;
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
mutable Mutex* mutex_;
string pattern_; // string regular expression
Options options_; // option flags
string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
mutable re2::Prog* rprog_; // reverse program for regexp
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable const string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable string error_arg_; // Fragment of regexp showing error
mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
// Map from capture indices to names
mutable const map<int, string>* group_names_;
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
RE2(const RE2&);
void operator=(const RE2&);
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
class _RE2_MatchObject {
public:
static inline bool Parse(const char* str, int n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
}
};
class RE2::Arg {
public:
// Empty constructor so we can declare arrays of RE2::Arg
Arg();
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, int n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type,name) \
Arg(type* p) : arg_(p), parser_(name) { } \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
MAKE_PARSER(char, parse_char);
MAKE_PARSER(signed char, parse_char);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
// Generic constructor
template <class T> Arg(T*, Parser parser);
// Generic constructor template
template <class T> Arg(T* p)
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
}
// Parse the data
bool Parse(const char* str, int n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_ ## name(const char* str, int n, void* dest); \
static bool parse_ ## name ## _radix( \
const char* str, int n, void* dest, int radix); \
public: \
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
DECLARE_INTEGER_PARSER(int);
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
#undef DECLARE_INTEGER_PARSER
};
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline bool RE2::Arg::Parse(const char* str, int n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
MAKE_INTEGER_PARSER(short, short);
MAKE_INTEGER_PARSER(unsigned short, ushort);
MAKE_INTEGER_PARSER(int, int);
MAKE_INTEGER_PARSER(unsigned int, uint);
MAKE_INTEGER_PARSER(long, long);
MAKE_INTEGER_PARSER(unsigned long, ulong);
MAKE_INTEGER_PARSER(long long, longlong);
MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
#undef MAKE_INTEGER_PARSER
} // namespace re2
using re2::RE2;
#endif /* RE2_RE2_H */

920
re2/re2/regexp.cc Normal file
View File

@ -0,0 +1,920 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression representation.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(op),
simple_(false),
parse_flags_(static_cast<uint16>(parse_flags)),
ref_(1),
nsub_(0),
down_(NULL) {
subone_ = NULL;
memset(the_union_, 0, sizeof the_union_);
}
// Destructor. Assumes already cleaned up children.
// Private: use Decref() instead of delete to destroy Regexps.
// Can't call Decref on the sub-Regexps here because
// that could cause arbitrarily deep recursion, so
// required Decref() to have handled them for us.
Regexp::~Regexp() {
if (nsub_ > 0)
LOG(DFATAL) << "Regexp not destroyed.";
switch (op_) {
default:
break;
case kRegexpCapture:
delete name_;
break;
case kRegexpLiteralString:
delete[] runes_;
break;
case kRegexpCharClass:
cc_->Delete();
delete ccb_;
break;
}
}
// If it's possible to destroy this regexp without recurring,
// do so and return true. Else return false.
bool Regexp::QuickDestroy() {
if (nsub_ == 0) {
delete this;
return true;
}
return false;
}
static map<Regexp*, int> ref_map;
static Mutex ref_mutex;
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
MutexLock l(&ref_mutex);
return ref_map[this];
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
// Store ref count in overflow map.
MutexLock l(&ref_mutex);
if (ref_ == kMaxRef) { // already overflowed
ref_map[this]++;
return this;
}
// overflowing now
ref_map[this] = kMaxRef;
ref_ = kMaxRef;
return this;
}
ref_++;
return this;
}
// Decrements reference count and deletes this object if count reaches 0.
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
MutexLock l(&ref_mutex);
int r = ref_map[this] - 1;
if (r < kMaxRef) {
ref_ = r;
ref_map.erase(this);
} else {
ref_map[this] = r;
}
return;
}
ref_--;
if (ref_ == 0)
Destroy();
}
// Deletes this object; ref count has count reached 0.
void Regexp::Destroy() {
if (QuickDestroy())
return;
// Handle recursive Destroy with explicit stack
// to avoid arbitrarily deep recursion on process stack [sigh].
down_ = NULL;
Regexp* stack = this;
while (stack != NULL) {
Regexp* re = stack;
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
if (re->nsub_ > 0) {
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
if (sub == NULL)
continue;
if (sub->ref_ == kMaxRef)
sub->Decref();
else
--sub->ref_;
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
if (re->nsub_ > 1)
delete[] subs;
re->nsub_ = 0;
}
delete re;
}
}
void Regexp::AddRuneToString(Rune r) {
DCHECK(op_ == kRegexpLiteralString);
if (nrunes_ == 0) {
// start with 8
runes_ = new Rune[8];
} else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
// double on powers of two
Rune *old = runes_;
runes_ = new Rune[nrunes_ * 2];
for (int i = 0; i < nrunes_; i++)
runes_[i] = old[i];
delete[] old;
}
runes_[nrunes_++] = r;
}
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpHaveMatch, flags);
re->match_id_ = match_id;
return re;
}
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpPlus, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpStar, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpQuest, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
ParseFlags flags, bool can_factor) {
if (nsub == 1)
return sub[0];
Regexp** subcopy = NULL;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
subcopy = new Regexp*[nsub];
memmove(subcopy, sub, nsub * sizeof sub[0]);
sub = subcopy;
nsub = FactorAlternation(sub, nsub, flags);
if (nsub == 1) {
Regexp* re = sub[0];
delete[] subcopy;
return re;
}
}
if (nsub > kMaxNsub) {
// Too many subexpressions to fit in a single Regexp.
// Make a two-level tree. Two levels gets us to 65535^2.
int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
Regexp* re = new Regexp(op, flags);
re->AllocSub(nbigsub);
Regexp** subs = re->sub();
for (int i = 0; i < nbigsub - 1; i++)
subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
nsub - (nbigsub-1)*kMaxNsub, flags,
false);
delete[] subcopy;
return re;
}
Regexp* re = new Regexp(op, flags);
re->AllocSub(nsub);
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
delete[] subcopy;
return re;
}
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
}
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
Regexp* re = new Regexp(kRegexpCapture, flags);
re->AllocSub(1);
re->sub()[0] = sub;
re->cap_ = cap;
return re;
}
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
Regexp* re = new Regexp(kRegexpRepeat, flags);
re->AllocSub(1);
re->sub()[0] = sub;
re->min_ = min;
re->max_ = max;
return re;
}
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpLiteral, flags);
re->rune_ = rune;
return re;
}
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
if (nrunes <= 0)
return new Regexp(kRegexpEmptyMatch, flags);
if (nrunes == 1)
return NewLiteral(runes[0], flags);
Regexp* re = new Regexp(kRegexpLiteralString, flags);
for (int i = 0; i < nrunes; i++)
re->AddRuneToString(runes[i]);
return re;
}
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpCharClass, flags);
re->cc_ = cc;
return re;
}
// Swaps this and that in place.
void Regexp::Swap(Regexp* that) {
// Can use memmove because Regexp is just a struct (no vtable).
char tmp[sizeof *this];
memmove(tmp, this, sizeof tmp);
memmove(this, that, sizeof tmp);
memmove(that, tmp, sizeof tmp);
}
// Tests equality of all top-level structure but not subregexps.
static bool TopEqual(Regexp* a, Regexp* b) {
if (a->op() != b->op())
return false;
switch (a->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpBeginText:
return true;
case kRegexpEndText:
// The parse flags remember whether it's \z or (?-m:$),
// which matters when testing against PCRE.
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
case kRegexpLiteral:
return a->rune() == b->rune() &&
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
case kRegexpLiteralString:
return a->nrunes() == b->nrunes() &&
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
memcmp(a->runes(), b->runes(),
a->nrunes() * sizeof a->runes()[0]) == 0;
case kRegexpAlternate:
case kRegexpConcat:
return a->nsub() == b->nsub();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
case kRegexpRepeat:
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
a->min() == b->min() &&
a->max() == b->max();
case kRegexpCapture:
return a->cap() == b->cap() && a->name() == b->name();
case kRegexpHaveMatch:
return a->match_id() == b->match_id();
case kRegexpCharClass: {
CharClass* acc = a->cc();
CharClass* bcc = b->cc();
return acc->size() == bcc->size() &&
acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
memcmp(acc->begin(), bcc->begin(),
(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
}
}
LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
return 0;
}
bool Regexp::Equal(Regexp* a, Regexp* b) {
if (a == NULL || b == NULL)
return a == b;
if (!TopEqual(a, b))
return false;
// Fast path:
// return without allocating vector if there are no subregexps.
switch (a->op()) {
case kRegexpAlternate:
case kRegexpConcat:
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
break;
default:
return true;
}
// Committed to doing real work.
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
Regexp* a2;
Regexp* b2;
switch (a->op()) {
default:
break;
case kRegexpAlternate:
case kRegexpConcat:
for (int i = 0; i < a->nsub(); i++) {
a2 = a->sub()[i];
b2 = b->sub()[i];
if (!TopEqual(a2, b2))
return false;
stk.push_back(a2);
stk.push_back(b2);
}
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
a2 = a->sub()[0];
b2 = b->sub()[0];
if (!TopEqual(a2, b2))
return false;
// Really:
// stk.push_back(a2);
// stk.push_back(b2);
// break;
// but faster to assign directly and loop.
a = a2;
b = b2;
continue;
}
int n = stk.size();
if (n == 0)
break;
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
}
return true;
}
// Keep in sync with enum RegexpStatusCode in regexp.h
static const string kErrorStrings[] = {
"no error",
"unexpected error",
"invalid escape sequence",
"invalid character class",
"invalid character class range",
"missing ]",
"missing )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
"bad repetition operator",
"invalid perl operator",
"invalid UTF-8",
"invalid named capture group",
};
const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
if (code < 0 || code >= arraysize(kErrorStrings))
code = kRegexpInternalError;
return kErrorStrings[code];
}
string RegexpStatus::Text() const {
if (error_arg_.empty())
return CodeText(code_);
string s;
s.append(CodeText(code_));
s.append(": ");
s.append(error_arg_.data(), error_arg_.size());
return s;
}
void RegexpStatus::Copy(const RegexpStatus& status) {
code_ = status.code_;
error_arg_ = status.error_arg_;
}
typedef int Ignored; // Walker<void> doesn't exist
// Walker subclass to count capturing parens in regexp.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
public:
NumCapturesWalker() : ncapture_(0) {}
int ncapture() { return ncapture_; }
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture)
ncapture_++;
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
return ignored;
}
private:
int ncapture_;
DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
};
int Regexp::NumCaptures() {
NumCapturesWalker w;
w.Walk(this, 0);
return w.ncapture();
}
// Walker class to build map of named capture groups and their indices.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
public:
NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker() { delete map_; }
map<string, int>* TakeMap() {
map<string, int>* m = map_;
map_ = NULL;
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new map<string, int>;
// Record first occurrence of each name.
// (The rule is that if you have the same name
// multiple times, only the leftmost one counts.)
if (map_->find(*re->name()) == map_->end())
(*map_)[*re->name()] = re->cap();
}
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
return ignored;
}
private:
map<string, int>* map_;
DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
};
map<string, int>* Regexp::NamedCaptures() {
NamedCapturesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Walker class to build map from capture group indices to their names.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
public:
CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker() { delete map_; }
map<int, string>* TakeMap() {
map<int, string>* m = map_;
map_ = NULL;
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new map<int, string>;
(*map_)[re->cap()] = *re->name();
}
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
return ignored;
}
private:
map<int, string>* map_;
DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
};
map<int, string>* Regexp::CaptureNames() {
CaptureNamesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = re->runes_[j];
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
*p++ = r;
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, re->rune_);
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase);
i++;
// The rest.
if (i < nsub_) {
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
CharClassBuilder::CharClassBuilder() {
nrunes_ = 0;
upper_ = 0;
lower_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
Rune lo1 = max<Rune>(lo, 'A');
Rune hi1 = min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = max<Rune>(lo, 'a');
hi1 = min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
{ // Check whether lo, hi is already in the class.
iterator it = ranges_.find(RuneRange(lo, lo));
if (it != end() && it->lo <= lo && hi <= it->hi)
return false;
}
// Look for a range abutting lo on the left.
// If it exists, take it out and increase our range.
if (lo > 0) {
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
if (it != end()) {
lo = it->lo;
if (it->hi > hi)
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for a range abutting hi on the right.
// If it exists, take it out and increase our range.
if (hi < Runemax) {
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
if (it != end()) {
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for ranges between lo and hi. Take them out.
// This is only safe because the set has no overlapping ranges.
// We've already removed any ranges abutting lo and hi, so
// any that overlap [lo, hi] must be contained within it.
for (;;) {
iterator it = ranges_.find(RuneRange(lo, hi));
if (it == end())
break;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
// Finally, add [lo, hi].
nrunes_ += hi - lo + 1;
ranges_.insert(RuneRange(lo, hi));
return true;
}
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
for (iterator it = cc->begin(); it != cc->end(); ++it)
AddRange(it->lo, it->hi);
}
bool CharClassBuilder::Contains(Rune r) {
return ranges_.find(RuneRange(r, r)) != end();
}
// Does the character class behave the same on A-Z as on a-z?
bool CharClassBuilder::FoldsASCII() {
return ((upper_ ^ lower_) & AlphaMask) == 0;
}
CharClassBuilder* CharClassBuilder::Copy() {
CharClassBuilder* cc = new CharClassBuilder;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_.insert(RuneRange(it->lo, it->hi));
cc->upper_ = upper_;
cc->lower_ = lower_;
cc->nrunes_ = nrunes_;
return cc;
}
void CharClassBuilder::RemoveAbove(Rune r) {
if (r >= Runemax)
return;
if (r < 'z') {
if (r < 'a')
lower_ = 0;
else
lower_ &= AlphaMask >> ('z' - r);
}
if (r < 'Z') {
if (r < 'A')
upper_ = 0;
else
upper_ &= AlphaMask >> ('Z' - r);
}
for (;;) {
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
RuneRange rr = *it;
ranges_.erase(it);
nrunes_ -= rr.hi - rr.lo + 1;
if (rr.lo <= r) {
rr.hi = r;
ranges_.insert(rr);
nrunes_ += rr.hi - rr.lo + 1;
}
}
}
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
// the current class begins at 0.
iterator it = begin();
if (it == end()) {
v.push_back(RuneRange(0, Runemax));
} else {
int nextlo = 0;
if (it->lo == 0) {
nextlo = it->hi + 1;
++it;
}
for (; it != end(); ++it) {
v.push_back(RuneRange(nextlo, it->lo - 1));
nextlo = it->hi + 1;
}
if (nextlo <= Runemax)
v.push_back(RuneRange(nextlo, Runemax));
}
ranges_.clear();
for (int i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
lower_ = AlphaMask & ~lower_;
nrunes_ = Runemax+1 - nrunes_;
}
// Character class is a sorted list of ranges.
// The ranges are allocated in the same block as the header,
// necessitating a special allocator and Delete method.
CharClass* CharClass::New(int maxranges) {
CharClass* cc;
uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
cc = reinterpret_cast<CharClass*>(data);
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
cc->nranges_ = 0;
cc->folds_ascii_ = false;
cc->nrunes_ = 0;
return cc;
}
void CharClass::Delete() {
if (this == NULL)
return;
uint8 *data = reinterpret_cast<uint8*>(this);
delete[] data;
}
CharClass* CharClass::Negate() {
CharClass* cc = CharClass::New(nranges_+1);
cc->folds_ascii_ = folds_ascii_;
cc->nrunes_ = Runemax + 1 - nrunes_;
int n = 0;
int nextlo = 0;
for (CharClass::iterator it = begin(); it != end(); ++it) {
if (it->lo == nextlo) {
nextlo = it->hi + 1;
} else {
cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
nextlo = it->hi + 1;
}
}
if (nextlo <= Runemax)
cc->ranges_[n++] = RuneRange(nextlo, Runemax);
cc->nranges_ = n;
return cc;
}
bool CharClass::Contains(Rune r) {
RuneRange* rr = ranges_;
int n = nranges_;
while (n > 0) {
int m = n/2;
if (rr[m].hi < r) {
rr += m+1;
n -= m+1;
} else if (r < rr[m].lo) {
n = m;
} else { // rr[m].lo <= r && r <= rr[m].hi
return true;
}
}
return false;
}
CharClass* CharClassBuilder::GetCharClass() {
CharClass* cc = CharClass::New(ranges_.size());
int n = 0;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_[n++] = *it;
cc->nranges_ = n;
DCHECK_LE(n, ranges_.size());
cc->nrunes_ = nrunes_;
cc->folds_ascii_ = FoldsASCII();
return cc;
}
} // namespace re2

632
re2/re2/regexp.h Normal file
View File

@ -0,0 +1,632 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions is guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#ifndef RE2_REGEXP_H__
#define RE2_REGEXP_H__
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
// Matches no strings.
kRegexpNoMatch = 1,
// Matches empty string.
kRegexpEmptyMatch,
// Matches rune_.
kRegexpLiteral,
// Matches runes_.
kRegexpLiteralString,
// Matches concatenation of sub_[0..nsub-1].
kRegexpConcat,
// Matches union of sub_[0..nsub-1].
kRegexpAlternate,
// Matches sub_[0] zero or more times.
kRegexpStar,
// Matches sub_[0] one or more times.
kRegexpPlus,
// Matches sub_[0] zero or one times.
kRegexpQuest,
// Matches sub_[0] at least min_ times, at most max_ times.
// max_ == -1 means no upper limit.
kRegexpRepeat,
// Parenthesized (capturing) subexpression. Index is cap_.
// Optionally, capturing name is name_.
kRegexpCapture,
// Matches any character.
kRegexpAnyChar,
// Matches any byte [sic].
kRegexpAnyByte,
// Matches empty string at beginning of line.
kRegexpBeginLine,
// Matches empty string at end of line.
kRegexpEndLine,
// Matches word boundary "\b".
kRegexpWordBoundary,
// Matches not-a-word boundary "\B".
kRegexpNoWordBoundary,
// Matches empty string at beginning of text.
kRegexpBeginText,
// Matches empty string at end of text.
kRegexpEndText,
// Matches character class given by cc_.
kRegexpCharClass,
// Forces match of entire expression right now,
// with match ID match_id_ (used by RE2::Set).
kRegexpHaveMatch,
kMaxRegexpOp = kRegexpHaveMatch,
};
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
// No error
kRegexpSuccess = 0,
// Unexpected error
kRegexpInternalError,
// Parse errors
kRegexpBadEscape, // bad escape sequence
kRegexpBadCharClass, // bad character class
kRegexpBadCharRange, // bad character class range
kRegexpMissingBracket, // missing closing ]
kRegexpMissingParen, // missing closing )
kRegexpTrailingBackslash, // at end of regexp
kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
kRegexpRepeatSize, // bad repetition argument
kRegexpRepeatOp, // bad repetition operator
kRegexpBadPerlOp, // bad perl operator
kRegexpBadUTF8, // invalid UTF-8 in regexp
kRegexpBadNamedCapture, // bad named capture
};
// Error status for certain operations.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
void set_code(enum RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
enum RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static const string& CodeText(enum RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
enum RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
};
// Walker to implement Simplify.
class SimplifyWalker;
// Compiled form; see prog.h
class Prog;
struct RuneRange {
RuneRange() : lo(0), hi(0) { }
RuneRange(int l, int h) : lo(l), hi(h) { }
Rune lo;
Rune hi;
};
// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
struct RuneRangeLess {
bool operator()(const RuneRange& a, const RuneRange& b) const {
return a.hi < b.lo;
}
};
class CharClassBuilder;
class CharClass {
public:
void Delete();
typedef RuneRange* iterator;
iterator begin() { return ranges_; }
iterator end() { return ranges_ + nranges_; }
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
bool FoldsASCII() { return folds_ascii_; }
bool Contains(Rune r);
CharClass* Negate();
private:
CharClass(); // not implemented
~CharClass(); // not implemented
static CharClass* New(int maxranges);
friend class CharClassBuilder;
bool folds_ascii_;
int nrunes_;
RuneRange *ranges_;
int nranges_;
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
};
class Regexp {
public:
// Flags for parsing. Can be ORed together.
enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
// and [[:space:]] to match newline.
DotNL = 1<<3, // Allow . to match newline.
MatchNL = ClassNL | DotNL,
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
// end of text, not around embedded newlines.
// (Perl's default)
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
PerlClasses = 1<<7, // Allow Perl character classes like \d.
PerlB = 1<<8, // Allow Perl's \b and \B.
PerlX = 1<<9, // Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
// As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
// Internal use only.
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
};
// Get. No set, Regexps are logically immutable once created.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_; }
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
Regexp** sub() {
if(nsub_ <= 1)
return &subone_;
else
return submany_;
}
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
// Does not edit the current regexp.
// Caller must release return value with re->Decref().
// Simplified means that counted repetition has been rewritten
// into simpler terms and all Perl/POSIX features have been
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
int NumCaptures();
friend class NumCapturesWalker;
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
map<string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
map<int, string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
string ToString();
// Convenience functions. They consume the passed reference,
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
// They do not consume allocated arrays like subs or runes.
static Regexp* Plus(Regexp* sub, ParseFlags flags);
static Regexp* Star(Regexp* sub, ParseFlags flags);
static Regexp* Quest(Regexp* sub, ParseFlags flags);
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
static Regexp* HaveMatch(int match_id, ParseFlags flags);
// Like Alternate but does not factor out common prefixes.
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
string Dump();
// Helper traversal class, defined fully in walker-inl.h.
template<typename T> class Walker;
// Compile to Prog. See prog.h
// Reverse prog expects to be run over text backward.
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64 max_mem);
Prog* CompileToReverseProg(int64 max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
// obscure cases behave differently. Technically this is more a property
// of the Prog than the Regexp, but the computation is much easier to do
// on the Regexp. See mimics_pcre.cc for the exact conditions.
bool MimicsPCRE();
// Benchmarking function.
void NullWalk();
// Whether every match of this regexp must be anchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
// Use Decref() instead of delete to release Regexps.
// This is private to catch deletes at compile time.
~Regexp();
void Destroy();
bool QuickDestroy();
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
ParseFlags flags, bool can_factor);
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
static void RemoveLeadingString(Regexp* re, int n);
// Returns the leading regexp in re's top-level concatenation.
// The returned Regexp* points at re or a sub-expression of re,
// so it must not be used after the caller calls re->Decref().
static Regexp* LeadingRegexp(Regexp* re);
// Removes LeadingRegexp(re) from re and returns the remainder.
// Might edit re in place.
static Regexp* RemoveLeadingRegexp(Regexp* re);
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
static int FactorAlternationRecursive(Regexp** sub, int nsub,
ParseFlags flags, int maxdepth);
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
// take a long time. Do not call on such regexps, hence private.
static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
void AllocSub(int n) {
if (n < 0 || static_cast<uint16>(n) != n)
LOG(FATAL) << "Cannot AllocSub " << n;
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = n;
}
// Add Rune to LiteralString
void AddRuneToString(Rune r);
// Swaps this with that, in place.
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8 instead of RegexpOp to control space usage.
uint8 op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8 instead of bool to control space usage.
uint8 simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16 instead of ParseFlags to control space usage.
uint16 parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16 to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (100),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16 ref_;
static const uint16 kMaxRef = 0xffff;
// Subexpressions.
// uint16 to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16 nsub_;
static const uint16 kMaxNsub = 0xffff;
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
};
// Extra space for parse and teardown stacks.
Regexp* down_;
// Arguments to operator. See description of operators above.
union {
struct { // Repeat
int max_;
int min_;
};
struct { // Capture
int cap_;
string* name_;
};
struct { // LiteralString
int nrunes_;
Rune* runes_;
};
struct { // CharClass
// These two could be in separate union members,
// but it wouldn't save any space (there are other two-word structs)
// and keeping them separate avoids confusion during parsing.
CharClass* cc_;
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
class CharClassBuilder {
public:
CharClassBuilder();
typedef RuneRangeSet::iterator iterator;
iterator begin() { return ranges_.begin(); }
iterator end() { return ranges_.end(); }
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
bool Contains(Rune r);
bool FoldsASCII();
bool AddRange(Rune lo, Rune hi); // returns whether class changed
CharClassBuilder* Copy();
void AddCharClass(CharClassBuilder* cc);
void Negate();
void RemoveAbove(Rune r);
CharClass* GetCharClass();
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
private:
static const uint32 AlphaMask = (1<<26) - 1;
uint32 upper_; // bitmap of A-Z
uint32 lower_; // bitmap of a-z
int nrunes_;
RuneRangeSet ranges_;
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
};
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
}
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
}
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
}
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
}
} // namespace re2
#endif // RE2_REGEXP_H__

113
re2/re2/set.cc Normal file
View File

@ -0,0 +1,113 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include "util/util.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
using namespace re2;
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
options_.Copy(options);
anchor_ = anchor;
prog_ = NULL;
compiled_ = false;
}
RE2::Set::~Set() {
for (int i = 0; i < re_.size(); i++)
re_[i]->Decref();
delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add after Compile";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = re_.size();
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = re2::Regexp::Concat(sub, nsub + 1, pf);
delete[] sub;
} else {
re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
re_.push_back(re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile multiple times";
return false;
}
compiled_ = true;
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
re_.size(), pf);
re_.clear();
re2::Regexp* sre = re->Simplify();
re->Decref();
re = sre;
if (re == NULL) {
if (options_.log_errors())
LOG(ERROR) << "Error simplifying during Compile.";
return false;
}
prog_ = Prog::CompileSet(options_, anchor_, re);
return prog_ != NULL;
}
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
if (!compiled_) {
LOG(DFATAL) << "RE2::Set::Match without Compile";
return false;
}
v->clear();
bool failed;
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
Prog::kManyMatch, NULL, &failed, v);
if (failed)
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
if (ret == false)
return false;
if (v->size() == 0) {
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
return false;
}
return true;
}

55
re2/re2/set.h Normal file
View File

@ -0,0 +1,55 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H
#define RE2_SET_H
#include <utility>
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
class RE2::Set {
public:
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Add adds regexp pattern to the set, interpreted using the RE2 options.
// (The RE2 constructor's default options parameter is RE2::UTF8.)
// Add returns the regexp index that will be used to identify
// it in the result of Match, or -1 if the regexp cannot be parsed.
// Indices are assigned in sequential order starting from 0.
// Error returns do not increment the index.
// If an error occurs and error != NULL, *error will hold an error message.
int Add(const StringPiece& pattern, string* error);
// Compile prepares the Set for matching.
// Add must not be called again after Compile.
// Compile must be called before FullMatch or PartialMatch.
// Compile may return false if it runs out of memory.
bool Compile();
// Match returns true if text matches any of the regexps in the set.
// If so, it fills v with the indices of the matching regexps.
bool Match(const StringPiece& text, vector<int>* v) const;
private:
RE2::Options options_;
RE2::Anchor anchor_;
vector<re2::Regexp*> re_;
re2::Prog* prog_;
bool compiled_;
//DISALLOW_EVIL_CONSTRUCTORS(Set);
Set(const Set&);
void operator=(const Set&);
};
} // namespace re2
#endif // RE2_SET_H

393
re2/re2/simplify.cc Normal file
View File

@ -0,0 +1,393 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple_)
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple_;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple_)
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
virtual Regexp* PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside SimplifyWalker so that
// they can edit the private fields of the Regexps they construct.
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Caller must Decref return value when done with it.
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags parse_flags);
// Simplifies a character class by expanding any named classes
// into rune ranges. Does not edit re. Does not consume ref to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
};
// Simplifies a regular expression, returning a new regexp.
// The new regexp uses traditional Unix egrep features only,
// plus the Perl (?:) non-capturing parentheses.
// Otherwise, no POSIX or Perl additions. The new regexp
// captures exactly the same subexpressions (with the same indices)
// as the original.
// Does not edit current object.
// Caller must Decref() return value when done with it.
Regexp* Regexp::Simplify() {
if (simple_)
return Incref();
SimplifyWalker w;
return w.Walk(this, NULL);
}
#define Simplify DontCallSimplify // Avoid accidental recursion
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
return re->Incref();
}
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
if (re->simple_) {
*stop = true;
return re->Incref();
}
return NULL;
}
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// Two passes to avoid allocation in the common case.
bool changed = false;
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
Regexp* newsub = child_args[i];
if (newsub != sub) {
changed = true;
break;
}
}
if (!changed) {
for (int i = 0; i < re->nsub_; i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub_);
Regexp** nre_subs = nre->sub();
for (int i = 0; i <re->nsub_; i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap_;
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Returns a new Regexp, handing the ref to the caller.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
Regexp::ParseFlags parse_flags) {
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
re->AllocSub(2);
Regexp** subs = re->sub();
subs[0] = re1;
subs[1] = re2;
return re;
}
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
// The result will *not* necessarily have the right capturing parens
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags f) {
// x{n,} means at least n matches of x.
if (max == -1) {
// Special case: x{0,} is x*
if (min == 0)
return Regexp::Star(re->Incref(), f);
// Special case: x{1,} is x+
if (min == 1)
return Regexp::Plus(re->Incref(), f);
// General case: x{4,} is xxxx+
Regexp* nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
VLOG(1) << "Simplify " << min;
Regexp** nre_subs = nre->sub();
for (int i = 0; i < min-1; i++)
nre_subs[i] = re->Incref();
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
return nre;
}
// Special case: (x){0} matches only empty string.
if (min == 0 && max == 0)
return new Regexp(kRegexpEmptyMatch, f);
// Special case: x{1} is just x.
if (min == 1 && max == 1)
return re->Incref();
// General case: x{n,m} means n copies of x and m copies of x?.
// The machine will do less work if we nest the final m copies,
// so that x{2,5} = xx(x(x(x)?)?)?
// Build leading prefix: xx. Capturing only on the last one.
Regexp* nre = NULL;
if (min > 0) {
nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
Regexp** nre_subs = nre->sub();
for (int i = 0; i < min; i++)
nre_subs[i] = re->Incref();
}
// Build and attach suffix: (x(x(x)?)?)?
if (max > min) {
Regexp* suf = Regexp::Quest(re->Incref(), f);
for (int i = min+1; i < max; i++)
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
if (nre == NULL)
nre = suf;
else
nre = Concat2(nre, suf, f);
}
if (nre == NULL) {
// Some degenerate case, like min > max, or min < max < 0.
// This shouldn't happen, because the parser rejects such regexps.
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
return new Regexp(kRegexpNoMatch, f);
}
return nre;
}
// Simplifies a character class.
// Caller must Decref return value when done with it.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
CharClass* cc = re->cc();
// Special cases
if (cc->empty())
return new Regexp(kRegexpNoMatch, re->parse_flags());
if (cc->full())
return new Regexp(kRegexpAnyChar, re->parse_flags());
return re->Incref();
}
} // namespace re2

182
re2/re2/stringpiece.h Normal file
View File

@ -0,0 +1,182 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#ifndef STRINGS_STRINGPIECE_H__
#define STRINGS_STRINGPIECE_H__
#include <string.h>
#include <cstddef>
#include <iosfwd>
#include <string>
namespace re2 {
class StringPiece {
private:
const char* ptr_;
int length_;
public:
// We provide non-explicit singleton constructors so users can pass
// in a "const char*" or a "string" wherever a "StringPiece" is
// expected.
StringPiece() : ptr_(NULL), length_(0) { }
StringPiece(const char* str)
: ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
StringPiece(const std::string& str)
: ptr_(str.data()), length_(static_cast<int>(str.size())) { }
StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
// data() may return a pointer to a buffer with embedded NULs, and the
// returned buffer may or may not be null terminated. Therefore it is
// typically a mistake to pass data() to a routine that expects a NUL
// terminated string.
const char* data() const { return ptr_; }
int size() const { return length_; }
int length() const { return length_; }
bool empty() const { return length_ == 0; }
void clear() { ptr_ = NULL; length_ = 0; }
void set(const char* data, int len) { ptr_ = data; length_ = len; }
void set(const char* str) {
ptr_ = str;
if (str != NULL)
length_ = static_cast<int>(strlen(str));
else
length_ = 0;
}
void set(const void* data, int len) {
ptr_ = reinterpret_cast<const char*>(data);
length_ = len;
}
char operator[](int i) const { return ptr_[i]; }
void remove_prefix(int n) {
ptr_ += n;
length_ -= n;
}
void remove_suffix(int n) {
length_ -= n;
}
int compare(const StringPiece& x) const {
int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
if (r == 0) {
if (length_ < x.length_) r = -1;
else if (length_ > x.length_) r = +1;
}
return r;
}
std::string as_string() const {
return std::string(data(), size());
}
// We also define ToString() here, since many other string-like
// interfaces name the routine that converts to a C++ string
// "ToString", and it's confusing to have the method that does that
// for a StringPiece be called "as_string()". We also leave the
// "as_string()" method defined here for existing code.
std::string ToString() const {
return std::string(data(), size());
}
void CopyToString(std::string* target) const;
void AppendToString(std::string* target) const;
// Does "this" start with "x"
bool starts_with(const StringPiece& x) const {
return ((length_ >= x.length_) &&
(memcmp(ptr_, x.ptr_, x.length_) == 0));
}
// Does "this" end with "x"
bool ends_with(const StringPiece& x) const {
return ((length_ >= x.length_) &&
(memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
}
// standard STL container boilerplate
typedef char value_type;
typedef const char* pointer;
typedef const char& reference;
typedef const char& const_reference;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
static const size_type npos;
typedef const char* const_iterator;
typedef const char* iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
typedef std::reverse_iterator<iterator> reverse_iterator;
iterator begin() const { return ptr_; }
iterator end() const { return ptr_ + length_; }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(ptr_ + length_);
}
const_reverse_iterator rend() const {
return const_reverse_iterator(ptr_);
}
// STLS says return size_type, but Google says return int
int max_size() const { return length_; }
int capacity() const { return length_; }
int copy(char* buf, size_type n, size_type pos = 0) const;
int find(const StringPiece& s, size_type pos = 0) const;
int find(char c, size_type pos = 0) const;
int rfind(const StringPiece& s, size_type pos = npos) const;
int rfind(char c, size_type pos = npos) const;
StringPiece substr(size_type pos, size_type n = npos) const;
static bool _equal(const StringPiece&, const StringPiece&);
};
inline bool operator==(const StringPiece& x, const StringPiece& y) {
return StringPiece::_equal(x, y);
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
inline bool operator<(const StringPiece& x, const StringPiece& y) {
const int r = memcmp(x.data(), y.data(),
std::min(x.size(), y.size()));
return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}
inline bool operator>(const StringPiece& x, const StringPiece& y) {
return y < x;
}
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
return !(x > y);
}
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y);
}
} // namespace re2
// allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
#endif // STRINGS_STRINGPIECE_H__

View File

@ -0,0 +1,254 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
//
// Prog::BadSearchBacktrack is a backtracking regular expression search,
// except that it remembers where it has been, trading a lot of
// memory for a lot of time. It exists only for testing purposes.
//
// Let me repeat that.
//
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
// - It uses a ton of memory.
// - It uses a ton of stack.
// - It uses CHECK and LOG(FATAL).
// - It implements unanchored search by repeated anchored search.
//
// On the other hand, it is very simple and a good reference
// implementation for the more complicated regexp packages.
//
// In BUILD, this file is linked into the ":testing" library,
// not the main library, in order to make it harder to pick up
// accidentally.
#include "util/util.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// Backtracker holds the state for a backtracking search.
//
// Excluding the search parameters, the main search state
// is just the "capture registers", which record, for the
// current execution, the string position at which each
// parenthesis was passed. cap_[0] and cap_[1] are the
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
//
// To avoid infinite loops during backtracking on expressions
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
// pairs that have already been explored and are thus not worth
// re-exploring if we get there via another path. Modern backtracking
// libraries engineer their program representation differently, to make
// such infinite loops possible to avoid without keeping a giant visited_
// bitmap, but visited_ works fine for a reference implementation
// and it has the nice benefit of making the search run in linear time.
class Backtracker {
public:
explicit Backtracker(Prog* prog);
~Backtracker();
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Explores from instruction ip at string position p looking for a match.
// Returns true if found (so that caller can stop trying other possibilities).
bool Visit(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether search must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char* cap_[64]; // capture registers
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
};
Backtracker::Backtracker(Prog* prog)
: prog_(prog),
anchored_(false),
longest_(false),
endmatch_(false),
submatch_(NULL),
nsubmatch_(0),
visited_(NULL),
nvisited_(0) {
}
Backtracker::~Backtracker() {
delete[] visited_;
}
// Runs a backtracking search.
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
text_ = text;
context_ = context;
if (context_.begin() == NULL)
context_ = text;
if (prog_->anchor_start() && text.begin() > context_.begin())
return false;
if (prog_->anchor_end() && text.end() < context_.end())
return false;
anchored_ = anchored | prog_->anchor_start();
longest_ = longest | prog_->anchor_end();
endmatch_ = prog_->anchor_end();
submatch_ = submatch;
nsubmatch_ = nsubmatch;
CHECK(2*nsubmatch_ < arraysize(cap_));
memset(cap_, 0, sizeof cap_);
// We use submatch_[0] for our own bookkeeping,
// so it had better exist.
StringPiece sp0;
if (nsubmatch < 1) {
submatch_ = &sp0;
nsubmatch_ = 1;
}
submatch_[0] = NULL;
// Allocate new visited_ bitmap -- size is proportional
// to text, so have to reallocate on each call to Search.
delete[] visited_;
nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
visited_ = new uint32[nvisited_];
memset(visited_, 0, nvisited_*sizeof visited_[0]);
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.begin();
return Visit(prog_->start(), text.begin());
}
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
for (const char* p = text.begin(); p <= text.end(); p++) {
cap_[0] = p;
if (Visit(prog_->start(), p)) // Match must be leftmost; done.
return true;
}
return false;
}
// Explores from instruction ip at string position p looking for a match.
// Return true if found (so that caller can stop trying other possibilities).
bool Backtracker::Visit(int id, const char* p) {
// Check bitmap. If we've already explored from here,
// either it didn't match or it did but we're hoping for a better match.
// Either way, don't go down that road again.
CHECK(p <= text_.end());
int n = id*(text_.size()+1) + (p - text_.begin());
CHECK_LT(n/32, nvisited_);
if (visited_[n/32] & (1 << (n&31)))
return false;
visited_[n/32] |= 1 << (n&31);
// Pick out byte at current position. If at end of string,
// have to explore in hope of finishing a match. Use impossible byte -1.
int c = -1;
if (p < text_.end())
c = *p & 0xFF;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
return false; // not reached
case kInstAlt:
case kInstAltMatch:
// Try both possible next states: out is preferred to out1.
if (Visit(ip->out(), p)) {
if (longest_)
Visit(ip->out1(), p);
return true;
}
return Visit(ip->out1(), p);
case kInstByteRange:
if (ip->Matches(c))
return Visit(ip->out(), p+1);
return false;
case kInstCapture:
if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
// Capture p to register, but save old value.
const char* q = cap_[ip->cap()];
cap_[ip->cap()] = p;
bool ret = Visit(ip->out(), p);
// Restore old value as we backtrack.
cap_[ip->cap()] = q;
return ret;
}
return Visit(ip->out(), p);
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
return false;
return Visit(ip->out(), p);
case kInstNop:
return Visit(ip->out(), p);
case kInstMatch:
// We found a match. If it's the best so far, record the
// parameters in the caller's submatch_ array.
if (endmatch_ && p != context_.end())
return false;
cap_[1] = p;
if (submatch_[0].data() == NULL || // First match so far ...
(longest_ && p > submatch_[0].end())) { // ... or better match
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
}
return true;
case kInstFail:
return false;
}
}
// Runs a backtracking search.
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
Backtracker b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

View File

@ -0,0 +1,223 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test character class manipulations.
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
struct CCTest {
struct {
Rune lo;
Rune hi;
} add[10];
int remove;
struct {
Rune lo;
Rune hi;
} final[10];
};
static CCTest tests[] = {
{ { { 10, 20 }, {-1} }, -1,
{ { 10, 20 }, {-1} } },
{ { { 10, 20 }, { 20, 30 }, {-1} }, -1,
{ { 10, 30 }, {-1} } },
{ { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
{ { 10, 40 }, {-1} } },
{ { { 0, 50 }, { 20, 30 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
{ { 5, 25 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
{ { 10, 23 }, {-1} } },
// These check boundary cases during negation.
{ { { 0, Runemax }, {-1} }, -1,
{ { 0, Runemax }, {-1} } },
{ { { 0, 50 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 50, Runemax }, {-1} }, -1,
{ { 50, Runemax }, {-1} } },
// Check RemoveAbove.
{ { { 50, Runemax }, {-1} }, 255,
{ { 50, 255 }, {-1} } },
{ { { 50, Runemax }, {-1} }, 65535,
{ { 50, 65535 }, {-1} } },
{ { { 50, Runemax }, {-1} }, Runemax,
{ { 50, Runemax }, {-1} } },
{ { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
{ { 50, 60 }, { 250, 255 }, {-1} } },
{ { { 50, 60 }, {-1} }, 255,
{ { 50, 60 }, {-1} } },
{ { { 350, 360 }, {-1} }, 255,
{ {-1} } },
{ { {-1} }, 255,
{ {-1} } },
};
template<class CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
if (t == NULL) {
printf("\t%s:", desc);
} else {
printf("\n");
printf("CharClass added: [%s]", desc);
for (int k = 0; t->add[k].lo >= 0; k++)
printf(" %d-%d", t->add[k].lo, t->add[k].hi);
printf("\n");
if (t->remove >= 0)
printf("Removed > %d\n", t->remove);
printf("\twant:");
for (int k = 0; t->final[k].lo >= 0; k++)
printf(" %d-%d", t->final[k].lo, t->final[k].hi);
printf("\n");
printf("\thave:");
}
for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
printf(" %d-%d", it->lo, it->hi);
printf("\n");
}
bool ShouldContain(CCTest *t, int x) {
for (int j = 0; t->final[j].lo >= 0; j++)
if (t->final[j].lo <= x && x <= t->final[j].hi)
return true;
return false;
}
// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
CharClass* Negate(CharClass *cc) {
return cc->Negate();
}
void Delete(CharClass* cc) {
cc->Delete();
}
CharClassBuilder* Negate(CharClassBuilder* cc) {
CharClassBuilder* ncc = cc->Copy();
ncc->Negate();
return ncc;
}
void Delete(CharClassBuilder* cc) {
delete cc;
}
template<class CharClass>
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
typename CharClass::iterator it = cc->begin();
int size = 0;
for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
if (it == cc->end() ||
it->lo != t->final[j].lo ||
it->hi != t->final[j].hi) {
Broke(desc, t, cc);
return false;
}
size += it->hi - it->lo + 1;
}
if (it != cc->end()) {
Broke(desc, t, cc);
return false;
}
if (cc->size() != size) {
Broke(desc, t, cc);
printf("wrong size: want %d have %d\n", size, cc->size());
return false;
}
for (int j = 0; j < 101; j++) {
if (j == 100)
j = Runemax;
if (ShouldContain(t, j) != cc->Contains(j)) {
Broke(desc, t, cc);
printf("want contains(%d)=%d, got %d\n",
j, ShouldContain(t, j), cc->Contains(j));
return false;
}
}
CharClass* ncc = Negate(cc);
for (int j = 0; j < 101; j++) {
if (j == 100)
j = Runemax;
if (ShouldContain(t, j) == ncc->Contains(j)) {
Broke(desc, t, cc);
Broke("ncc", NULL, ncc);
printf("want ncc contains(%d)!=%d, got %d\n",
j, ShouldContain(t, j), ncc->Contains(j));
Delete(ncc);
return false;
}
if (ncc->size() != Runemax+1 - cc->size()) {
Broke(desc, t, cc);
Broke("ncc", NULL, ncc);
printf("ncc size should be %d is %d\n",
Runemax+1 - cc->size(), ncc->size());
Delete(ncc);
return false;
}
}
Delete(ncc);
return true;
}
TEST(TestCharClassBuilder, Adds) {
int nfail = 0;
for (int i = 0; i < arraysize(tests); i++) {
CharClassBuilder ccb;
CCTest* t = &tests[i];
for (int j = 0; t->add[j].lo >= 0; j++)
ccb.AddRange(t->add[j].lo, t->add[j].hi);
if (t->remove >= 0)
ccb.RemoveAbove(t->remove);
if (!CorrectCC(&ccb, t, "before copy (CharClassBuilder)"))
nfail++;
CharClass* cc = ccb.GetCharClass();
if (!CorrectCC(cc, t, "before copy (CharClass)"))
nfail++;
cc->Delete();
CharClassBuilder *ccb1 = ccb.Copy();
if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)"))
nfail++;
cc = ccb.GetCharClass();
if (!CorrectCC(cc, t, "after copy (CharClass)"))
nfail++;
cc->Delete();
delete ccb1;
}
EXPECT_EQ(nfail, 0);
}
} // namespace re2

View File

@ -0,0 +1,171 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test prog.cc, compile.cc
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/prog.h"
DEFINE_string(show, "", "regular expression to compile and dump");
namespace re2 {
// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.
struct Test {
const char* regexp;
const char* code;
};
static Test tests[] = {
{ "a",
"1. byte [61-61] -> 2\n"
"2. match! 0\n" },
{ "ab",
"1. byte [61-61] -> 2\n"
"2. byte [62-62] -> 3\n"
"3. match! 0\n" },
{ "a|c",
"3. alt -> 1 | 2\n"
"1. byte [61-61] -> 4\n"
"2. byte [63-63] -> 4\n"
"4. match! 0\n" },
{ "a|b",
"1. byte [61-62] -> 2\n"
"2. match! 0\n" },
{ "[ab]",
"1. byte [61-62] -> 2\n"
"2. match! 0\n" },
{ "a+",
"1. byte [61-61] -> 2\n"
"2. alt -> 1 | 3\n"
"3. match! 0\n" },
{ "a+?",
"1. byte [61-61] -> 2\n"
"2. alt -> 3 | 1\n"
"3. match! 0\n" },
{ "a*",
"2. alt -> 1 | 3\n"
"1. byte [61-61] -> 2\n"
"3. match! 0\n" },
{ "a*?",
"2. alt -> 3 | 1\n"
"3. match! 0\n"
"1. byte [61-61] -> 2\n" },
{ "a?",
"2. alt -> 1 | 3\n"
"1. byte [61-61] -> 3\n"
"3. match! 0\n" },
{ "a??",
"2. alt -> 3 | 1\n"
"3. match! 0\n"
"1. byte [61-61] -> 3\n" },
{ "a{4}",
"1. byte [61-61] -> 2\n"
"2. byte [61-61] -> 3\n"
"3. byte [61-61] -> 4\n"
"4. byte [61-61] -> 5\n"
"5. match! 0\n" },
{ "(a)",
"2. capture 2 -> 1\n"
"1. byte [61-61] -> 3\n"
"3. capture 3 -> 4\n"
"4. match! 0\n" },
{ "(?:a)",
"1. byte [61-61] -> 2\n"
"2. match! 0\n" },
{ "",
"2. match! 0\n" },
{ ".",
"3. alt -> 1 | 2\n"
"1. byte [00-09] -> 4\n"
"2. byte [0b-ff] -> 4\n"
"4. match! 0\n" },
{ "[^ab]",
"5. alt -> 3 | 4\n"
"3. alt -> 1 | 2\n"
"4. byte [63-ff] -> 6\n"
"1. byte [00-09] -> 6\n"
"2. byte [0b-60] -> 6\n"
"6. match! 0\n" },
{ "[Aa]",
"1. byte/i [61-61] -> 2\n"
"2. match! 0\n" },
};
TEST(TestRegexpCompileToProg, Simple) {
int failed = 0;
for (int i = 0; i < arraysize(tests); i++) {
const re2::Test& t = tests[i];
Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
if (re == NULL) {
LOG(ERROR) << "Cannot parse: " << t.regexp;
failed++;
continue;
}
Prog* prog = re->CompileToProg(0);
if (prog == NULL) {
LOG(ERROR) << "Cannot compile: " << t.regexp;
re->Decref();
failed++;
continue;
}
CHECK(re->CompileToProg(1) == NULL);
string s = prog->Dump();
if (s != t.code) {
LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
LOG(ERROR) << "Want:\n" << t.code;
LOG(ERROR) << "Got:\n" << s;
failed++;
}
delete prog;
re->Decref();
}
EXPECT_EQ(failed, 0);
}
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
static struct UTF8ByteRange {
int lo;
int hi;
} utf8ranges[] = {
{ 0x00, 0x09 },
{ 0x0A, 0x0A },
{ 0x10, 0x7F },
{ 0x80, 0x8F },
{ 0x90, 0x9F },
{ 0xA0, 0xBF },
{ 0xC0, 0xC1 },
{ 0xC2, 0xDF },
{ 0xE0, 0xE0 },
{ 0xE1, 0xEF },
{ 0xF0, 0xF0 },
{ 0xF1, 0xF3 },
{ 0xF4, 0xF4 },
{ 0xF5, 0xFF },
};
TEST(TestCompile, ByteRanges) {
Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
EXPECT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
for (int i = 0; i < arraysize(utf8ranges); i++)
for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
delete prog;
re->Decref();
}
} // namespace re2

343
re2/re2/testing/dfa_test.cc Normal file
View File

@ -0,0 +1,343 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "util/thread.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
DECLARE_bool(re2_dfa_bail_when_slow);
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
DEFINE_int32(repeat, 2, "Repetition count.");
DEFINE_int32(threads, 4, "number of threads");
namespace re2 {
// Check that multithreaded access to DFA class works.
// Helper thread: builds entire DFA for prog.
class BuildThread : public Thread {
public:
BuildThread(Prog* prog) : prog_(prog) {}
virtual void Run() {
CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
}
private:
Prog* prog_;
};
TEST(Multithreaded, BuildEntireDFA) {
// Create regexp with 2^FLAGS_size states in DFA.
string s = "a";
for (int i = 0; i < FLAGS_size; i++)
s += "[ab]";
s += "b";
// Check that single-threaded code works.
{
//LOG(INFO) << s;
Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
BuildThread* t = new BuildThread(prog);
t->SetJoinable(true);
t->Start();
t->Join();
delete t;
delete prog;
re->Decref();
}
// Build the DFA simultaneously in a bunch of threads.
for (int i = 0; i < FLAGS_repeat; i++) {
Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
vector<BuildThread*> threads;
for (int j = 0; j < FLAGS_threads; j++) {
BuildThread *t = new BuildThread(prog);
t->SetJoinable(true);
threads.push_back(t);
}
for (int j = 0; j < FLAGS_threads; j++)
threads[j]->Start();
for (int j = 0; j < FLAGS_threads; j++) {
threads[j]->Join();
delete threads[j];
}
// One more compile, to make sure everything is okay.
prog->BuildEntireDFA(Prog::kFirstMatch);
delete prog;
re->Decref();
}
}
// Check that DFA size requirements are followed.
// BuildEntireDFA will, like SearchDFA, stop building out
// the DFA once the memory limits are reached.
TEST(SingleThreaded, BuildEntireDFA) {
// Create regexp with 2^30 states in DFA.
string s = "a";
for (int i = 0; i < 30; i++)
s += "[ab]";
s += "b";
//LOG(INFO) << s;
Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
CHECK(re);
int max = 24;
for (int i = 17; i < max; i++) {
int limit = 1<<i;
int usage, progusage, dfamem;
{
testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
Prog* prog = re->CompileToProg(limit);
CHECK(prog);
progusage = m.HeapGrowth();
dfamem = prog->dfa_mem();
prog->BuildEntireDFA(Prog::kFirstMatch);
prog->BuildEntireDFA(Prog::kLongestMatch);
usage = m.HeapGrowth();
delete prog;
}
if (!UsingMallocCounter)
continue;
//LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
// limit, progusage, dfamem, usage);
CHECK_GT(usage, limit*9/10);
CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay
}
re->Decref();
}
// Generates and returns a string over binary alphabet {0,1} that contains
// all possible binary sequences of length n as subsequences. The obvious
// brute force method would generate a string of length n * 2^n, but this
// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
// Such a string is useful for testing a DFA. If you have a DFA
// where distinct last n bytes implies distinct states, then running on a
// DeBruijn string causes the DFA to need to create a new state at every
// position in the input, never reusing any states until it gets to the
// end of the string. This is the worst possible case for DFA execution.
static string DeBruijnString(int n) {
CHECK_LT(n, 8*sizeof(int));
CHECK_GT(n, 0);
vector<bool> did(1<<n);
for (int i = 0; i < 1<<n; i++)
did[i] = false;
string s;
for (int i = 0; i < n-1; i++)
s.append("0");
int bits = 0;
int mask = (1<<n) - 1;
for (int i = 0; i < (1<<n); i++) {
bits <<= 1;
bits &= mask;
if (!did[bits|1]) {
bits |= 1;
s.append("1");
} else {
s.append("0");
}
CHECK(!did[bits]);
did[bits] = true;
}
return s;
}
// Test that the DFA gets the right result even if it runs
// out of memory during a search. The regular expression
// 0[01]{n}$ matches a binary string of 0s and 1s only if
// the (n+1)th-to-last character is a 0. Matching this in
// a single forward pass (as done by the DFA) requires
// keeping one bit for each of the last n+1 characters
// (whether each was a 0), or 2^(n+1) possible states.
// If we run this regexp to search in a string that contains
// every possible n-character binary string as a substring,
// then it will have to run through at least 2^n states.
// States are big data structures -- certainly more than 1 byte --
// so if the DFA can search correctly while staying within a
// 2^n byte limit, it must be handling out-of-memory conditions
// gracefully.
TEST(SingleThreaded, SearchDFA) {
// Choice of n is mostly arbitrary, except that:
// * making n too big makes the test run for too long.
// * making n too small makes the DFA refuse to run,
// because it has so little memory compared to the program size.
// Empirically, n = 18 is a good compromise between the two.
const int n = 18;
Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
Regexp::LikePerl, NULL);
CHECK(re);
// The De Bruijn string for n ends with a 1 followed by n 0s in a row,
// which is not a match for 0[01]{n}$. Adding one more 0 is a match.
string no_match = DeBruijnString(n);
string match = no_match + "0";
// The De Bruijn string is the worst case input for this regexp.
// By default, the DFA will notice that it is flushing its cache
// too frequently and will bail out early, so that RE2 can use the
// NFA implementation instead. (The DFA loses its speed advantage
// if it can't get a good cache hit rate.)
// Tell the DFA to trudge along instead.
FLAGS_re2_dfa_bail_when_slow = false;
int64 usage;
int64 peak_usage;
{
testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
Prog* prog = re->CompileToProg(1<<n);
CHECK(prog);
for (int i = 0; i < 10; i++) {
bool matched, failed = false;
matched = prog->SearchDFA(match, NULL,
Prog::kUnanchored, Prog::kFirstMatch,
NULL, &failed, NULL);
CHECK(!failed);
CHECK(matched);
matched = prog->SearchDFA(no_match, NULL,
Prog::kUnanchored, Prog::kFirstMatch,
NULL, &failed, NULL);
CHECK(!failed);
CHECK(!matched);
}
usage = m.HeapGrowth();
peak_usage = m.PeakHeapGrowth();
delete prog;
}
re->Decref();
if (!UsingMallocCounter)
return;
//LOG(INFO) << "usage " << usage << " " << peak_usage;
CHECK_LT(usage, 1<<n);
CHECK_LT(peak_usage, 1<<n);
}
// Helper thread: searches for match, which should match,
// and no_match, which should not.
class SearchThread : public Thread {
public:
SearchThread(Prog* prog, const StringPiece& match,
const StringPiece& no_match)
: prog_(prog), match_(match), no_match_(no_match) {}
virtual void Run() {
for (int i = 0; i < 2; i++) {
bool matched, failed = false;
matched = prog_->SearchDFA(match_, NULL,
Prog::kUnanchored, Prog::kFirstMatch,
NULL, &failed, NULL);
CHECK(!failed);
CHECK(matched);
matched = prog_->SearchDFA(no_match_, NULL,
Prog::kUnanchored, Prog::kFirstMatch,
NULL, &failed, NULL);
CHECK(!failed);
CHECK(!matched);
}
}
private:
Prog* prog_;
StringPiece match_;
StringPiece no_match_;
};
TEST(Multithreaded, SearchDFA) {
// Same as single-threaded test above.
const int n = 18;
Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
Regexp::LikePerl, NULL);
CHECK(re);
string no_match = DeBruijnString(n);
string match = no_match + "0";
FLAGS_re2_dfa_bail_when_slow = false;
// Check that single-threaded code works.
{
Prog* prog = re->CompileToProg(1<<n);
CHECK(prog);
SearchThread* t = new SearchThread(prog, match, no_match);
t->SetJoinable(true);
t->Start();
t->Join();
delete t;
delete prog;
}
// Run the search simultaneously in a bunch of threads.
// Reuse same flags for Multithreaded.BuildDFA above.
for (int i = 0; i < FLAGS_repeat; i++) {
//LOG(INFO) << "Search " << i;
Prog* prog = re->CompileToProg(1<<n);
CHECK(prog);
vector<SearchThread*> threads;
for (int j = 0; j < FLAGS_threads; j++) {
SearchThread *t = new SearchThread(prog, match, no_match);
t->SetJoinable(true);
threads.push_back(t);
}
for (int j = 0; j < FLAGS_threads; j++)
threads[j]->Start();
for (int j = 0; j < FLAGS_threads; j++) {
threads[j]->Join();
delete threads[j];
}
delete prog;
}
re->Decref();
}
struct ReverseTest {
const char *regexp;
const char *text;
bool match;
};
// Test that reverse DFA handles anchored/unanchored correctly.
// It's in the DFA interface but not used by RE2.
ReverseTest reverse_tests[] = {
{ "\\A(a|b)", "abc", true },
{ "(a|b)\\z", "cba", true },
{ "\\A(a|b)", "cba", false },
{ "(a|b)\\z", "abc", false },
};
TEST(DFA, ReverseMatch) {
int nfail = 0;
for (int i = 0; i < arraysize(reverse_tests); i++) {
const ReverseTest& t = reverse_tests[i];
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog *prog = re->CompileToReverseProg(0);
CHECK(prog);
bool failed = false;
bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
if (matched != t.match) {
LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
nfail++;
}
delete prog;
re->Decref();
}
EXPECT_EQ(nfail, 0);
}
} // namespace re2

164
re2/re2/testing/dump.cc Normal file
View File

@ -0,0 +1,164 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Dump the regexp into a string showing structure.
// Tested by parse_unittest.cc
// This function traverses the regexp recursively,
// meaning that on inputs like Regexp::Simplify of
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
// it takes time and space exponential in the size of the
// original regular expression. It can also use stack space
// linear in the size of the regular expression for inputs
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
// As a result, Dump is provided only in the testing
// library (see BUILD).
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/stringpiece.h"
#include "re2/regexp.h"
// Cause a link error if this file is used outside of testing.
DECLARE_string(test_tmpdir);
namespace re2 {
static const char* kOpcodeNames[] = {
"bad",
"no",
"emp",
"lit",
"str",
"cat",
"alt",
"star",
"plus",
"que",
"rep",
"cap",
"dot",
"byte",
"bol",
"eol",
"wb", // kRegexpWordBoundary
"nwb", // kRegexpNoWordBoundary
"bot",
"eot",
"cc",
"match",
};
// Create string representation of regexp with explicit structure.
// Nothing pretty, just for testing.
static void DumpRegexpAppending(Regexp* re, string* s) {
if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
StringAppendF(s, "op%d", re->op());
} else {
switch (re->op()) {
default:
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
if (re->parse_flags() & Regexp::NonGreedy)
s->append("n");
break;
}
s->append(kOpcodeNames[re->op()]);
if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
Rune r = re->rune();
if ('a' <= r && r <= 'z')
s->append("fold");
}
if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
if ('a' <= r && r <= 'z') {
s->append("fold");
break;
}
}
}
}
s->append("{");
switch (re->op()) {
default:
break;
case kRegexpEndText:
if (!(re->parse_flags() & Regexp::WasDollar)) {
s->append("\\z");
}
break;
case kRegexpLiteral: {
Rune r = re->rune();
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
break;
}
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
}
break;
case kRegexpConcat:
case kRegexpAlternate:
for (int i = 0; i < re->nsub(); i++)
DumpRegexpAppending(re->sub()[i], s);
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCapture:
if (re->name()) {
s->append(*re->name());
s->append(":");
}
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpRepeat:
s->append(StringPrintf("%d,%d ", re->min(), re->max()));
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCharClass: {
string sep;
for (CharClass::iterator it = re->cc()->begin();
it != re->cc()->end(); ++it) {
RuneRange rr = *it;
s->append(sep);
if (rr.lo == rr.hi)
s->append(StringPrintf("%#x", rr.lo));
else
s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
sep = " ";
}
break;
}
}
s->append("}");
}
string Regexp::Dump() {
string s;
// Make sure being called from a unit test.
if (FLAGS_test_tmpdir.empty()) {
LOG(ERROR) << "Cannot use except for testing.";
return s;
}
DumpRegexpAppending(this, &s);
return s;
}
} // namespace re2

View File

@ -0,0 +1,42 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Test simple repetition operators
TEST(Repetition, Simple) {
vector<string> ops = Split(" ",
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
"%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
"%s* %s+ %s? %s*? %s+? %s??");
ExhaustiveTest(3, 2, Explode("abc."), ops,
6, Explode("ab"), "(?:%s)", "");
ExhaustiveTest(3, 2, Explode("abc."), ops,
40, Explode("a"), "(?:%s)", "");
}
// Test capturing parens -- (a) -- inside repetition operators
TEST(Repetition, Capturing) {
vector<string> ops = Split(" ",
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
"%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
"%s* %s+ %s? %s*? %s+? %s??");
ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
7, Explode("ab"), "(?:%s)", "");
// This would be a great test, but it runs forever when PCRE is enabled.
if (strstr("PCRE", FLAGS_regexp_engines.c_str()) == NULL)
ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops,
100, Explode("a"), "(?:%s)", "");
}
} // namespace re2

View File

@ -0,0 +1,70 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/re2.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Test empty string matches (aka "(?:)")
TEST(EmptyString, Exhaustive) {
ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
RegexpGenerator::EgrepOps(),
5, Split("", "ab"), "", "");
}
// Test escaped versions of regexp syntax.
TEST(Punctuation, Literals) {
vector<string> alphabet = Explode("()*+?{}[]\\^$.");
vector<string> escaped = alphabet;
for (int i = 0; i < escaped.size(); i++)
escaped[i] = "\\" + escaped[i];
ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
2, alphabet, "", "");
}
// Test ^ $ . \A \z in presence of line endings.
// Have to wrap the empty-width ones in (?:) so that
// they can be repeated -- PCRE rejects ^* but allows (?:^)*
TEST(LineEnds, Exhaustive) {
ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
RegexpGenerator::EgrepOps(),
4, Explode("ab\n"), "", "");
}
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
// #!/usr/bin/perl
// use POSIX qw(locale_h);
// print "matches in latin1\n" if "\n" =~ /[^a]/;
// setlocale("en_US.utf8");
// print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
// vector<string> empty_vector;
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
// RegexpGenerator::EgrepOps(),
// 4, Explode("a\n"), "");
// }
} // namespace re2

View File

@ -0,0 +1,94 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// Test simple character classes by themselves.
TEST(CharacterClasses, Exhaustive) {
vector<string> atoms = Split(" ",
"[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
5, Explode("ab"), "", "");
}
// Test simple character classes inside a___b (for example, a[a]b).
TEST(CharacterClasses, ExhaustiveAB) {
vector<string> atoms = Split(" ",
"[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
5, Explode("ab"), "a%sb", "");
}
// Returns UTF8 for Rune r
static string UTF8(Rune r) {
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
return string(buf);
}
// Returns a vector of "interesting" UTF8 characters.
// Unicode is now too big to just return all of them,
// so UTF8Characters return a set likely to be good test cases.
static const vector<string>& InterestingUTF8() {
static bool init;
static vector<string> v;
if (init)
return v;
init = true;
// All the Latin1 equivalents are interesting.
for (int i = 1; i < 256; i++)
v.push_back(UTF8(i));
// After that, the codes near bit boundaries are
// interesting, because they span byte sequence lengths.
for (int j = 0; j < 8; j++)
v.push_back(UTF8(256 + j));
for (int i = 512; i < Runemax; i <<= 1)
for (int j = -8; j < 8; j++)
v.push_back(UTF8(i + j));
// The codes near Runemax, including Runemax itself, are interesting.
for (int j = -8; j <= 0; j++)
v.push_back(UTF8(Runemax + j));
return v;
}
// Test interesting UTF-8 characters against character classes.
TEST(InterestingUTF8, SingleOps) {
vector<string> atoms = Split(" ",
". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
"[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
"[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
"[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
vector<string> ops; // no ops
ExhaustiveTest(1, 0, atoms, ops,
1, InterestingUTF8(), "", "");
}
// Test interesting UTF-8 characters against character classes,
// but wrap everything inside AB.
TEST(InterestingUTF8, AB) {
vector<string> atoms = Split(" ",
". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
"[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
"[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
"[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
vector<string> ops; // no ops
vector<string> alpha = InterestingUTF8();
for (int i = 0; i < alpha.size(); i++)
alpha[i] = "a" + alpha[i] + "b";
ExhaustiveTest(1, 0, atoms, ops,
1, alpha, "a%sb", "");
}
} // namespace re2

View File

@ -0,0 +1,38 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
DECLARE_string(regexp_engines);
// Test very simple expressions.
TEST(EgrepLiterals, Lowercase) {
EgrepTest(3, 2, "abc.", 3, "abc", "");
}
// Test mixed-case expressions.
TEST(EgrepLiterals, MixedCase) {
EgrepTest(3, 2, "AaBb.", 2, "AaBb", "");
}
// Test mixed-case in case-insensitive mode.
TEST(EgrepLiterals, FoldCase) {
// The punctuation characters surround A-Z and a-z
// in the ASCII table. This looks for bugs in the
// bytemap range code in the DFA.
EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)");
}
// Test very simple expressions.
TEST(EgrepLiterals, UTF8) {
EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", "");
}
} // namespace re2

View File

@ -0,0 +1,188 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
// a maximum regular expression length, and a maximum number of letters
// that can appear in the regular expression. Given these parameters,
// it tries every possible regular expression and string, verifying that
// the NFA, DFA, and a trivial backtracking implementation agree about
// the location of the match.
#include <stdlib.h>
#include <stdio.h>
#ifndef LOGGING
#define LOGGING 0
#endif
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
#include "re2/testing/tester.h"
DEFINE_bool(show_regexps, false, "show regexps during testing");
DEFINE_int32(max_bad_regexp_inputs, 1,
"Stop testing a regular expression after finding this many "
"strings that break it.");
// Compiled in debug mode, the usual tests run for over an hour.
// Have to cut it down to make the unit test machines happy.
DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
namespace re2 {
static char* escape(const StringPiece& sp) {
static char buf[512];
char* p = buf;
*p++ = '\"';
for (int i = 0; i < sp.size(); i++) {
if(p+5 >= buf+sizeof buf)
LOG(FATAL) << "ExhaustiveTester escape: too long";
if(sp[i] == '\\' || sp[i] == '\"') {
*p++ = '\\';
*p++ = sp[i];
} else if(sp[i] == '\n') {
*p++ = '\\';
*p++ = 'n';
} else {
*p++ = sp[i];
}
}
*p++ = '\"';
*p = '\0';
return buf;
}
static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
printf("-");
return;
}
for (int i = 0; i < n; i++) {
if (i > 0)
printf(" ");
if (m[i].begin() == NULL)
printf("-");
else
printf("%d-%d", static_cast<int>(m[i].begin() - input.begin()), static_cast<int>(m[i].end() - input.begin()));
}
}
// Processes a single generated regexp.
// Compiles it using Regexp interface and PCRE, and then
// checks that NFA, DFA, and PCRE all return the same results.
void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
regexps_++;
string regexp = const_regexp;
if (!topwrapper_.empty())
regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
if (FLAGS_show_regexps) {
printf("\r%s", regexp.c_str());
fflush(stdout);
}
if (LOGGING) {
// Write out test cases and answers for use in testing
// other implementations, such as Go's regexp package.
if (randomstrings_)
LOG(ERROR) << "Cannot log with random strings.";
if (regexps_ == 1) { // first
printf("strings\n");
strgen_.Reset();
while (strgen_.HasNext())
printf("%s\n", escape(strgen_.Next()));
printf("regexps\n");
}
printf("%s\n", escape(regexp));
RE2 re(regexp);
RE2::Options longest;
longest.set_longest_match(true);
RE2 relongest(regexp, longest);
int ngroup = re.NumberOfCapturingGroups()+1;
StringPiece* group = new StringPiece[ngroup];
strgen_.Reset();
while (strgen_.HasNext()) {
StringPiece input = strgen_.Next();
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
printf(";");
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
printf(";");
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
printf(";");
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
printf("\n");
}
delete[] group;
return;
}
Tester tester(regexp);
if (tester.error())
return;
strgen_.Reset();
strgen_.GenerateNULL();
if (randomstrings_)
strgen_.Random(stringseed_, stringcount_);
int bad_inputs = 0;
while (strgen_.HasNext()) {
tests_++;
if (!tester.TestInput(strgen_.Next())) {
failures_++;
if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
break;
}
}
}
// Runs an exhaustive test on the given parameters.
void ExhaustiveTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper,
const string& topwrapper) {
if (DEBUG_MODE && FLAGS_quick_debug_mode) {
if (maxatoms > 1)
maxatoms--;
if (maxops > 1)
maxops--;
if (maxstrlen > 1)
maxstrlen--;
}
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper,
topwrapper);
t.Generate();
if (!LOGGING) {
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
}
EXPECT_EQ(0, t.failures());
}
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
int maxstrlen, const string& stralphabet,
const string& wrapper) {
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
for (int i = 0; i < arraysize(tops); i++) {
ExhaustiveTest(maxatoms, maxops,
Split("", alphabet),
RegexpGenerator::EgrepOps(),
maxstrlen,
Split("", stralphabet),
wrapper,
tops[i]);
}
}
} // namespace re2

View File

@ -0,0 +1,85 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Exhaustive regular expression test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
// each possible string, and if so, where the match is.
//
// Can also be used in a "random" mode that generates a given number
// of random regexp and strings, allowing testing of larger expressions
// and inputs.
class ExhaustiveTester : public RegexpGenerator {
public:
ExhaustiveTester(int maxatoms,
int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen,
const vector<string>& stralphabet,
const string& wrapper,
const string& topwrapper)
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
strgen_(maxstrlen, stralphabet),
wrapper_(wrapper),
topwrapper_(topwrapper),
regexps_(0), tests_(0), failures_(0),
randomstrings_(0), stringseed_(0), stringcount_(0) { }
int regexps() { return regexps_; }
int tests() { return tests_; }
int failures() { return failures_; }
// Needed for RegexpGenerator interface.
void HandleRegexp(const string& regexp);
// Causes testing to generate random input strings.
void RandomStrings(int32 seed, int32 count) {
randomstrings_ = true;
stringseed_ = seed;
stringcount_ = count;
}
private:
StringGenerator strgen_;
string wrapper_; // Regexp wrapper - either empty or has one %s.
string topwrapper_; // Regexp top-level wrapper.
int regexps_; // Number of HandleRegexp calls
int tests_; // Number of regexp tests.
int failures_; // Number of tests failed.
bool randomstrings_; // Whether to use random strings
int32 stringseed_; // If so, the seed.
int stringcount_; // If so, how many to generate.
DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester);
};
// Runs an exhaustive test on the given parameters.
void ExhaustiveTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper,
const string& topwrapper);
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
int maxstrlen, const string& stralphabet,
const string& wrapper);
} // namespace re2
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__

View File

@ -0,0 +1,258 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/filtered_re2.h"
#include "re2/re2.h"
DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
namespace re2 {
struct FilterTestVars {
vector<string> atoms;
vector<int> atom_indices;
vector<int> matches;
RE2::Options opts;
FilteredRE2 f;
};
TEST(FilteredRE2Test, EmptyTest) {
FilterTestVars v;
v.f.AllMatches("foo", v.atom_indices, &v.matches);
EXPECT_EQ(0, v.matches.size());
}
TEST(FilteredRE2Test, SmallOrTest) {
FLAGS_filtered_re2_min_atom_len = 4;
FilterTestVars v;
int id;
v.f.Add("(foo|bar)", v.opts, &id);
v.f.Compile(&v.atoms);
EXPECT_EQ(0, v.atoms.size());
v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
EXPECT_EQ(1, v.matches.size());
EXPECT_EQ(id, v.matches[0]);
}
struct AtomTest {
const char* testname;
// If any test needs more than this many regexps or atoms, increase
// the size of the corresponding array.
const char* regexps[20];
const char* atoms[20];
};
AtomTest atom_tests[] = {
{
// This test checks to make sure empty patterns are allowed.
"CheckEmptyPattern",
{""},
{}
}, {
// This test checks that all atoms of length greater than min length
// are found, and no atoms that are of smaller length are found.
"AllAtomsGtMinLengthFound", {
"(abc123|def456|ghi789).*mnop[x-z]+",
"abc..yyy..zz",
"mnmnpp[a-z]+PPP"
}, {
"abc123",
"def456",
"ghi789",
"mnop",
"abc",
"yyy",
"mnmnpp",
"ppp"
}
}, {
// Test to make sure that any atoms that have another atom as a
// substring in an OR are removed; that is, only the shortest
// substring is kept.
"SubstrAtomRemovesSuperStrInOr", {
"(abc123|abc|ghi789|abc1234).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {
"abc",
"ghi789",
"abcd",
"yyy",
"yyyzzz",
"mnmnpp",
"ppp"
}
}, {
// Test character class expansion.
"CharClassExpansion", {
"m[a-c][d-f]n.*[x-z]+",
"[x-y]bcde[ab]"
}, {
"madn", "maen", "mafn",
"mbdn", "mben", "mbfn",
"mcdn", "mcen", "mcfn",
"xbcdea", "xbcdeb",
"ybcdea", "ybcdeb"
}
}, {
// Test upper/lower of non-ASCII.
"UnicodeLower", {
"(?i)ΔδΠϖπΣςσ",
"ΛΜΝΟΠ",
"ψρστυ",
}, {
"δδπππσσσ",
"λμνοπ",
"ψρστυ",
},
},
};
void AddRegexpsAndCompile(const char* regexps[],
int n,
struct FilterTestVars* v) {
for (int i = 0; i < n; i++) {
int id;
v->f.Add(regexps[i], v->opts, &id);
}
v->f.Compile(&v->atoms);
}
bool CheckExpectedAtoms(const char* atoms[],
int n,
const char* testname,
struct FilterTestVars* v) {
vector<string> expected;
for (int i = 0; i < n; i++)
expected.push_back(atoms[i]);
bool pass = expected.size() == v->atoms.size();
sort(v->atoms.begin(), v->atoms.end());
sort(expected.begin(), expected.end());
for (int i = 0; pass && i < n; i++)
pass = pass && expected[i] == v->atoms[i];
if (!pass) {
LOG(WARNING) << "Failed " << testname;
LOG(WARNING) << "Expected #atoms = " << expected.size();
for (int i = 0; i < expected.size(); i++)
LOG(WARNING) << expected[i];
LOG(WARNING) << "Found #atoms = " << v->atoms.size();
for (int i = 0; i < v->atoms.size(); i++)
LOG(WARNING) << v->atoms[i];
}
return pass;
}
TEST(FilteredRE2Test, AtomTests) {
FLAGS_filtered_re2_min_atom_len = 3;
int nfail = 0;
for (int i = 0; i < arraysize(atom_tests); i++) {
FilterTestVars v;
AtomTest* t = &atom_tests[i];
int natom, nregexp;
for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
for (natom = 0; natom < arraysize(t->atoms); natom++)
if (t->atoms[natom] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v))
nfail++;
}
EXPECT_EQ(0, nfail);
}
void FindAtomIndices(const vector<string> atoms,
const vector<string> matched_atoms,
vector<int>* atom_indices) {
atom_indices->clear();
for (int i = 0; i < matched_atoms.size(); i++) {
int j = 0;
for (; j < atoms.size(); j++) {
if (matched_atoms[i] == atoms[j]) {
atom_indices->push_back(j);
break;
}
EXPECT_LT(j, atoms.size());
}
}
}
TEST(FilteredRE2Test, MatchEmptyPattern) {
FLAGS_filtered_re2_min_atom_len = 3;
FilterTestVars v;
AtomTest* t = &atom_tests[0];
// We are using the regexps used in one of the atom tests
// for this test. Adding the EXPECT here to make sure
// the index we use for the test is for the correct test.
EXPECT_EQ("CheckEmptyPattern", string(t->testname));
int nregexp;
for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
string text = "0123";
vector<int> atom_ids;
vector<int> matching_regexps;
EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
}
TEST(FilteredRE2Test, MatchTests) {
FLAGS_filtered_re2_min_atom_len = 3;
FilterTestVars v;
AtomTest* t = &atom_tests[2];
// We are using the regexps used in one of the atom tests
// for this test.
EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
int nregexp;
for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
string text = "abc121212xyz";
// atoms = abc
vector<int> atom_ids;
vector<string> atoms;
atoms.push_back("abc");
FindAtomIndices(v.atoms, atoms, &atom_ids);
vector<int> matching_regexps;
v.f.AllMatches(text, atom_ids, &matching_regexps);
EXPECT_EQ(1, matching_regexps.size());
text = "abc12312yyyzzz";
atoms.clear();
atoms.push_back("abc");
atoms.push_back("yyy");
atoms.push_back("yyyzzz");
FindAtomIndices(v.atoms, atoms, &atom_ids);
v.f.AllMatches(text, atom_ids, &matching_regexps);
EXPECT_EQ(1, matching_regexps.size());
text = "abcd12yyy32yyyzzz";
atoms.clear();
atoms.push_back("abc");
atoms.push_back("abcd");
atoms.push_back("yyy");
atoms.push_back("yyyzzz");
FindAtomIndices(v.atoms, atoms, &atom_ids);
LOG(INFO) << "S: " << atom_ids.size();
for (int i = 0; i < atom_ids.size(); i++)
LOG(INFO) << "i: " << i << " : " << atom_ids[i];
v.f.AllMatches(text, atom_ids, &matching_regexps);
EXPECT_EQ(2, matching_regexps.size());
}
} // namespace re2

View File

@ -0,0 +1,76 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
struct PCRETest {
const char* regexp;
bool should_match;
};
static PCRETest tests[] = {
// Most things should behave exactly.
{ "abc", true },
{ "(a|b)c", true },
{ "(a*|b)c", true },
{ "(a|b*)c", true },
{ "a(b|c)d", true },
{ "a(()|())c", true },
{ "ab*c", true },
{ "ab+c", true },
{ "a(b*|c*)d", true },
{ "\\W", true },
{ "\\W{1,2}", true },
{ "\\d", true },
// Check that repeated empty strings do not.
{ "(a*)*", false },
{ "x(a*)*y", false },
{ "(a*)+", false },
{ "(a+)*", true },
{ "(a+)+", true },
{ "(a+)+", true },
// \v is the only character class that shouldn't.
{ "\\b", true },
{ "\\v", false },
{ "\\d", true },
// The handling of ^ in multi-line mode is different, as is
// the handling of $ in single-line mode. (Both involve
// boundary cases if the string ends with \n.)
{ "\\A", true },
{ "\\z", true },
{ "(?m)^", false },
{ "(?m)$", true },
{ "(?-m)^", true },
{ "(?-m)$", false }, // In PCRE, == \Z
{ "(?m)\\A", true },
{ "(?m)\\z", true },
{ "(?-m)\\A", true },
{ "(?-m)\\z", true },
};
TEST(MimicsPCRE, SimpleTests) {
for (int i = 0; i < arraysize(tests); i++) {
const PCRETest& t = tests[i];
for (int j = 0; j < 2; j++) {
Regexp::ParseFlags flags = Regexp::LikePerl;
if (j == 0)
flags = flags | Regexp::Latin1;
Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
CHECK(re) << " " << t.regexp;
CHECK_EQ(t.should_match, re->MimicsPCRE())
<< " " << t.regexp << " "
<< (j==0 ? "latin1" : "utf");
re->Decref();
}
}
}
} // namespace re2

View File

@ -0,0 +1,44 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Null walker. For benchmarking the walker itself.
class NullWalker : public Regexp::Walker<bool> {
public:
NullWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NullWalker::ShortVisit called";
return a;
}
private:
DISALLOW_EVIL_CONSTRUCTORS(NullWalker);
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string). Returns whether this clause can match an
// empty string.
bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
return false;
}
// Returns whether re can match an empty string.
void Regexp::NullWalk() {
NullWalker w;
w.Walk(this, false);
}
} // namespace re2

View File

@ -0,0 +1,376 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test parse.cc, dump.cc, and tostring.cc.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
struct Test {
const char* regexp;
const char* parse;
};
static Test tests[] = {
// Base cases
{ "a", "lit{a}" },
{ "a.", "cat{lit{a}dot{}}" },
{ "a.b", "cat{lit{a}dot{}lit{b}}" },
{ "ab", "str{ab}" },
{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
{ "abc", "str{abc}" },
{ "a|^", "alt{lit{a}bol{}}" },
{ "a|b", "cc{0x61-0x62}" },
{ "(a)", "cap{lit{a}}" },
{ "(a)|b", "alt{cap{lit{a}}lit{b}}" },
{ "a*", "star{lit{a}}" },
{ "a+", "plus{lit{a}}" },
{ "a?", "que{lit{a}}" },
{ "a{2}", "rep{2,2 lit{a}}" },
{ "a{2,3}", "rep{2,3 lit{a}}" },
{ "a{2,}", "rep{2,-1 lit{a}}" },
{ "a*?", "nstar{lit{a}}" },
{ "a+?", "nplus{lit{a}}" },
{ "a??", "nque{lit{a}}" },
{ "a{2}?", "nrep{2,2 lit{a}}" },
{ "a{2,3}?", "nrep{2,3 lit{a}}" },
{ "a{2,}?", "nrep{2,-1 lit{a}}" },
{ "", "emp{}" },
{ "|", "emp{}" }, // alt{emp{}emp{}} but got factored
{ "|x|", "alt{emp{}lit{x}emp{}}" },
{ ".", "dot{}" },
{ "^", "bol{}" },
{ "$", "eol{}" },
{ "\\|", "lit{|}" },
{ "\\(", "lit{(}" },
{ "\\)", "lit{)}" },
{ "\\*", "lit{*}" },
{ "\\+", "lit{+}" },
{ "\\?", "lit{?}" },
{ "{", "lit{{}" },
{ "}", "lit{}}" },
{ "\\.", "lit{.}" },
{ "\\^", "lit{^}" },
{ "\\$", "lit{$}" },
{ "\\\\", "lit{\\}" },
{ "[ace]", "cc{0x61 0x63 0x65}" },
{ "[abc]", "cc{0x61-0x63}" },
{ "[a-z]", "cc{0x61-0x7a}" },
{ "[a]", "lit{a}" },
{ "\\-", "lit{-}" },
{ "-", "lit{-}" },
{ "\\_", "lit{_}" },
// Posix and Perl extensions
{ "[[:lower:]]", "cc{0x61-0x7a}" },
{ "[a-z]", "cc{0x61-0x7a}" },
{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "\\d", "cc{0x30-0x39}" },
{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
{ "\\C", "byte{}" },
// Unicode, negatives, and a double negative.
{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
// More interesting regular expressions.
{ "a{,2}", "str{a{,2}}" },
{ "\\.\\^\\$\\\\", "str{.^$\\}" },
{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
{ "a*{", "cat{star{lit{a}}lit{{}}" },
// Test precedences
{ "(?:ab)*", "star{str{ab}}" },
{ "(ab)*", "star{cap{str{ab}}}" },
{ "ab|cd", "alt{str{ab}str{cd}}" },
{ "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
// Test flattening.
{ "(?:a)", "lit{a}" },
{ "(?:ab)(?:cd)", "str{abcd}" },
{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
{ "a|.", "dot{}" },
{ ".|a", "dot{}" },
// Test Perl quoted literals
{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
{ "\\Q+\\E+", "plus{lit{+}}" },
{ "\\Q\\\\E", "lit{\\}" },
{ "\\Q\\\\\\E", "str{\\\\}" },
// Test Perl \A and \z
{ "(?m)^", "bol{}" },
{ "(?m)$", "eol{}" },
{ "(?-m)^", "bot{}" },
{ "(?-m)$", "eot{}" },
{ "(?m)\\A", "bot{}" },
{ "(?m)\\z", "eot{\\z}" },
{ "(?-m)\\A", "bot{}" },
{ "(?-m)\\z", "eot{\\z}" },
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
// Case-folded literals
{ "[Aa]", "litfold{a}" },
// Strings
{ "abcde", "str{abcde}" },
{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
};
static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
Regexp::PerlX |
Regexp::PerlClasses |
Regexp::UnicodeGroups;
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
return Regexp::Equal(a, b);
}
void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
const string& title) {
Regexp** re = new Regexp*[ntests];
for (int i = 0; i < ntests; i++) {
RegexpStatus status;
re[i] = Regexp::Parse(tests[i].regexp, flags, &status);
CHECK(re[i] != NULL) << " " << tests[i].regexp << " "
<< status.Text();
string s = re[i]->Dump();
EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp
<< "\nparse: " << tests[i].parse << " s: " << s;
}
for (int i = 0; i < ntests; i++) {
for (int j = 0; j < ntests; j++) {
EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
RegexpEqualTestingOnly(re[i], re[j]))
<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
}
}
for (int i = 0; i < ntests; i++)
re[i]->Decref();
delete[] re;
}
// Test that regexps parse to expected structures.
TEST(TestParse, SimpleRegexps) {
TestParse(tests, arraysize(tests), kTestFlags, "simple");
}
Test foldcase_tests[] = {
{ "AbCdE", "strfold{abcde}" },
{ "[Aa]", "litfold{a}" },
{ "a", "litfold{a}" },
// 0x17F is an old English long s (looks like an f) and folds to s.
// 0x212A is the Kelvin symbol and folds to k.
{ "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
{ "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
};
// Test that parsing with FoldCase works.
TEST(TestParse, FoldCase) {
TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
}
Test literal_tests[] = {
{ "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
};
// Test that parsing with Literal works.
TEST(TestParse, Literal) {
TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
}
Test matchnl_tests[] = {
{ ".", "dot{}" },
{ "\n", "lit{\n}" },
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
{ "[a\\n]", "cc{0xa 0x61}" },
};
// Test that parsing with MatchNL works.
// (Also tested above during simple cases.)
TEST(TestParse, MatchNL) {
TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
}
Test nomatchnl_tests[] = {
{ ".", "cc{0-0x9 0xb-0x10ffff}" },
{ "\n", "lit{\n}" },
{ "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
{ "[a\\n]", "cc{0xa 0x61}" },
};
// Test that parsing without MatchNL works.
TEST(TestParse, NoMatchNL) {
TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
}
Test prefix_tests[] = {
{ "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
{ "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
{ "abc|abd|aef|bcx|bcy",
"alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
"cat{str{bc}cc{0x78-0x79}}}" },
{ "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
{ "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
{ "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
{ "(?:xx|yy)c|(?:xx|yy)d",
"cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
{ "x{2}|x{2}[0-9]",
"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
{ "x{2}y|x{2}[0-9]y",
"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
};
// Test that prefix factoring works.
TEST(TestParse, Prefix) {
TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
}
// Invalid regular expressions
const char* badtests[] = {
"(",
")",
"(a",
"(a|b|",
"(a|b",
"[a-z",
"([a-z)",
"x{1001}",
"\xff", // Invalid UTF-8
"[\xff]",
"[\\\xff]",
"\\\xff",
"(?P<name>a",
"(?P<name>",
"(?P<name",
"(?P<x y>a)",
"(?P<>a)",
"[a-Z]",
"(?i)[a-Z]",
"a{100000}",
"a{100000,}",
};
// Valid in Perl, bad in POSIX
const char* only_perl[] = {
"[a-b-c]",
"\\Qabc\\E",
"\\Q*+?{[\\E",
"\\Q\\\\E",
"\\Q\\\\\\E",
"\\Q\\\\\\\\E",
"\\Q\\\\\\\\\\E",
"(?:a)",
"(?P<name>a)",
};
// Valid in POSIX, bad in Perl.
const char* only_posix[] = {
"a++",
"a**",
"a?*",
"a+*",
"a{1}*",
};
// Test that parser rejects bad regexps.
TEST(TestParse, InvalidRegexps) {
for (int i = 0; i < arraysize(badtests); i++) {
CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
<< " " << badtests[i];
CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
<< " " << badtests[i];
}
for (int i = 0; i < arraysize(only_posix); i++) {
CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
<< " " << only_posix[i];
Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
CHECK(re) << " " << only_posix[i];
re->Decref();
}
for (int i = 0; i < arraysize(only_perl); i++) {
CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
<< " " << only_perl[i];
Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
CHECK(re) << " " << only_perl[i];
re->Decref();
}
}
// Test that ToString produces original regexp or equivalent one.
TEST(TestToString, EquivalentParse) {
for (int i = 0; i < arraysize(tests); i++) {
RegexpStatus status;
Regexp* re = Regexp::Parse(tests[i].regexp, kTestFlags, &status);
CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
string s = re->Dump();
EXPECT_EQ(string(tests[i].parse), s);
string t = re->ToString();
if (t != tests[i].regexp) {
// If ToString didn't return the original regexp,
// it must have found one with fewer parens.
// Unfortunately we can't check the length here, because
// ToString produces "\\{" for a literal brace,
// but "{" is a shorter equivalent.
// CHECK_LT(t.size(), strlen(tests[i].regexp))
// << " t=" << t << " regexp=" << tests[i].regexp;
// Test that if we parse the new regexp we get the same structure.
Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
CHECK(nre != NULL) << " reparse " << t << " " << status.Text();
string ss = nre->Dump();
string tt = nre->ToString();
if (s != ss || t != tt)
LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
EXPECT_EQ(s, ss);
EXPECT_EQ(t, tt);
nre->Decref();
}
re->Decref();
}
}
// Test that capture error args are correct.
TEST(NamedCaptures, ErrorArgs) {
RegexpStatus status;
Regexp* re;
re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P<name");
re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P<space bar>");
}
} // namespace re2

View File

@ -0,0 +1,240 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <vector>
#include "util/test.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Test that C++ strings are compared as uint8s, not int8s.
// PossibleMatchRange doesn't depend on this, but callers probably will.
TEST(CplusplusStrings, EightBit) {
string s = "\x70";
string t = "\xA0";
EXPECT_LT(s, t);
}
struct PrefixTest {
const char* regexp;
int maxlen;
const char* min;
const char* max;
};
static PrefixTest tests[] = {
{ "", 10, "", "", },
{ "Abcdef", 10, "Abcdef", "Abcdef" },
{ "abc(def|ghi)", 10, "abcdef", "abcghi" },
{ "a+hello", 10, "aa", "ahello" },
{ "a*hello", 10, "a", "hello" },
{ "def|abc", 10, "abc", "def" },
{ "a(b)(c)[d]", 10, "abcd", "abcd" },
{ "ab(cab|cat)", 10, "abcab", "abcat" },
{ "ab(cab|ca)x", 10, "abcabx", "abcax" },
{ "(ab|x)(c|de)", 10, "abc", "xde" },
{ "(ab|x)?(c|z)?", 10, "", "z" },
{ "[^\\s\\S]", 10, "", "" },
{ "(abc)+", 5, "abc", "abcac" },
{ "(abc)+", 2, "ab", "ac" },
{ "(abc)+", 1, "a", "b" },
{ "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "a*", 10, "", "ab" },
{ "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)a+hello", 10, "AA", "ahello" },
{ "(?i)a*hello", 10, "A", "hello" },
{ "(?i)def|abc", 10, "ABC", "def" },
{ "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)[^\\s\\S]", 10, "", "" },
{ "(?i)(abc)+", 5, "ABC", "abcac" },
{ "(?i)(abc)+", 2, "AB", "ac" },
{ "(?i)(abc)+", 1, "A", "b" },
{ "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)a*", 10, "", "ab" },
{ "(?i)A*", 10, "", "ab" },
{ "\\AAbcdef", 10, "Abcdef", "Abcdef" },
{ "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
{ "\\Aa+hello", 10, "aa", "ahello" },
{ "\\Aa*hello", 10, "a", "hello" },
{ "\\Adef|abc", 10, "abc", "def" },
{ "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
{ "\\Aab(cab|cat)", 10, "abcab", "abcat" },
{ "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
{ "\\A(ab|x)(c|de)", 10, "abc", "xde" },
{ "\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "\\A[^\\s\\S]", 10, "", "" },
{ "\\A(abc)+", 5, "abc", "abcac" },
{ "\\A(abc)+", 2, "ab", "ac" },
{ "\\A(abc)+", 1, "a", "b" },
{ "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "\\Aa*", 10, "", "ab" },
{ "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)\\Aa+hello", 10, "AA", "ahello" },
{ "(?i)\\Aa*hello", 10, "A", "hello" },
{ "(?i)\\Adef|abc", 10, "ABC", "def" },
{ "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)\\A[^\\s\\S]", 10, "", "" },
{ "(?i)\\A(abc)+", 5, "ABC", "abcac" },
{ "(?i)\\A(abc)+", 2, "AB", "ac" },
{ "(?i)\\A(abc)+", 1, "A", "b" },
{ "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)\\Aa*", 10, "", "ab" },
{ "(?i)\\AA*", 10, "", "ab" },
};
TEST(PossibleMatchRange, HandWritten) {
for (int i = 0; i < arraysize(tests); i++) {
for (int j = 0; j < 2; j++) {
const PrefixTest& t = tests[i];
string min, max;
if (j == 0) {
LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
<< " " << t.regexp;
delete prog;
re->Decref();
} else {
CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
}
EXPECT_EQ(t.min, min) << t.regexp;
EXPECT_EQ(t.max, max) << t.regexp;
}
}
}
// Test cases where PossibleMatchRange should return false.
TEST(PossibleMatchRange, Failures) {
string min, max;
// Fails because no room to write max.
EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
// Fails because there is no max -- any non-empty string matches
// or begins a match. Have to use Latin-1 input, because there
// are no valid UTF-8 strings beginning with byte 0xFF.
EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".*", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2("\\C*").
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
// Fails because it's a malformed regexp.
EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
}
// Exhaustive test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that the prefix information agrees with whether
// the regexp matches each of the strings.
class PossibleMatchTester : public RegexpGenerator {
public:
PossibleMatchTester(int maxatoms,
int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen,
const vector<string>& stralphabet)
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
strgen_(maxstrlen, stralphabet),
regexps_(0), tests_(0) { }
int regexps() { return regexps_; }
int tests() { return tests_; }
// Needed for RegexpGenerator interface.
void HandleRegexp(const string& regexp);
private:
StringGenerator strgen_;
int regexps_; // Number of HandleRegexp calls
int tests_; // Number of regexp tests.
DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
};
// Processes a single generated regexp.
// Checks that all accepted strings agree with the prefix range.
void PossibleMatchTester::HandleRegexp(const string& regexp) {
regexps_++;
VLOG(3) << CEscape(regexp);
RE2 re(regexp, RE2::Latin1);
CHECK_EQ(re.error(), "");
string min, max;
if(!re.PossibleMatchRange(&min, &max, 10)) {
// There's no good max for "\\C*". Can't use strcmp
// because sometimes it gets embedded in more
// complicated expressions.
if(strstr(regexp.c_str(), "\\C*"))
return;
LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
}
strgen_.Reset();
while (strgen_.HasNext()) {
const StringPiece& s = strgen_.Next();
tests_++;
if (!RE2::FullMatch(s, re))
continue;
CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
}
}
TEST(PossibleMatchRange, Exhaustive) {
int natom = 3;
int noperator = 3;
int stringlen = 5;
if (DEBUG_MODE) {
natom = 2;
noperator = 3;
stringlen = 3;
}
PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
RegexpGenerator::EgrepOps(),
stringlen, Explode("ab4"));
t.Generate();
LOG(INFO) << t.regexps() << " regexps, "
<< t.tests() << " tests";
}
} // namespace re2

View File

@ -0,0 +1,95 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Random testing of regular expression matching.
#include <stdio.h>
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
DEFINE_int32(regexpseed, 404, "Random regexp seed.");
DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
DEFINE_int32(stringseed, 200, "Random string seed.");
DEFINE_int32(stringcount, 100, "How many random strings to generate.");
namespace re2 {
// Runs a random test on the given parameters.
// (Always uses the same random seeds for reproducibility.
// Can give different seeds on command line.)
static void RandomTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper) {
// Limit to smaller test cases in debug mode,
// because everything is so much slower.
if (DEBUG_MODE) {
maxatoms--;
maxops--;
maxstrlen /= 2;
}
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper, "");
t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
EXPECT_EQ(0, t.failures());
}
// Tests random small regexps involving literals and egrep operators.
TEST(Random, SmallEgrepLiterals) {
RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(),
15, Explode("abc"),
"");
}
// Tests random bigger regexps involving literals and egrep operators.
TEST(Random, BigEgrepLiterals) {
RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(),
15, Explode("abc"),
"");
}
// Tests random small regexps involving literals, capturing parens,
// and egrep operators.
TEST(Random, SmallEgrepCaptures) {
RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
15, Explode("abc"),
"");
}
// Tests random bigger regexps involving literals, capturing parens,
// and egrep operators.
TEST(Random, BigEgrepCaptures) {
RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
15, Explode("abc"),
"");
}
// Tests random large complicated expressions, using all the possible
// operators, some literals, some parenthesized literals, and predefined
// character classes like \d. (Adding larger character classes would
// make for too many possibilities.)
TEST(Random, Complicated) {
vector<string> ops = Split(" ",
"%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
"%s{2} %s{2,} %s{3,4} %s{4,5}");
// Use (?:\b) and (?:\B) instead of \b and \B,
// because PCRE rejects \b* but accepts (?:\b)*.
// Ditto ^ and $.
vector<string> atoms = Split(" ",
". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
"\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
"a (a) b c - \\\\");
vector<string> alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a");
RandomTest(10, 10, atoms, ops, 20, alphabet, "");
}
} // namespace re2

View File

@ -0,0 +1,132 @@
// Copyright 2005 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This tests to make sure numbers are parsed from strings
// correctly.
// Todo: Expand the test to validate strings parsed to the other types
// supported by RE2::Arg class
#include "util/test.h"
#include "re2/re2.h"
namespace re2 {
struct SuccessTable {
const char * value_string;
int64 value;
bool success[6];
};
// Test boundary cases for different integral sizes.
// Specifically I want to make sure that values outside the boundries
// of an integral type will fail and that negative numbers will fail
// for unsigned types. The following table contains the boundaries for
// the various integral types and has entries for whether or not each
// type can contain the given value.
const SuccessTable kSuccessTable[] = {
// string integer value short ushort int uint int64 uint64
// 0 to 2^7-1
{ "0", 0, { true, true, true, true, true, true }},
{ "127", 127, { true, true, true, true, true, true }},
// -1 to -2^7
{ "-1", -1, { true, false, true, false, true, false }},
{ "-128", -128, { true, false, true, false, true, false }},
// 2^7 to 2^8-1
{ "128", 128, { true, true, true, true, true, true }},
{ "255", 255, { true, true, true, true, true, true }},
// 2^8 to 2^15-1
{ "256", 256, { true, true, true, true, true, true }},
{ "32767", 32767, { true, true, true, true, true, true }},
// -2^7-1 to -2^15
{ "-129", -129, { true, false, true, false, true, false }},
{ "-32768", -32768, { true, false, true, false, true, false }},
// 2^15 to 2^16-1
{ "32768", 32768, { false, true, true, true, true, true }},
{ "65535", 65535, { false, true, true, true, true, true }},
// 2^16 to 2^31-1
{ "65536", 65536, { false, false, true, true, true, true }},
{ "2147483647", 2147483647, { false, false, true, true, true, true }},
// -2^15-1 to -2^31
{ "-32769", -32769, { false, false, true, false, true, false }},
{ "-2147483648",
0xFFFFFFFF80000000LL, { false, false, true, false, true, false }},
// 2^31 to 2^32-1
{ "2147483648", 2147483648U, { false, false, false, true, true, true }},
{ "4294967295", 4294967295U, { false, false, false, true, true, true }},
// 2^32 to 2^63-1
{ "4294967296", 4294967296LL, { false, false, false, false, true, true }},
{ "9223372036854775807",
9223372036854775807LL, { false, false, false, false, true, true }},
// -2^31-1 to -2^63
{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }},
{ "-9223372036854775808",
0x8000000000000000LL, { false, false, false, false, true, false }},
// 2^63 to 2^64-1
{ "9223372036854775808",
9223372036854775808ULL, { false, false, false, false, false, true }},
{ "18446744073709551615",
18446744073709551615ULL, { false, false, false, false, false, true }},
// >= 2^64
{ "18446744073709551616", 0, { false, false, false, false, false, false }},
};
const int kNumStrings = ARRAYSIZE(kSuccessTable);
// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M
// macro outside of a TEST block and this seems to be the only way to
// avoid code duplication. I can also pull off a couple nice tricks
// using concatenation for the type I'm checking against.
#define PARSE_FOR_TYPE(type, column) { \
type r; \
for ( int i = 0; i < kNumStrings; ++i ) { \
RE2::Arg arg(&r); \
const char* const p = kSuccessTable[i].value_string; \
bool retval = arg.Parse(p, strlen(p)); \
bool success = kSuccessTable[i].success[column]; \
ASSERT_TRUE_M(retval == success, \
StringPrintf("Parsing '%s' for type " #type " should return %d", \
p, success).c_str()); \
if ( success ) { \
ASSERT_EQUALS(r, kSuccessTable[i].value); \
} \
} \
}
TEST(REArgTest, Int16Test) {
PARSE_FOR_TYPE(int16, 0);
}
TEST(REArgTest, Uint16Test) {
PARSE_FOR_TYPE(uint16, 1);
}
TEST(REArgTest, IntTest) {
PARSE_FOR_TYPE(int, 2);
}
TEST(REArgTest, UInt32Test) {
PARSE_FOR_TYPE(uint32, 3);
}
TEST(REArgTest, Iint64Test) {
PARSE_FOR_TYPE(int64, 4);
}
TEST(REArgTest, Uint64Test) {
PARSE_FOR_TYPE(uint64, 5);
}
} // namespace re2

1363
re2/re2/testing/re2_test.cc Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,264 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression generator: generates all possible
// regular expressions within parameters (see regexp_generator.h for details).
// The regexp generator first generates a sequence of commands in a simple
// postfix language. Each command in the language is a string,
// like "a" or "%s*" or "%s|%s".
//
// To evaluate a command, enough arguments are popped from the value stack to
// plug into the %s slots. Then the result is pushed onto the stack.
// For example, the command sequence
// a b %s%s c
// results in the stack
// ab c
//
// GeneratePostfix generates all possible command sequences.
// Then RunPostfix turns each sequence into a regular expression
// and passes the regexp to HandleRegexp.
#include <string.h>
#include <string>
#include <stack>
#include <vector>
#include "util/test.h"
#include "re2/testing/regexp_generator.h"
namespace re2 {
// Returns a vector of the egrep regexp operators.
const vector<string>& RegexpGenerator::EgrepOps() {
static const char *ops[] = {
"%s%s",
"%s|%s",
"%s*",
"%s+",
"%s?",
"%s\\C*",
};
static vector<string> v(ops, ops + arraysize(ops));
return v;
}
RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
const vector<string>& atoms,
const vector<string>& ops)
: maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
// Degenerate case.
if (atoms_.size() == 0)
maxatoms_ = 0;
if (ops_.size() == 0)
maxops_ = 0;
}
// Generates all possible regular expressions (within the parameters),
// calling HandleRegexp for each one.
void RegexpGenerator::Generate() {
vector<string> postfix;
GeneratePostfix(&postfix, 0, 0, 0);
}
// Generates random regular expressions, calling HandleRegexp for each one.
void RegexpGenerator::GenerateRandom(int32 seed, int n) {
ACMRandom acm(seed);
acm_ = &acm;
for (int i = 0; i < n; i++) {
vector<string> postfix;
GenerateRandomPostfix(&postfix, 0, 0, 0);
}
acm_ = NULL;
}
// Counts and returns the number of occurrences of "%s" in s.
static int CountArgs(const string& s) {
const char *p = s.c_str();
int n = 0;
while ((p = strstr(p, "%s")) != NULL) {
p += 2;
n++;
}
return n;
}
// Generates all possible postfix command sequences.
// Each sequence is handed off to RunPostfix to generate a regular expression.
// The arguments are:
// post: the current postfix sequence
// nstk: the number of elements that would be on the stack after executing
// the sequence
// ops: the number of operators used in the sequence
// atoms: the number of atoms used in the sequence
// For example, if post were ["a", "b", "%s%s", "c"],
// then nstk = 2, ops = 1, atoms = 3.
//
// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
//
void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
int ops, int atoms) {
if (nstk == 1)
RunPostfix(*post);
// Early out: if used too many operators or can't
// get back down to a single expression on the stack
// using binary operators, give up.
if (ops + nstk - 1 > maxops_)
return;
// Add atoms if there is room.
if (atoms < maxatoms_) {
for (int i = 0; i < atoms_.size(); i++) {
post->push_back(atoms_[i]);
GeneratePostfix(post, nstk + 1, ops, atoms + 1);
post->pop_back();
}
}
// Add operators if there are enough arguments.
if (ops < maxops_) {
for (int i = 0; i < ops_.size(); i++) {
const string& fmt = ops_[i];
int nargs = CountArgs(fmt);
if (nargs <= nstk) {
post->push_back(fmt);
GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
post->pop_back();
}
}
}
}
// Generates a random postfix command sequence.
// Stops and returns true once a single sequence has been generated.
bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
int ops, int atoms) {
for (;;) {
// Stop if we get to a single element, but only sometimes.
if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
RunPostfix(*post);
return true;
}
// Early out: if used too many operators or can't
// get back down to a single expression on the stack
// using binary operators, give up.
if (ops + nstk - 1 > maxops_)
return false;
// Add operators if there are enough arguments.
if (ops < maxops_ && acm_->Uniform(2) == 0) {
const string& fmt = ops_[acm_->Uniform(ops_.size())];
int nargs = CountArgs(fmt);
if (nargs <= nstk) {
post->push_back(fmt);
bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
ops + 1, atoms);
post->pop_back();
if (ret)
return true;
}
}
// Add atoms if there is room.
if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
post->pop_back();
if (ret)
return true;
}
}
}
// Interprets the postfix command sequence to create a regular expression
// passed to HandleRegexp. The results of operators like %s|%s are wrapped
// in (?: ) to avoid needing to maintain a precedence table.
void RegexpGenerator::RunPostfix(const vector<string>& post) {
stack<string> regexps;
for (int i = 0; i < post.size(); i++) {
switch (CountArgs(post[i])) {
default:
LOG(FATAL) << "Bad operator: " << post[i];
case 0:
regexps.push(post[i]);
break;
case 1: {
string a = regexps.top();
regexps.pop();
regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
break;
}
case 2: {
string b = regexps.top();
regexps.pop();
string a = regexps.top();
regexps.pop();
regexps.push("(?:" +
StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
")");
break;
}
}
}
if (regexps.size() != 1) {
// Internal error - should never happen.
printf("Bad regexp program:\n");
for (int i = 0; i < post.size(); i++) {
printf(" %s\n", CEscape(post[i]).c_str());
}
printf("Stack after running program:\n");
while (!regexps.empty()) {
printf(" %s\n", CEscape(regexps.top()).c_str());
regexps.pop();
}
LOG(FATAL) << "Bad regexp program.";
}
HandleRegexp(regexps.top());
HandleRegexp("^(?:" + regexps.top() + ")$");
HandleRegexp("^(?:" + regexps.top() + ")");
HandleRegexp("(?:" + regexps.top() + ")$");
}
// Split s into an vector of strings, one for each UTF-8 character.
vector<string> Explode(const StringPiece& s) {
vector<string> v;
for (const char *q = s.begin(); q < s.end(); ) {
const char* p = q;
Rune r;
q += chartorune(&r, q);
v.push_back(string(p, q - p));
}
return v;
}
// Split string everywhere a substring is found, returning
// vector of pieces.
vector<string> Split(const StringPiece& sep, const StringPiece& s) {
vector<string> v;
if (sep.size() == 0)
return Explode(s);
const char *p = s.begin();
for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
if (StringPiece(q, sep.size()) == sep) {
v.push_back(string(p, q - p));
p = q + sep.size();
q = p - 1; // -1 for ++ in loop
continue;
}
}
if (p < s.end())
v.push_back(string(p, s.end() - p));
return v;
}
} // namespace re2

View File

@ -0,0 +1,70 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression generator: generates all possible
// regular expressions within given parameters (see below for details).
#ifndef RE2_TESTING_REGEXP_GENERATOR_H__
#define RE2_TESTING_REGEXP_GENERATOR_H__
#include <string>
#include <vector>
#include "util/random.h"
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Regular expression generator.
//
// Given a set of atom expressions like "a", "b", or "."
// and operators like "%s*", generates all possible regular expressions
// using at most maxbases base expressions and maxops operators.
// For each such expression re, calls HandleRegexp(re).
//
// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
//
class RegexpGenerator {
public:
RegexpGenerator(int maxatoms, int maxops, const vector<string>& atoms,
const vector<string>& ops);
virtual ~RegexpGenerator() {}
// Generates all the regular expressions, calling HandleRegexp(re) for each.
void Generate();
// Generates n random regular expressions, calling HandleRegexp(re) for each.
void GenerateRandom(int32 seed, int n);
// Handles a regular expression. Must be provided by subclass.
virtual void HandleRegexp(const string& regexp) = 0;
// The egrep regexp operators: * + ? | and concatenation.
static const vector<string>& EgrepOps();
private:
void RunPostfix(const vector<string>& post);
void GeneratePostfix(vector<string>* post, int nstk, int ops, int lits);
bool GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int lits);
int maxatoms_; // Maximum number of atoms allowed in expr.
int maxops_; // Maximum number of ops allowed in expr.
vector<string> atoms_; // Possible atoms.
vector<string> ops_; // Possible ops.
ACMRandom* acm_; // Random generator.
DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator);
};
// Helpers for preparing arguments to RegexpGenerator constructor.
// Returns one string for each character in s.
vector<string> Explode(const StringPiece& s);
// Splits string everywhere sep is found, returning
// vector of pieces.
vector<string> Split(const StringPiece& sep, const StringPiece& s);
} // namespace re2
#endif // RE2_TESTING_REGEXP_GENERATOR_H__

View File

@ -0,0 +1,81 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test parse.cc, dump.cc, and tostring.cc.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// Test that overflowed ref counts work.
TEST(Regexp, BigRef) {
Regexp* re;
re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
for (int i = 0; i < 100000; i++)
re->Incref();
for (int i = 0; i < 100000; i++)
re->Decref();
CHECK_EQ(re->Ref(), 1);
re->Decref();
}
// Test that very large Concats work.
// Depends on overflowed ref counts working.
TEST(Regexp, BigConcat) {
Regexp* x;
x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
vector<Regexp*> v(90000, x); // ToString bails out at 100000
for (int i = 0; i < v.size(); i++)
x->Incref();
CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref();
Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags);
CHECK_EQ(re->ToString(), string(v.size(), 'x'));
re->Decref();
CHECK_EQ(x->Ref(), 1) << x->Ref();
x->Decref();
}
TEST(Regexp, NamedCaptures) {
Regexp* x;
RegexpStatus status;
x = Regexp::Parse(
"(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
EXPECT_TRUE(status.ok());
EXPECT_EQ(4, x->NumCaptures());
const map<string, int>* have = x->NamedCaptures();
EXPECT_TRUE(have != NULL);
EXPECT_EQ(2, have->size()); // there are only two named groups in
// the regexp: 'g1' and 'g2'.
map<string, int> want;
want["g1"] = 1;
want["g2"] = 3;
EXPECT_EQ(want, *have);
x->Decref();
delete have;
}
TEST(Regexp, CaptureNames) {
Regexp* x;
RegexpStatus status;
x = Regexp::Parse(
"(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
EXPECT_TRUE(status.ok());
EXPECT_EQ(4, x->NumCaptures());
const map<int, string>* have = x->CaptureNames();
EXPECT_TRUE(have != NULL);
EXPECT_EQ(3, have->size());
map<int, string> want;
want[1] = "g1";
want[3] = "g2";
want[4] = "g1";
EXPECT_EQ(want, *have);
x->Decref();
delete have;
}
} // namespace re2

View File

@ -0,0 +1,67 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
struct PrefixTest {
const char* regexp;
bool return_value;
const char* prefix;
bool foldcase;
const char* suffix;
};
static PrefixTest tests[] = {
// If the regexp is missing a ^, there's no required prefix.
{ "abc", false },
{ "", false },
{ "(?m)^", false },
// If the regexp immediately goes into
// something not a literal match, there's no required prefix.
{ "^(abc)", false },
{ "^a*", false },
// Otherwise, it should work.
{ "^abc$", true, "abc", false, "(?-m:$)" },
{ "^abc", "true", "abc", false, "" },
{ "^(?i)abc", true, "abc", true, "" },
{ "^abcd*", true, "abc", false, "d*" },
{ "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
{ "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
{ "^☺abc", true, "☺abc", false, "" },
};
TEST(RequiredPrefix, SimpleTests) {
for (int i = 0; i < arraysize(tests); i++) {
const PrefixTest& t = tests[i];
for (int j = 0; j < 2; j++) {
Regexp::ParseFlags flags = Regexp::LikePerl;
if (j == 0)
flags = flags | Regexp::Latin1;
Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
CHECK(re) << " " << t.regexp;
string p;
bool f = false;
Regexp* s = NULL;
CHECK_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
<< " " << t.regexp << " " << (j==0 ? "latin1" : "utf") << " " << re->Dump();
if (t.return_value) {
CHECK_EQ(p, string(t.prefix))
<< " " << t.regexp << " " << (j==0 ? "latin1" : "utf");
CHECK_EQ(f, t.foldcase)
<< " " << t.regexp << " " << (j==0 ? "latin1" : "utf");
CHECK_EQ(s->ToString(), string(t.suffix))
<< " " << t.regexp << " " << (j==0 ? "latin1" : "utf");
s->Decref();
}
re->Decref();
}
}
}
} // namespace re2

View File

@ -0,0 +1,325 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdlib.h>
#include <vector>
#include "util/test.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/testing/tester.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
struct RegexpTest {
const char* regexp;
const char* text;
};
RegexpTest simple_tests[] = {
{ "a", "a" },
{ "a", "zyzzyva" },
{ "a+", "aa" },
{ "(a+|b)+", "ab" },
{ "ab|cd", "xabcdx" },
{ "h.*od?", "hello\ngoodbye\n" },
{ "h.*o", "hello\ngoodbye\n" },
{ "h.*o", "goodbye\nhello\n" },
{ "h.*o", "hello world" },
{ "h.*o", "othello, world" },
{ "[^\\s\\S]", "aaaaaaa" },
{ "a", "aaaaaaa" },
{ "a*", "aaaaaaa" },
{ "a*", "" },
{ "a*", NULL },
{ "ab|cd", "xabcdx" },
{ "a", "cab" },
{ "a*b", "cab" },
{ "((((((((((((((((((((x))))))))))))))))))))", "x" },
{ "[abcd]", "xxxabcdxxx" },
{ "[^x]", "xxxabcdxxx" },
{ "[abcd]+", "xxxabcdxxx" },
{ "[^x]+", "xxxabcdxxx" },
{ "(fo|foo)", "fo" },
{ "(foo|fo)", "foo" },
{ "aa", "aA" },
{ "a", "Aa" },
{ "a", "A" },
{ "ABC", "abc" },
{ "abc", "XABCY" },
{ "ABC", "xabcy" },
// Make sure ^ and $ work.
// The pathological cases didn't work
// in the original grep code.
{ "foo|bar|[A-Z]", "foo" },
{ "^(foo|bar|[A-Z])", "foo" },
{ "(foo|bar|[A-Z])$", "foo\n" },
{ "(foo|bar|[A-Z])$", "foo" },
{ "^(foo|bar|[A-Z])$", "foo\n" },
{ "^(foo|bar|[A-Z])$", "foo" },
{ "^(foo|bar|[A-Z])$", "bar" },
{ "^(foo|bar|[A-Z])$", "X" },
{ "^(foo|bar|[A-Z])$", "XY" },
{ "^(fo|foo)$", "fo" },
{ "^(fo|foo)$", "foo" },
{ "^^(fo|foo)$", "fo" },
{ "^^(fo|foo)$", "foo" },
{ "^$", "" },
{ "^$", "x" },
{ "^^$", "" },
{ "^$$", "" },
{ "^^$", "x" },
{ "^$$", "x" },
{ "^^$$", "" },
{ "^^$$", "x" },
{ "^^^^^^^^$$$$$$$$", "" },
{ "^", "x" },
{ "$", "x" },
// Word boundaries.
{ "\\bfoo\\b", "nofoo foo that" },
{ "a\\b", "faoa x" },
{ "\\bbar", "bar x" },
{ "\\bbar", "foo\nbar x" },
{ "bar\\b", "foobar" },
{ "bar\\b", "foobar\nxxx" },
{ "(foo|bar|[A-Z])\\b", "foo" },
{ "(foo|bar|[A-Z])\\b", "foo\n" },
{ "\\b", "" },
{ "\\b", "x" },
{ "\\b(foo|bar|[A-Z])", "foo" },
{ "\\b(foo|bar|[A-Z])\\b", "X" },
{ "\\b(foo|bar|[A-Z])\\b", "XY" },
{ "\\b(foo|bar|[A-Z])\\b", "bar" },
{ "\\b(foo|bar|[A-Z])\\b", "foo" },
{ "\\b(foo|bar|[A-Z])\\b", "foo\n" },
{ "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
{ "\\b(fo|foo)\\b", "fo" },
{ "\\b(fo|foo)\\b", "foo" },
{ "\\b\\b", "" },
{ "\\b\\b", "x" },
{ "\\b$", "" },
{ "\\b$", "x" },
{ "\\b$", "y x" },
{ "\\b.$", "x" },
{ "^\\b(fo|foo)\\b", "fo" },
{ "^\\b(fo|foo)\\b", "foo" },
{ "^\\b", "" },
{ "^\\b", "x" },
{ "^\\b\\b", "" },
{ "^\\b\\b", "x" },
{ "^\\b$", "" },
{ "^\\b$", "x" },
{ "^\\b.$", "x" },
{ "^\\b.\\b$", "x" },
{ "^^^^^^^^\\b$$$$$$$", "" },
{ "^^^^^^^^\\b.$$$$$$", "x" },
{ "^^^^^^^^\\b$$$$$$$", "x" },
// Non-word boundaries.
{ "\\Bfoo\\B", "n foo xfoox that" },
{ "a\\B", "faoa x" },
{ "\\Bbar", "bar x" },
{ "\\Bbar", "foo\nbar x" },
{ "bar\\B", "foobar" },
{ "bar\\B", "foobar\nxxx" },
{ "(foo|bar|[A-Z])\\B", "foox" },
{ "(foo|bar|[A-Z])\\B", "foo\n" },
{ "\\B", "" },
{ "\\B", "x" },
{ "\\B(foo|bar|[A-Z])", "foo" },
{ "\\B(foo|bar|[A-Z])\\B", "xXy" },
{ "\\B(foo|bar|[A-Z])\\B", "XY" },
{ "\\B(foo|bar|[A-Z])\\B", "XYZ" },
{ "\\B(foo|bar|[A-Z])\\B", "abara" },
{ "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
{ "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
{ "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
{ "\\B(fo|foo)\\B", "xfoo" },
{ "\\B(foo|fo)\\B", "xfooo" },
{ "\\B\\B", "" },
{ "\\B\\B", "x" },
{ "\\B$", "" },
{ "\\B$", "x" },
{ "\\B$", "y x" },
{ "\\B.$", "x" },
{ "^\\B(fo|foo)\\B", "fo" },
{ "^\\B(fo|foo)\\B", "foo" },
{ "^\\B", "" },
{ "^\\B", "x" },
{ "^\\B\\B", "" },
{ "^\\B\\B", "x" },
{ "^\\B$", "" },
{ "^\\B$", "x" },
{ "^\\B.$", "x" },
{ "^\\B.\\B$", "x" },
{ "^^^^^^^^\\B$$$$$$$", "" },
{ "^^^^^^^^\\B.$$$$$$", "x" },
{ "^^^^^^^^\\B$$$$$$$", "x" },
// PCRE uses only ASCII for \b computation.
// All non-ASCII are *not* word characters.
{ "\\bx\\b", "x" },
{ "\\bx\\b", "x>" },
{ "\\bx\\b", "<x" },
{ "\\bx\\b", "<x>" },
{ "\\bx\\b", "ax" },
{ "\\bx\\b", "xb" },
{ "\\bx\\b", "axb" },
{ "\\bx\\b", "«x" },
{ "\\bx\\b", "" },
{ "\\bx\\b", "«x»" },
{ "\\bx\\b", "axb" },
{ "\\bx\\b", "áxβ" },
{ "\\Bx\\B", "axb" },
{ "\\Bx\\B", "áxβ" },
// Weird boundary cases.
{ "^$^$", "" },
{ "^$^", "" },
{ "$^$", "" },
{ "^$^$", "x" },
{ "^$^", "x" },
{ "$^$", "x" },
{ "^$^$", "x\ny" },
{ "^$^", "x\ny" },
{ "$^$", "x\ny" },
{ "^$^$", "x\n\ny" },
{ "^$^", "x\n\ny" },
{ "$^$", "x\n\ny" },
{ "^(foo\\$)$", "foo$bar" },
{ "(foo\\$)", "foo$bar" },
{ "^...$", "abc" },
// UTF-8
{ "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
{ "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^...$", ".\xe6\x9c\xac." },
{ "^\\C\\C\\C$", "\xe6\x9c\xac" },
{ "^\\C$", "\xe6\x9c\xac" },
{ "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
// Latin1
{ "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^...$", ".\xe6\x9c\xac." },
{ "^.....$", ".\xe6\x9c\xac." },
// Perl v Posix
{ "\\B(fo|foo)\\B", "xfooo" },
{ "(fo|foo)", "foo" },
// Octal escapes.
{ "\\141", "a" },
{ "\\060", "0" },
{ "\\0600", "00" },
{ "\\608", "08" },
{ "\\01", "\01" },
{ "\\018", "\01" "8" },
// Hexadecimal escapes
{ "\\x{61}", "a" },
{ "\\x61", "a" },
{ "\\x{00000061}", "a" },
// Unicode scripts.
{ "\\p{Greek}+", "aαβb" },
{ "\\P{Greek}+", "aαβb" },
{ "\\p{^Greek}+", "aαβb" },
{ "\\P{^Greek}+", "aαβb" },
// Unicode properties. Nd is decimal number. N is any number.
{ "[^0-9]+", "abc123" },
{ "\\p{Nd}+", "abc123²³¼½¾₀₉" },
{ "\\p{^Nd}+", "abc123²³¼½¾₀₉" },
{ "\\P{Nd}+", "abc123²³¼½¾₀₉" },
{ "\\P{^Nd}+", "abc123²³¼½¾₀₉" },
{ "\\pN+", "abc123²³¼½¾₀₉" },
{ "\\p{N}+", "abc123²³¼½¾₀₉" },
{ "\\p{^N}+", "abc123²³¼½¾₀₉" },
{ "\\p{Any}+", "abc123" },
// Character classes & case folding.
{ "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B
{ "(?i)[A-Z]+", "aAzZ" },
{ "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z -
// splits the ranges in an interesting way.
// would like to use, but PCRE mishandles in full-match, non-greedy mode
// { "(?i)[\\\\]+", "Aa" },
{ "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
// Character classes & case folding.
{ "[@-A]+", "@AaB" },
{ "[A-Z]+", "aAzZ" },
{ "[^\\\\]+", "Aa\\" },
{ "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
// Anchoring. (^abc in aabcdef was a former bug)
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
{ "^abc", "abcdef" },
{ "^abc", "aabcdef" },
{ "^[ay]*[bx]+c", "abcdef" },
{ "^[ay]*[bx]+c", "aabcdef" },
{ "def$", "abcdef" },
{ "def$", "abcdeff" },
{ "d[ex][fy]$", "abcdef" },
{ "d[ex][fy]$", "abcdeff" },
{ "[dz][ex][fy]$", "abcdef" },
{ "[dz][ex][fy]$", "abcdeff" },
{ "(?m)^abc", "abcdef" },
{ "(?m)^abc", "aabcdef" },
{ "(?m)^[ay]*[bx]+c", "abcdef" },
{ "(?m)^[ay]*[bx]+c", "aabcdef" },
{ "(?m)def$", "abcdef" },
{ "(?m)def$", "abcdeff" },
{ "(?m)d[ex][fy]$", "abcdef" },
{ "(?m)d[ex][fy]$", "abcdeff" },
{ "(?m)[dz][ex][fy]$", "abcdef" },
{ "(?m)[dz][ex][fy]$", "abcdeff" },
{ "^", "a" },
{ "^^", "a" },
// Context.
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
{ "a", "a" },
{ "ab*", "a" },
{ "a\\C*", "a" },
// Former bugs.
{ "a\\C*|ba\\C", "baba" },
};
TEST(Regexp, SearchTests) {
int failures = 0;
for (int i = 0; i < arraysize(simple_tests); i++) {
const RegexpTest& t = simple_tests[i];
if (!TestRegexpOnText(t.regexp, t.text))
failures++;
#ifdef LOGGING
// Build a dummy ExhaustiveTest call that will trigger just
// this one test, so that we log the test case.
vector<string> atom, alpha, ops;
atom.push_back(StringPiece(t.regexp).as_string());
alpha.push_back(StringPiece(t.text).as_string());
ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", "");
#endif
}
EXPECT_EQ(failures, 0);
}
} // namespace re2

102
re2/re2/testing/set_test.cc Normal file
View File

@ -0,0 +1,102 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <sys/types.h>
#include <sys/stat.h>
#include <vector>
#include "util/test.h"
#include "re2/re2.h"
#include "re2/set.h"
namespace re2 {
TEST(Set, Unanchored) {
RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
CHECK_EQ(s.Add("foo", NULL), 0);
CHECK_EQ(s.Add("(", NULL), -1);
CHECK_EQ(s.Add("bar", NULL), 1);
CHECK_EQ(s.Compile(), true);
vector<int> v;
CHECK_EQ(s.Match("foobar", &v), true);
CHECK_EQ(v.size(), 2);
CHECK_EQ(v[0], 0);
CHECK_EQ(v[1], 1);
v.clear();
CHECK_EQ(s.Match("fooba", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 0);
v.clear();
CHECK_EQ(s.Match("oobar", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 1);
}
TEST(Set, UnanchoredFactored) {
RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
CHECK_EQ(s.Add("foo", NULL), 0);
CHECK_EQ(s.Add("(", NULL), -1);
CHECK_EQ(s.Add("foobar", NULL), 1);
CHECK_EQ(s.Compile(), true);
vector<int> v;
CHECK_EQ(s.Match("foobar", &v), true);
CHECK_EQ(v.size(), 2);
CHECK_EQ(v[0], 0);
CHECK_EQ(v[1], 1);
v.clear();
CHECK_EQ(s.Match("obarfoobaroo", &v), true);
CHECK_EQ(v.size(), 2);
CHECK_EQ(v[0], 0);
CHECK_EQ(v[1], 1);
v.clear();
CHECK_EQ(s.Match("fooba", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 0);
v.clear();
CHECK_EQ(s.Match("oobar", &v), false);
CHECK_EQ(v.size(), 0);
}
TEST(Set, Anchored) {
RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
CHECK_EQ(s.Add("foo", NULL), 0);
CHECK_EQ(s.Add("(", NULL), -1);
CHECK_EQ(s.Add("bar", NULL), 1);
CHECK_EQ(s.Compile(), true);
vector<int> v;
CHECK_EQ(s.Match("foobar", &v), false);
CHECK_EQ(v.size(), 0);
CHECK_EQ(s.Match("fooba", &v), false);
CHECK_EQ(v.size(), 0);
CHECK_EQ(s.Match("oobar", &v), false);
CHECK_EQ(v.size(), 0);
CHECK_EQ(s.Match("foo", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 0);
CHECK_EQ(s.Match("bar", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 1);
}
} // namespace re2

View File

@ -0,0 +1,167 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test simplify.cc.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
struct Test {
const char* regexp;
const char* simplified;
};
static Test tests[] = {
// Already-simple constructs
{ "a", "a" },
{ "ab", "ab" },
{ "a|b", "[a-b]" },
{ "ab|cd", "ab|cd" },
{ "(ab)*", "(ab)*" },
{ "(ab)+", "(ab)+" },
{ "(ab)?", "(ab)?" },
{ ".", "." },
{ "^", "^" },
{ "$", "$" },
{ "[ac]", "[ac]" },
{ "[^ac]", "[^ac]" },
// Posix character classes
{ "[[:alnum:]]", "[0-9A-Za-z]" },
{ "[[:alpha:]]", "[A-Za-z]" },
{ "[[:blank:]]", "[\\t ]" },
{ "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
{ "[[:digit:]]", "[0-9]" },
{ "[[:graph:]]", "[!-~]" },
{ "[[:lower:]]", "[a-z]" },
{ "[[:print:]]", "[ -~]" },
{ "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
{ "[[:space:]]" , "[\\t-\\r ]" },
{ "[[:upper:]]", "[A-Z]" },
{ "[[:xdigit:]]", "[0-9A-Fa-f]" },
// Perl character classes
{ "\\d", "[0-9]" },
{ "\\s", "[\\t-\\n\\f-\\r ]" },
{ "\\w", "[0-9A-Z_a-z]" },
{ "\\D", "[^0-9]" },
{ "\\S", "[^\\t-\\n\\f-\\r ]" },
{ "\\W", "[^0-9A-Z_a-z]" },
{ "[\\d]", "[0-9]" },
{ "[\\s]", "[\\t-\\n\\f-\\r ]" },
{ "[\\w]", "[0-9A-Z_a-z]" },
{ "[\\D]", "[^0-9]" },
{ "[\\S]", "[^\\t-\\n\\f-\\r ]" },
{ "[\\W]", "[^0-9A-Z_a-z]" },
// Posix repetitions
{ "a{1}", "a" },
{ "a{2}", "aa" },
{ "a{5}", "aaaaa" },
{ "a{0,1}", "a?" },
// The next three are illegible because Simplify inserts (?:)
// parens instead of () parens to avoid creating extra
// captured subexpressions. The comments show a version fewer parens.
{ "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
{ "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
{ "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
{ "a{0,2}", "(?:aa?)?" }, // (aa?)?
{ "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
{ "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
{ "a{0,}", "a*" },
{ "a{1,}", "a+" },
{ "a{2,}", "aa+" },
{ "a{5,}", "aaaaa+" },
// Test that operators simplify their arguments.
// (Simplify used to not simplify arguments to a {} repeat.)
{ "(?:a{1,}){1,}", "a+" },
{ "(a{1,}b{1,})", "(a+b+)" },
{ "a{1,}|b{1,}", "a+|b+" },
{ "(?:a{1,})*", "(?:a+)*" },
{ "(?:a{1,})+", "a+" },
{ "(?:a{1,})?", "(?:a+)?" },
{ "a{0}", "" },
// Character class simplification
{ "[ab]", "[a-b]" },
{ "[a-za-za-z]", "[a-z]" },
{ "[A-Za-zA-Za-z]", "[A-Za-z]" },
{ "[ABCDEFGH]", "[A-H]" },
{ "[AB-CD-EF-GH]", "[A-H]" },
{ "[W-ZP-XE-R]", "[E-Z]" },
{ "[a-ee-gg-m]", "[a-m]" },
{ "[a-ea-ha-m]", "[a-m]" },
{ "[a-ma-ha-e]", "[a-m]" },
{ "[a-zA-Z0-9 -~]", "[ -~]" },
// Empty character classes
{ "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
// Full character classes
{ "[[:cntrl:][:^cntrl:]]", "." },
// Unicode case folding.
{ "(?i)A", "[Aa]" },
{ "(?i)a", "[Aa]" },
{ "(?i)K", "[Kk\\x{212a}]" },
{ "(?i)k", "[Kk\\x{212a}]" },
{ "(?i)\\x{212a}", "[Kk\\x{212a}]" },
{ "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
{ "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
{ "(?i)[\\x00-\\x{10ffff}]", "." },
// Empty string as a regular expression.
// Empty string must be preserved inside parens in order
// to make submatches work right, so these are less
// interesting than they used to be. ToString inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{ "(a|b|)", "([a-b]|(?:))" },
{ "(|)", "()" },
{ "a()", "a()" },
{ "(()|())", "(()|())" },
{ "(a|)", "(a|(?:))" },
{ "ab()cd()", "ab()cd()" },
{ "()", "()" },
{ "()*", "()*" },
{ "()+", "()+" },
{ "()?" , "()?" },
{ "(){0}", "" },
{ "(){1}", "()" },
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },
};
TEST(TestSimplify, SimpleRegexps) {
for (int i = 0; i < arraysize(tests); i++) {
RegexpStatus status;
VLOG(1) << "Testing " << tests[i].regexp;
Regexp* re = Regexp::Parse(tests[i].regexp,
Regexp::MatchNL | (Regexp::LikePerl &
~Regexp::OneLine),
&status);
CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
Regexp* sre = re->Simplify();
CHECK(sre != NULL);
// Check that already-simple regexps don't allocate new ones.
if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
CHECK(re == sre) << " " << tests[i].regexp
<< " " << re->ToString() << " " << sre->ToString();
}
EXPECT_EQ(tests[i].simplified, sre->ToString())
<< " " << tests[i].regexp << " " << sre->Dump();
re->Decref();
sre->Decref();
}
}
} // namespace re2

View File

@ -0,0 +1,113 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/testing/string_generator.h"
namespace re2 {
StringGenerator::StringGenerator(int maxlen, const vector<string>& alphabet)
: maxlen_(maxlen), alphabet_(alphabet),
generate_null_(false),
random_(false), nrandom_(0), acm_(NULL) {
// Degenerate case: no letters, no non-empty strings.
if (alphabet_.size() == 0)
maxlen_ = 0;
// Next() will return empty string (digits_ is empty).
hasnext_ = true;
}
StringGenerator::~StringGenerator() {
delete acm_;
}
// Resets the string generator state to the beginning.
void StringGenerator::Reset() {
digits_.clear();
hasnext_ = true;
random_ = false;
nrandom_ = 0;
generate_null_ = false;
}
// Increments the big number in digits_, returning true if successful.
// Returns false if all the numbers have been used.
bool StringGenerator::IncrementDigits() {
// First try to increment the current number.
for (int i = digits_.size() - 1; i >= 0; i--) {
if (++digits_[i] < alphabet_.size())
return true;
digits_[i] = 0;
}
// If that failed, make a longer number.
if (digits_.size() < maxlen_) {
digits_.push_back(0);
return true;
}
return false;
}
// Generates random digits_, return true if successful.
// Returns false if the random sequence is over.
bool StringGenerator::RandomDigits() {
if (--nrandom_ <= 0)
return false;
// Pick length.
int len = acm_->Uniform(maxlen_+1);
digits_.resize(len);
for (int i = 0; i < len; i++)
digits_[i] = acm_->Uniform(alphabet_.size());
return true;
}
// Returns the next string in the iteration, which is the one
// currently described by digits_. Calls IncrementDigits
// after computing the string, so that it knows the answer
// for subsequent HasNext() calls.
const StringPiece& StringGenerator::Next() {
CHECK(hasnext_);
if (generate_null_) {
generate_null_ = false;
sp_ = NULL;
return sp_;
}
s_.clear();
for (int i = 0; i < digits_.size(); i++) {
s_ += alphabet_[digits_[i]];
}
hasnext_ = random_ ? RandomDigits() : IncrementDigits();
sp_ = s_;
return sp_;
}
// Sets generator up to return n random strings.
void StringGenerator::Random(int32 seed, int n) {
if (acm_ == NULL)
acm_ = new ACMRandom(seed);
else
acm_->Reset(seed);
random_ = true;
nrandom_ = n;
hasnext_ = nrandom_ > 0;
}
void StringGenerator::GenerateNULL() {
generate_null_ = true;
hasnext_ = true;
}
} // namespace re2

View File

@ -0,0 +1,58 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
#ifndef RE2_TESTING_STRING_GENERATOR_H__
#define RE2_TESTING_STRING_GENERATOR_H__
#include <string>
#include <vector>
#include "util/util.h"
#include "util/random.h"
#include "re2/stringpiece.h"
namespace re2 {
class StringGenerator {
public:
StringGenerator(int maxlen, const vector<string>& alphabet);
~StringGenerator();
const StringPiece& Next();
bool HasNext() { return hasnext_; }
// Resets generator to start sequence over.
void Reset();
// Causes generator to emit random strings for next n calls to Next().
void Random(int32 seed, int n);
// Causes generator to emit a NULL as the next call.
void GenerateNULL();
private:
bool IncrementDigits();
bool RandomDigits();
// Global state.
int maxlen_; // Maximum length string to generate.
vector<string> alphabet_; // Alphabet, one string per letter.
// Iteration state.
StringPiece sp_; // Last StringPiece returned by Next().
string s_; // String data in last StringPiece returned by Next().
bool hasnext_; // Whether Next() can be called again.
vector<int> digits_; // Alphabet indices for next string.
bool generate_null_; // Whether to generate a NULL StringPiece next.
bool random_; // Whether generated strings are random.
int nrandom_; // Number of random strings left to generate.
ACMRandom* acm_; // Random number generator
DISALLOW_EVIL_CONSTRUCTORS(StringGenerator);
};
} // namespace re2
#endif // RE2_TESTING_STRING_GENERATOR_H__

View File

@ -0,0 +1,109 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test StringGenerator.
#include <stdlib.h>
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/testing/string_generator.h"
#include "re2/testing/regexp_generator.h"
namespace re2 {
// Returns i to the e.
static int64 IntegerPower(int i, int e) {
int64 p = 1;
while (e-- > 0)
p *= i;
return p;
}
// Checks that for given settings of the string generator:
// * it generates strings that are non-decreasing in length.
// * strings of the same length are sorted in alphabet order.
// * it doesn't generate the same string twice.
// * it generates the right number of strings.
//
// If all of these hold, the StringGenerator is behaving.
// Assumes that the alphabet is sorted, so that the generated
// strings can just be compared lexicographically.
static void RunTest(int len, string alphabet, bool donull) {
StringGenerator g(len, Explode(alphabet));
int n = 0;
int last_l = -1;
string last_s;
if (donull) {
g.GenerateNULL();
EXPECT_TRUE(g.HasNext());
StringPiece sp = g.Next();
EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
EXPECT_EQ(sp.size(), 0);
}
while (g.HasNext()) {
string s = g.Next().as_string();
n++;
// Check that all characters in s appear in alphabet.
for (const char *p = s.c_str(); *p != '\0'; ) {
Rune r;
p += chartorune(&r, p);
EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
}
// Check that string is properly ordered w.r.t. previous string.
int l = utflen(s.c_str());
EXPECT_LE(l, len);
if (last_l < l) {
last_l = l;
} else {
EXPECT_EQ(last_l, l);
EXPECT_LT(last_s, s);
}
last_s = s;
}
// Check total string count.
int64 m = 0;
int alpha = utflen(alphabet.c_str());
if (alpha == 0) // Degenerate case.
len = 0;
for (int i = 0; i <= len; i++)
m += IntegerPower(alpha, i);
EXPECT_EQ(n, m);
}
TEST(StringGenerator, NoLength) {
RunTest(0, "abc", false);
}
TEST(StringGenerator, NoLengthNoAlphabet) {
RunTest(0, "", false);
}
TEST(StringGenerator, NoAlphabet) {
RunTest(5, "", false);
}
TEST(StringGenerator, Simple) {
RunTest(3, "abc", false);
}
TEST(StringGenerator, UTF8) {
RunTest(4, "abc\xE2\x98\xBA", false);
}
TEST(StringGenerator, GenNULL) {
RunTest(0, "abc", true);
RunTest(0, "", true);
RunTest(5, "", true);
RunTest(3, "abc", true);
RunTest(4, "abc\xE2\x98\xBA", true);
}
} // namespace re2

640
re2/re2/testing/tester.cc Normal file
View File

@ -0,0 +1,640 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression engine tester -- test all the implementations against each other.
#include "util/util.h"
#include "util/flags.h"
#include "re2/testing/tester.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
DEFINE_bool(dump_prog, false, "dump regexp program");
DEFINE_bool(log_okay, false, "log successful runs");
DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
DEFINE_int32(max_regexp_failures, 100,
"maximum number of regexp test failures (-1 = unlimited)");
DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
namespace re2 {
enum {
kMaxSubmatch = 1+16, // $0...$16
};
const char* engine_types[kEngineMax] = {
"Backtrack",
"NFA",
"DFA",
"DFA1",
"OnePass",
"BitState",
"RE2",
"RE2a",
"RE2b",
"PCRE",
};
// Returns the name string for the type t.
static string EngineString(Engine t) {
if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
return StringPrintf("type%d", static_cast<int>(t));
}
return engine_types[t];
}
// Returns bit mask of engines to use.
static uint32 Engines() {
static uint32 cached_engines;
static bool did_parse;
if (did_parse)
return cached_engines;
if (FLAGS_regexp_engines.empty()) {
cached_engines = ~0;
} else {
for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
cached_engines |= 1<<i;
}
if (cached_engines == 0)
LOG(INFO) << "Warning: no engines enabled.";
if (!UsingPCRE)
cached_engines &= ~(1<<kEnginePCRE);
for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
if (cached_engines & (1<<i))
LOG(INFO) << EngineString(i) << " enabled";
}
did_parse = true;
return cached_engines;
}
// The result of running a match.
struct TestInstance::Result {
bool skipped; // test skipped: wasn't applicable
bool matched; // found a match
bool untrusted; // don't really trust the answer
bool have_submatch; // computed all submatch info
bool have_submatch0; // computed just submatch[0]
StringPiece submatch[kMaxSubmatch];
};
typedef TestInstance::Result Result;
// Formats a single capture range s in text in the form (a,b)
// where a and b are the starting and ending offsets of s in text.
static string FormatCapture(const StringPiece& text, const StringPiece& s) {
if (s.begin() == NULL)
return "(?,?)";
return StringPrintf("(%d,%d)",
static_cast<int>(s.begin() - text.begin()),
static_cast<int>(s.end() - text.begin()));
}
// Returns whether text contains non-ASCII (>= 0x80) bytes.
static bool NonASCII(const StringPiece& text) {
for (int i = 0; i < text.size(); i++)
if ((uint8)text[i] >= 0x80)
return true;
return false;
}
// Returns string representation of match kind.
static string FormatKind(Prog::MatchKind kind) {
switch (kind) {
case Prog::kFullMatch:
return "full match";
case Prog::kLongestMatch:
return "longest match";
case Prog::kFirstMatch:
return "first match";
case Prog::kManyMatch:
return "many match";
}
return "???";
}
// Returns string representation of anchor kind.
static string FormatAnchor(Prog::Anchor anchor) {
switch (anchor) {
case Prog::kAnchored:
return "anchored";
case Prog::kUnanchored:
return "unanchored";
}
return "???";
}
struct ParseMode {
Regexp::ParseFlags parse_flags;
string desc;
};
static const Regexp::ParseFlags single_line =
Regexp::LikePerl;
static const Regexp::ParseFlags multi_line =
static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
static ParseMode parse_modes[] = {
{ single_line, "single-line" },
{ single_line|Regexp::Latin1, "single-line, latin1" },
{ multi_line, "multiline" },
{ multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
{ multi_line|Regexp::Latin1, "multiline, latin1" },
};
static string FormatMode(Regexp::ParseFlags flags) {
for (int i = 0; i < arraysize(parse_modes); i++)
if (parse_modes[i].parse_flags == flags)
return parse_modes[i].desc;
return StringPrintf("%#x", static_cast<uint>(flags));
}
// Constructs and saves all the matching engines that
// will be required for the given tests.
TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
Regexp::ParseFlags flags)
: regexp_str_(regexp_str),
kind_(kind),
flags_(flags),
error_(false),
regexp_(NULL),
num_captures_(0),
prog_(NULL),
rprog_(NULL),
re_(NULL),
re2_(NULL) {
VLOG(1) << CEscape(regexp_str);
// Compile regexp to prog.
// Always required - needed for backtracking (reference implementation).
RegexpStatus status;
regexp_ = Regexp::Parse(regexp_str, flags, &status);
if (regexp_ == NULL) {
LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
<< " mode: " << FormatMode(flags);
error_ = true;
return;
}
num_captures_ = regexp_->NumCaptures();
prog_ = regexp_->CompileToProg(0);
if (prog_ == NULL) {
LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
error_ = true;
return;
}
if (FLAGS_dump_prog) {
LOG(INFO) << "Prog for "
<< " regexp "
<< CEscape(regexp_str_)
<< " (" << FormatKind(kind_)
<< ", " << FormatMode(flags_)
<< ")\n"
<< prog_->Dump();
}
// Compile regexp to reversed prog. Only needed for DFA engines.
if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
rprog_ = regexp_->CompileToReverseProg(0);
if (rprog_ == NULL) {
LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
error_ = true;
return;
}
if (FLAGS_dump_rprog)
LOG(INFO) << rprog_->Dump();
}
// Create re string that will be used for RE and RE2.
string re = regexp_str.as_string();
// Accomodate flags.
// Regexp::Latin1 will be accomodated below.
if (!(flags & Regexp::OneLine))
re = "(?m)" + re;
if (flags & Regexp::NonGreedy)
re = "(?U)" + re;
if (flags & Regexp::DotNL)
re = "(?s)" + re;
// Compile regexp to RE2.
if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
RE2::Options options;
if (flags & Regexp::Latin1)
options.set_encoding(RE2::Options::EncodingLatin1);
if (kind_ == Prog::kLongestMatch)
options.set_longest_match(true);
re2_ = new RE2(re, options);
if (!re2_->error().empty()) {
LOG(INFO) << "Cannot RE2: " << CEscape(re);
error_ = true;
return;
}
}
// Compile regexp to RE.
// PCRE as exposed by the RE interface isn't always usable.
// 1. It disagrees about handling of empty-string reptitions
// like matching (a*)* against "b". PCRE treats the (a*) as
// occurring once, while we treat it as occurring not at all.
// 2. It treats $ as this weird thing meaning end of string
// or before the \n at the end of the string.
// 3. It doesn't implement POSIX leftmost-longest matching.
// MimicsPCRE() detects 1 and 2.
if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
kind_ != Prog::kLongestMatch) {
PCRE_Options o;
o.set_option(PCRE::UTF8);
if (flags & Regexp::Latin1)
o.set_option(PCRE::None);
// PCRE has interface bug keeping us from finding $0, so
// add one more layer of parens.
re_ = new PCRE("("+re+")", o);
if (!re_->error().empty()) {
LOG(INFO) << "Cannot PCRE: " << CEscape(re);
error_ = true;
return;
}
}
}
TestInstance::~TestInstance() {
if (regexp_)
regexp_->Decref();
delete prog_;
delete rprog_;
delete re_;
delete re2_;
}
// Runs a single search using the named engine type.
// This interface hides all the irregularities of the various
// engine interfaces from the rest of this file.
void TestInstance::RunSearch(Engine type,
const StringPiece& orig_text,
const StringPiece& orig_context,
Prog::Anchor anchor,
Result *result) {
memset(result, 0, sizeof *result);
if (regexp_ == NULL) {
result->skipped = true;
return;
}
int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
if (nsubmatch > kMaxSubmatch)
nsubmatch = kMaxSubmatch;
StringPiece text = orig_text;
StringPiece context = orig_context;
switch (type) {
default:
LOG(FATAL) << "Bad RunSearch type: " << (int)type;
case kEngineBacktrack:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched =
prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
case kEngineNFA:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched =
prog_->SearchNFA(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
case kEngineDFA:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
&result->skipped, NULL);
break;
case kEngineDFA1:
if (prog_ == NULL || rprog_ == NULL) {
result->skipped = true;
break;
}
result->matched =
prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
&result->skipped, NULL);
// If anchored, no need for second run,
// but do it anyway to find more bugs.
if (result->matched) {
if (!rprog_->SearchDFA(result->submatch[0], context,
Prog::kAnchored, Prog::kLongestMatch,
result->submatch,
&result->skipped, NULL)) {
LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
<< " on " << CEscape(text);
result->matched = false;
}
}
result->have_submatch0 = true;
break;
case kEngineOnePass:
if (prog_ == NULL ||
anchor == Prog::kUnanchored ||
!prog_->IsOnePass() ||
nsubmatch > Prog::kMaxOnePassCapture) {
result->skipped = true;
break;
}
result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
case kEngineBitState:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched = prog_->SearchBitState(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
case kEngineRE2:
case kEngineRE2a:
case kEngineRE2b: {
if (!re2_ || text.end() != context.end()) {
result->skipped = true;
break;
}
RE2::Anchor re_anchor;
if (anchor == Prog::kAnchored)
re_anchor = RE2::ANCHOR_START;
else
re_anchor = RE2::UNANCHORED;
if (kind_ == Prog::kFullMatch)
re_anchor = RE2::ANCHOR_BOTH;
result->matched = re2_->Match(context,
text.begin() - context.begin(),
text.end() - context.begin(),
re_anchor, result->submatch, nsubmatch);
result->have_submatch = nsubmatch > 0;
break;
}
case kEnginePCRE: {
if (!re_ || text.begin() != context.begin() ||
text.end() != context.end()) {
result->skipped = true;
break;
}
const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
PCRE::Arg *a = new PCRE::Arg[nsubmatch];
for (int i = 0; i < nsubmatch; i++) {
a[i] = PCRE::Arg(&result->submatch[i]);
argptr[i] = &a[i];
}
int consumed;
PCRE::Anchor pcre_anchor;
if (anchor == Prog::kAnchored)
pcre_anchor = PCRE::ANCHOR_START;
else
pcre_anchor = PCRE::UNANCHORED;
if (kind_ == Prog::kFullMatch)
pcre_anchor = PCRE::ANCHOR_BOTH;
re_->ClearHitLimit();
result->matched =
re_->DoMatch(text,
pcre_anchor,
&consumed,
argptr, nsubmatch);
if (re_->HitLimit()) {
result->untrusted = true;
delete[] argptr;
delete[] a;
break;
}
result->have_submatch = true;
// Work around RE interface bug: PCRE returns -1 as the
// offsets for an unmatched subexpression, and RE should
// turn that into StringPiece(NULL) but in fact it uses
// StringPiece(text.begin() - 1, 0). Oops.
for (int i = 0; i < nsubmatch; i++)
if (result->submatch[i].begin() == text.begin() - 1)
result->submatch[i] = NULL;
delete[] argptr;
delete[] a;
break;
}
}
if (!result->matched)
memset(result->submatch, 0, sizeof result->submatch);
}
// Checks whether r is okay given that correct is the right answer.
// Specifically, r's answers have to match (but it doesn't have to
// claim to have all the answers).
static bool ResultOkay(const Result& r, const Result& correct) {
if (r.skipped)
return true;
if (r.matched != correct.matched)
return false;
if (r.have_submatch || r.have_submatch0) {
for (int i = 0; i < kMaxSubmatch; i++) {
if (correct.submatch[i].begin() != r.submatch[i].begin() ||
correct.submatch[i].size() != r.submatch[i].size())
return false;
if (!r.have_submatch)
break;
}
}
return true;
}
// Runs a single test.
bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
// Backtracking is the gold standard.
Result correct;
RunSearch(kEngineBacktrack, text, context, anchor, &correct);
if (correct.skipped) {
if (regexp_ == NULL)
return true;
LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
<< " " << FormatMode(flags_);
return false;
}
VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
<< " text " << CEscape(text)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
<< ")";
// Compare the others.
bool all_okay = true;
for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
if (!(Engines() & (1<<i)))
continue;
Result r;
RunSearch(i, text, context, anchor, &r);
if (ResultOkay(r, correct)) {
if (FLAGS_log_okay)
LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
continue;
}
// We disagree with PCRE on the meaning of some Unicode matches.
// In particular, we treat all non-ASCII UTF-8 as word characters.
// We also treat "empty" character sets like [^\w\W] as being
// impossible to match, while PCRE apparently excludes some code
// points (e.g., 0x0080) from both \w and \W.
if (i == kEnginePCRE && NonASCII(text))
continue;
if (!r.untrusted)
all_okay = false;
LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
context, anchor);
if (r.matched != correct.matched) {
if (r.matched) {
LOG(INFO) << " Should not match (but does).";
} else {
LOG(INFO) << " Should match (but does not).";
continue;
}
}
for (int i = 0; i < 1+num_captures_; i++) {
if (r.submatch[i].begin() != correct.submatch[i].begin() ||
r.submatch[i].end() != correct.submatch[i].end()) {
LOG(INFO) <<
StringPrintf(" $%d: should be %s is %s",
i,
FormatCapture(text, correct.submatch[i]).c_str(),
FormatCapture(text, r.submatch[i]).c_str());
} else {
LOG(INFO) <<
StringPrintf(" $%d: %s ok", i,
FormatCapture(text, r.submatch[i]).c_str());
}
}
}
if (!all_okay) {
if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
LOG(QFATAL) << "Too many regexp failures.";
}
return all_okay;
}
void TestInstance::LogMatch(const char* prefix, Engine e,
const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
LOG(INFO) << prefix
<< EngineString(e)
<< " regexp "
<< CEscape(regexp_str_)
<< " "
<< CEscape(regexp_->ToString())
<< " text "
<< CEscape(text)
<< " ("
<< text.begin() - context.begin()
<< ","
<< text.end() - context.begin()
<< ") of context "
<< CEscape(context)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
<< ")";
}
static Prog::MatchKind kinds[] = {
Prog::kFirstMatch,
Prog::kLongestMatch,
Prog::kFullMatch,
};
// Test all possible match kinds and parse modes.
Tester::Tester(const StringPiece& regexp) {
error_ = false;
for (int i = 0; i < arraysize(kinds); i++) {
for (int j = 0; j < arraysize(parse_modes); j++) {
TestInstance* t = new TestInstance(regexp, kinds[i],
parse_modes[j].parse_flags);
error_ |= t->error();
v_.push_back(t);
}
}
}
Tester::~Tester() {
for (int i = 0; i < v_.size(); i++)
delete v_[i];
}
bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
bool okay = true;
for (int i = 0; i < v_.size(); i++)
okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
return okay;
}
static Prog::Anchor anchors[] = {
Prog::kAnchored,
Prog::kUnanchored
};
bool Tester::TestInput(const StringPiece& text) {
bool okay = TestInputInContext(text, text);
if (text.size() > 0) {
StringPiece sp;
sp = text;
sp.remove_prefix(1);
okay &= TestInputInContext(sp, text);
sp = text;
sp.remove_suffix(1);
okay &= TestInputInContext(sp, text);
}
return okay;
}
bool Tester::TestInputInContext(const StringPiece& text,
const StringPiece& context) {
bool okay = true;
for (int i = 0; i < arraysize(anchors); i++)
okay &= TestCase(text, context, anchors[i]);
return okay;
}
bool TestRegexpOnText(const StringPiece& regexp,
const StringPiece& text) {
Tester t(regexp);
return t.TestInput(text);
}
} // namespace re2

121
re2/re2/testing/tester.h Normal file
View File

@ -0,0 +1,121 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Comparative tester for regular expression matching.
// Checks all implementations against each other.
#ifndef RE2_TESTING_TESTER_H__
#define RE2_TESTING_TESTER_H__
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/re2.h"
#include "util/pcre.h"
namespace re2 {
class Regexp;
// All the supported regexp engines.
enum Engine {
kEngineBacktrack = 0, // Prog::BadSearchBacktrack
kEngineNFA, // Prog::SearchNFA
kEngineDFA, // Prog::SearchDFA, only ask whether it matched
kEngineDFA1, // Prog::SearchDFA, ask for match[0]
kEngineOnePass, // Prog::SearchOnePass, if applicable
kEngineBitState, // Prog::SearchBitState
kEngineRE2, // RE2, all submatches
kEngineRE2a, // RE2, only ask for match[0]
kEngineRE2b, // RE2, only ask whether it matched
kEnginePCRE, // PCRE (util/pcre.h)
kEngineMax,
};
// Make normal math on the enum preserve the type.
// By default, C++ doesn't define ++ on enum, and e+1 has type int.
static inline void operator++(Engine& e, int unused) {
e = static_cast<Engine>(e+1);
}
static inline Engine operator+(Engine e, int i) {
return static_cast<Engine>(static_cast<int>(e)+i);
}
// A TestInstance caches per-regexp state for a given
// regular expression in a given configuration
// (UTF-8 vs Latin1, longest vs first match, etc.).
class TestInstance {
public:
struct Result;
TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
Regexp::ParseFlags flags);
~TestInstance();
Regexp::ParseFlags flags() { return flags_; }
bool error() { return error_; }
// Runs a single test case: search in text, which is in context,
// using the given anchoring.
bool RunCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor);
private:
// Runs a single search using the named engine type.
void RunSearch(Engine type,
const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor,
Result *result);
void LogMatch(const char* prefix, Engine e, const StringPiece& text,
const StringPiece& context, Prog::Anchor anchor);
const StringPiece& regexp_str_; // regexp being tested
Prog::MatchKind kind_; // kind of match
Regexp::ParseFlags flags_; // flags for parsing regexp_str_
bool error_; // error during constructor?
Regexp* regexp_; // parsed regexp
int num_captures_; // regexp_->NumCaptures() cached
Prog* prog_; // compiled program
Prog* rprog_; // compiled reverse program
PCRE* re_; // PCRE implementation
RE2* re2_; // RE2 implementation
DISALLOW_EVIL_CONSTRUCTORS(TestInstance);
};
// A group of TestInstances for all possible configurations.
class Tester {
public:
explicit Tester(const StringPiece& regexp);
~Tester();
bool error() { return error_; }
// Runs a single test case: search in text, which is in context,
// using the given anchoring.
bool TestCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor);
// Run TestCase(text, text, anchor) for all anchoring modes.
bool TestInput(const StringPiece& text);
// Run TestCase(text, context, anchor) for all anchoring modes.
bool TestInputInContext(const StringPiece& text, const StringPiece& context);
private:
bool error_;
vector<TestInstance*> v_;
DISALLOW_EVIL_CONSTRUCTORS(Tester);
};
// Run all possible tests using regexp and text.
bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
} // namespace re2
#endif // RE2_TESTING_TESTER_H__

207
re2/re2/testing/unicode_test.py Executable file
View File

@ -0,0 +1,207 @@
#!/usr/bin/python2.4
#
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Unittest for the util/regexp/re2/unicode.py module."""
import os
import StringIO
from google3.pyglib import flags
from google3.testing.pybase import googletest
from google3.util.regexp.re2 import unicode
_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party",
"unicode", "ucd-5.1.0")
class ConvertTest(googletest.TestCase):
"""Test the conversion functions."""
def testUInt(self):
self.assertEquals(0x0000, unicode._UInt("0000"))
self.assertEquals(0x263A, unicode._UInt("263A"))
self.assertEquals(0x10FFFF, unicode._UInt("10FFFF"))
self.assertRaises(unicode.InputError, unicode._UInt, "263")
self.assertRaises(unicode.InputError, unicode._UInt, "263AAAA")
self.assertRaises(unicode.InputError, unicode._UInt, "110000")
def testURange(self):
self.assertEquals([1, 2, 3], unicode._URange("0001..0003"))
self.assertEquals([1], unicode._URange("0001"))
self.assertRaises(unicode.InputError, unicode._URange, "0001..0003..0005")
self.assertRaises(unicode.InputError, unicode._URange, "0003..0001")
self.assertRaises(unicode.InputError, unicode._URange, "0001..0001")
def testUStr(self):
self.assertEquals("0x263A", unicode._UStr(0x263a))
self.assertEquals("0x10FFFF", unicode._UStr(0x10FFFF))
self.assertRaises(unicode.InputError, unicode._UStr, 0x110000)
self.assertRaises(unicode.InputError, unicode._UStr, -1)
_UNICODE_TABLE = """# Commented line, should be ignored.
# The next line is blank and should be ignored.
0041;Capital A;Line 1
0061..007A;Lowercase;Line 2
1F00;<Greek, First>;Ignored
1FFE;<Greek, Last>;Line 3
10FFFF;Runemax;Line 4
0000;Zero;Line 5
"""
_BAD_TABLE1 = """
111111;Not a code point;
"""
_BAD_TABLE2 = """
0000;<Zero, First>;Missing <Zero, Last>
"""
_BAD_TABLE3 = """
0010..0001;Bad range;
"""
class AbortError(Exception):
"""Function should not have been called."""
def Abort():
raise AbortError("Abort")
def StringTable(s, n, f):
unicode.ReadUnicodeTable(StringIO.StringIO(s), n, f)
class ReadUnicodeTableTest(googletest.TestCase):
"""Test the ReadUnicodeTable function."""
def testSimpleTable(self):
ncall = [0] # can't assign to ordinary int in DoLine
def DoLine(codes, fields):
self.assertEquals(3, len(fields))
ncall[0] += 1
self.assertEquals("Line %d" % (ncall[0],), fields[2])
if ncall[0] == 1:
self.assertEquals([0x0041], codes)
self.assertEquals("0041", fields[0])
self.assertEquals("Capital A", fields[1])
elif ncall[0] == 2:
self.assertEquals(range(0x0061, 0x007A + 1), codes)
self.assertEquals("0061..007A", fields[0])
self.assertEquals("Lowercase", fields[1])
elif ncall[0] == 3:
self.assertEquals(range(0x1F00, 0x1FFE + 1), codes)
self.assertEquals("1F00..1FFE", fields[0])
self.assertEquals("Greek", fields[1])
elif ncall[0] == 4:
self.assertEquals([0x10FFFF], codes)
self.assertEquals("10FFFF", fields[0])
self.assertEquals("Runemax", fields[1])
elif ncall[0] == 5:
self.assertEquals([0x0000], codes)
self.assertEquals("0000", fields[0])
self.assertEquals("Zero", fields[1])
StringTable(_UNICODE_TABLE, 3, DoLine)
self.assertEquals(5, ncall[0])
def testErrorTables(self):
self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 4, Abort)
self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 2, Abort)
self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE1, 3, Abort)
self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE2, 3, Abort)
self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE3, 3, Abort)
class ParseContinueTest(googletest.TestCase):
"""Test the ParseContinue function."""
def testParseContinue(self):
self.assertEquals(("Private Use", "First"),
unicode._ParseContinue("<Private Use, First>"))
self.assertEquals(("Private Use", "Last"),
unicode._ParseContinue("<Private Use, Last>"))
self.assertEquals(("<Private Use, Blah>", None),
unicode._ParseContinue("<Private Use, Blah>"))
class CaseGroupsTest(googletest.TestCase):
"""Test the CaseGroups function (and the CaseFoldingReader)."""
def FindGroup(self, c):
if type(c) == str:
c = ord(c)
for g in self.groups:
if c in g:
return g
return None
def testCaseGroups(self):
self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR)
self.assertEquals([ord("A"), ord("a")], self.FindGroup("a"))
self.assertEquals(None, self.FindGroup("0"))
class ScriptsTest(googletest.TestCase):
"""Test the Scripts function (and the ScriptsReader)."""
def FindScript(self, c):
if type(c) == str:
c = ord(c)
for script, codes in self.scripts.items():
for code in codes:
if c == code:
return script
return None
def testScripts(self):
self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR)
self.assertEquals("Latin", self.FindScript("a"))
self.assertEquals("Common", self.FindScript("0"))
self.assertEquals(None, self.FindScript(0xFFFE))
class CategoriesTest(googletest.TestCase):
"""Test the Categories function (and the UnicodeDataReader)."""
def FindCategory(self, c):
if type(c) == str:
c = ord(c)
short = None
for category, codes in self.categories.items():
for code in codes:
if code == c:
# prefer category Nd over N
if len(category) > 1:
return category
if short == None:
short = category
return short
def testCategories(self):
self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR)
self.assertEquals("Ll", self.FindCategory("a"))
self.assertEquals("Nd", self.FindCategory("0"))
self.assertEquals("Lo", self.FindCategory(0xAD00)) # in First, Last range
self.assertEquals(None, self.FindCategory(0xFFFE))
self.assertEquals("Lo", self.FindCategory(0x8B5A))
self.assertEquals("Lo", self.FindCategory(0x6C38))
self.assertEquals("Lo", self.FindCategory(0x92D2))
self.assertTrue(ord("a") in self.categories["L"])
self.assertTrue(ord("0") in self.categories["N"])
self.assertTrue(0x8B5A in self.categories["L"])
self.assertTrue(0x6C38 in self.categories["L"])
self.assertTrue(0x92D2 in self.categories["L"])
def main():
googletest.main()
if __name__ == "__main__":
main()

341
re2/re2/tostring.cc Normal file
View File

@ -0,0 +1,341 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
enum {
PrecAtom,
PrecUnary,
PrecConcat,
PrecAlternate,
PrecEmpty,
PrecParen,
PrecToplevel,
};
// Helper function. See description below.
static void AppendCCRange(string* t, Rune lo, Rune hi);
// Walker to generate string in s_.
// The arg pointers are actually integers giving the
// context precedence.
// The child_args are always NULL.
class ToStringWalker : public Regexp::Walker<int> {
public:
explicit ToStringWalker(string* t) : t_(t) {}
virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args);
virtual int ShortVisit(Regexp* re, int parent_arg) {
return 0;
}
private:
string* t_; // The string the walker appends to.
DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
};
string Regexp::ToString() {
string t;
ToStringWalker w(&t);
w.WalkExponential(this, PrecToplevel, 100000);
if (w.stopped_early())
t += " [truncated]";
return t;
}
#define ToString DontCallToString // Avoid accidental recursion.
// Visits re before children are processed.
// Appends ( if needed and passes new precedence to children.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
int prec = parent_arg;
int nprec = PrecAtom;
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpCharClass:
case kRegexpHaveMatch:
nprec = PrecAtom;
break;
case kRegexpConcat:
case kRegexpLiteralString:
if (prec < PrecConcat)
t_->append("(?:");
nprec = PrecConcat;
break;
case kRegexpAlternate:
if (prec < PrecAlternate)
t_->append("(?:");
nprec = PrecAlternate;
break;
case kRegexpCapture:
t_->append("(");
if (re->name()) {
t_->append("?P<");
t_->append(*re->name());
t_->append(">");
}
nprec = PrecParen;
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
if (prec < PrecUnary)
t_->append("(?:");
// The subprecedence here is PrecAtom instead of PrecUnary
// because PCRE treats two unary ops in a row as a parse error.
nprec = PrecAtom;
break;
}
return nprec;
}
static void AppendLiteral(string *t, Rune r, bool foldcase) {
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
t->append(1, '\\');
t->append(1, r);
} else if (foldcase && 'a' <= r && r <= 'z') {
if ('a' <= r && r <= 'z')
r += 'A' - 'a';
t->append(1, '[');
t->append(1, r);
t->append(1, r + 'a' - 'A');
t->append(1, ']');
} else {
AppendCCRange(t, r, r);
}
}
// Visits re after children are processed.
// For childless regexps, all the work is done here.
// For regexps with children, append any unary suffixes or ).
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args) {
int prec = parent_arg;
switch (re->op()) {
case kRegexpNoMatch:
// There's no simple symbol for "no match", but
// [^0-Runemax] excludes everything.
t_->append("[^\\x00-\\x{10ffff}]");
break;
case kRegexpEmptyMatch:
// Append (?:) to make empty string visible,
// unless this is already being parenthesized.
if (prec < PrecEmpty)
t_->append("(?:)");
break;
case kRegexpLiteral:
AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
break;
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++)
AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
if (prec < PrecConcat)
t_->append(")");
break;
case kRegexpConcat:
if (prec < PrecConcat)
t_->append(")");
break;
case kRegexpAlternate:
// Clumsy but workable: the children all appended |
// at the end of their strings, so just remove the last one.
if ((*t_)[t_->size()-1] == '|')
t_->erase(t_->size()-1);
else
LOG(DFATAL) << "Bad final char: " << t_;
if (prec < PrecAlternate)
t_->append(")");
break;
case kRegexpStar:
t_->append("*");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpPlus:
t_->append("+");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpQuest:
t_->append("?");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpRepeat:
if (re->max() == -1)
t_->append(StringPrintf("{%d,}", re->min()));
else if (re->min() == re->max())
t_->append(StringPrintf("{%d}", re->min()));
else
t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpAnyChar:
t_->append(".");
break;
case kRegexpAnyByte:
t_->append("\\C");
break;
case kRegexpBeginLine:
t_->append("^");
break;
case kRegexpEndLine:
t_->append("$");
break;
case kRegexpBeginText:
t_->append("(?-m:^)");
break;
case kRegexpEndText:
if (re->parse_flags() & Regexp::WasDollar)
t_->append("(?-m:$)");
else
t_->append("\\z");
break;
case kRegexpWordBoundary:
t_->append("\\b");
break;
case kRegexpNoWordBoundary:
t_->append("\\B");
break;
case kRegexpCharClass: {
if (re->cc()->size() == 0) {
t_->append("[^\\x00-\\x{10ffff}]");
break;
}
t_->append("[");
// Heuristic: show class as negated if it contains the
// non-character 0xFFFE.
CharClass* cc = re->cc();
if (cc->Contains(0xFFFE)) {
cc = cc->Negate();
t_->append("^");
}
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
AppendCCRange(t_, i->lo, i->hi);
if (cc != re->cc())
cc->Delete();
t_->append("]");
break;
}
case kRegexpCapture:
t_->append(")");
break;
case kRegexpHaveMatch:
// There's no syntax accepted by the parser to generate
// this node (it is generated by RE2::Set) so make something
// up that is readable but won't compile.
t_->append("(?HaveMatch:%d)", re->match_id());
break;
}
// If the parent is an alternation, append the | for it.
if (prec == PrecAlternate)
t_->append("|");
return 0;
}
// Appends a rune for use in a character class to the string t.
static void AppendCCChar(string* t, Rune r) {
if (0x20 <= r && r <= 0x7E) {
if (strchr("[]^-\\", r))
t->append("\\");
t->append(1, r);
return;
}
switch (r) {
default:
break;
case '\r':
t->append("\\r");
return;
case '\t':
t->append("\\t");
return;
case '\n':
t->append("\\n");
return;
case '\f':
t->append("\\f");
return;
}
if (r < 0x100) {
StringAppendF(t, "\\x%02x", static_cast<int>(r));
return;
}
StringAppendF(t, "\\x{%x}", static_cast<int>(r));
}
static void AppendCCRange(string* t, Rune lo, Rune hi) {
if (lo > hi)
return;
AppendCCChar(t, lo);
if (lo < hi) {
t->append("-");
AppendCCChar(t, hi);
}
}
} // namespace re2

297
re2/re2/unicode.py Executable file
View File

@ -0,0 +1,297 @@
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Parser for Unicode data files (as distributed by unicode.org)."""
import os
import re
import urllib2
# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF
class Error(Exception):
"""Unicode error base class."""
class InputError(Error):
"""Unicode input error class. Raised on invalid input."""
def _UInt(s):
"""Converts string to Unicode code point ('263A' => 0x263a).
Args:
s: string to convert
Returns:
Unicode code point
Raises:
InputError: the string is not a valid Unicode value.
"""
try:
v = int(s, 16)
except ValueError:
v = -1
if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
raise InputError("invalid Unicode value %s" % (s,))
return v
def _URange(s):
"""Converts string to Unicode range.
'0001..0003' => [1, 2, 3].
'0001' => [1].
Args:
s: string to convert
Returns:
Unicode range
Raises:
InputError: the string is not a valid Unicode range.
"""
a = s.split("..")
if len(a) == 1:
return [_UInt(a[0])]
if len(a) == 2:
lo = _UInt(a[0])
hi = _UInt(a[1])
if lo < hi:
return range(lo, hi + 1)
raise InputError("invalid Unicode range %s" % (s,))
def _UStr(v):
"""Converts Unicode code point to hex string.
0x263a => '0x263A'.
Args:
v: code point to convert
Returns:
Unicode string
Raises:
InputError: the argument is not a valid Unicode value.
"""
if v < 0 or v > _RUNE_MAX:
raise InputError("invalid Unicode value %s" % (v,))
return "0x%04X" % (v,)
def _ParseContinue(s):
"""Parses a Unicode continuation field.
These are of the form '<Name, First>' or '<Name, Last>'.
Instead of giving an explicit range in a single table entry,
some Unicode tables use two entries, one for the first
code value in the range and one for the last.
The first entry's description is '<Name, First>' instead of 'Name'
and the second is '<Name, Last>'.
'<Name, First>' => ('Name', 'First')
'<Name, Last>' => ('Name', 'Last')
'Anything else' => ('Anything else', None)
Args:
s: continuation field string
Returns:
pair: name and ('First', 'Last', or None)
"""
match = re.match("<(.*), (First|Last)>", s)
if match is not None:
return match.groups()
return (s, None)
def ReadUnicodeTable(filename, nfields, doline):
"""Generic Unicode table text file reader.
The reader takes care of stripping out comments and also
parsing the two different ways that the Unicode tables specify
code ranges (using the .. notation and splitting the range across
multiple lines).
Each non-comment line in the table is expected to have the given
number of fields. The first field is known to be the Unicode value
and the second field its description.
The reader calls doline(codes, fields) for each entry in the table.
If fn raises an exception, the reader prints that exception,
prefixed with the file name and line number, and continues
processing the file. When done with the file, the reader re-raises
the first exception encountered during the file.
Arguments:
filename: the Unicode data file to read, or a file-like object.
nfields: the number of expected fields per line in that file.
doline: the function to call for each table entry.
Raises:
InputError: nfields is invalid (must be >= 2).
"""
if nfields < 2:
raise InputError("invalid number of fields %d" % (nfields,))
if type(filename) == str:
if filename.startswith("http://"):
fil = urllib2.urlopen(filename)
else:
fil = open(filename, "r")
else:
fil = filename
first = None # first code in multiline range
expect_last = None # tag expected for "Last" line in multiline range
lineno = 0 # current line number
for line in fil:
lineno += 1
try:
# Chop # comments and white space; ignore empty lines.
sharp = line.find("#")
if sharp >= 0:
line = line[:sharp]
line = line.strip()
if not line:
continue
# Split fields on ";", chop more white space.
# Must have the expected number of fields.
fields = [s.strip() for s in line.split(";")]
if len(fields) != nfields:
raise InputError("wrong number of fields %d %d - %s" %
(len(fields), nfields, line))
# The Unicode text files have two different ways
# to list a Unicode range. Either the first field is
# itself a range (0000..FFFF), or the range is split
# across two lines, with the second field noting
# the continuation.
codes = _URange(fields[0])
(name, cont) = _ParseContinue(fields[1])
if expect_last is not None:
# If the last line gave the First code in a range,
# this one had better give the Last one.
if (len(codes) != 1 or codes[0] <= first or
cont != "Last" or name != expect_last):
raise InputError("expected Last line for %s" %
(expect_last,))
codes = range(first, codes[0] + 1)
first = None
expect_last = None
fields[0] = "%04X..%04X" % (codes[0], codes[-1])
fields[1] = name
elif cont == "First":
# Otherwise, if this is the First code in a range,
# remember it and go to the next line.
if len(codes) != 1:
raise InputError("bad First line: range given")
expect_last = name
first = codes[0]
continue
doline(codes, fields)
except Exception, e:
print "%s:%d: %s" % (filename, lineno, e)
raise
if expect_last is not None:
raise InputError("expected Last line for %s; got EOF" %
(expect_last,))
def CaseGroups(unicode_dir=_UNICODE_DIR):
"""Returns list of Unicode code groups equivalent under case folding.
Each group is a sorted list of code points,
and the list of groups is sorted by first code point
in the group.
Args:
unicode_dir: Unicode data directory
Returns:
list of Unicode code groups
"""
# Dict mapping lowercase code point to fold-equivalent group.
togroup = {}
def DoLine(codes, fields):
"""Process single CaseFolding.txt line, updating togroup."""
(_, foldtype, lower, _) = fields
if foldtype not in ("C", "S"):
return
lower = _UInt(lower)
togroup.setdefault(lower, [lower]).extend(codes)
ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)
groups = togroup.values()
for g in groups:
g.sort()
groups.sort()
return togroup, groups
def Scripts(unicode_dir=_UNICODE_DIR):
"""Returns dict mapping script names to code lists.
Args:
unicode_dir: Unicode data directory
Returns:
dict mapping script names to code lists
"""
scripts = {}
def DoLine(codes, fields):
"""Process single Scripts.txt line, updating scripts."""
(_, name) = fields
scripts.setdefault(name, []).extend(codes)
ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
return scripts
def Categories(unicode_dir=_UNICODE_DIR):
"""Returns dict mapping category names to code lists.
Args:
unicode_dir: Unicode data directory
Returns:
dict mapping category names to code lists
"""
categories = {}
def DoLine(codes, fields):
"""Process single UnicodeData.txt line, updating categories."""
category = fields[2]
categories.setdefault(category, []).extend(codes)
# Add codes from Lu into L, etc.
if len(category) > 1:
short = category[0]
categories.setdefault(short, []).extend(codes)
ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
return categories

469
re2/re2/unicode_casefold.cc Normal file
View File

@ -0,0 +1,469 @@
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
// 1029 groups, 2079 pairs, 282 ranges
CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
{ 107, 107, 8383 },
{ 108, 114, -32 },
{ 115, 115, 268 },
{ 116, 122, -32 },
{ 181, 181, 743 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 223, 223, 7615 },
{ 224, 228, -32 },
{ 229, 229, 8262 },
{ 230, 246, -32 },
{ 248, 254, -32 },
{ 255, 255, 121 },
{ 256, 303, EvenOdd },
{ 306, 311, EvenOdd },
{ 313, 328, OddEven },
{ 330, 375, EvenOdd },
{ 376, 376, -121 },
{ 377, 382, OddEven },
{ 383, 383, -300 },
{ 384, 384, 195 },
{ 385, 385, 210 },
{ 386, 389, EvenOdd },
{ 390, 390, 206 },
{ 391, 392, OddEven },
{ 393, 394, 205 },
{ 395, 396, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 402, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 405, 405, 97 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 409, EvenOdd },
{ 410, 410, 163 },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 414, 414, 130 },
{ 415, 415, 214 },
{ 416, 421, EvenOdd },
{ 422, 422, 218 },
{ 423, 424, OddEven },
{ 425, 425, 218 },
{ 428, 429, EvenOdd },
{ 430, 430, 218 },
{ 431, 432, OddEven },
{ 433, 434, 217 },
{ 435, 438, OddEven },
{ 439, 439, 219 },
{ 440, 441, EvenOdd },
{ 444, 445, EvenOdd },
{ 447, 447, 56 },
{ 452, 452, EvenOdd },
{ 453, 453, OddEven },
{ 454, 454, -2 },
{ 455, 455, OddEven },
{ 456, 456, EvenOdd },
{ 457, 457, -2 },
{ 458, 458, EvenOdd },
{ 459, 459, OddEven },
{ 460, 460, -2 },
{ 461, 476, OddEven },
{ 477, 477, -79 },
{ 478, 495, EvenOdd },
{ 497, 497, OddEven },
{ 498, 498, EvenOdd },
{ 499, 499, -2 },
{ 500, 501, EvenOdd },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 543, EvenOdd },
{ 544, 544, -130 },
{ 546, 563, EvenOdd },
{ 570, 570, 10795 },
{ 571, 572, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 575, 576, 10815 },
{ 577, 578, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 591, EvenOdd },
{ 592, 592, 10783 },
{ 593, 593, 10780 },
{ 594, 594, 10782 },
{ 595, 595, -210 },
{ 596, 596, -206 },
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 608, 608, -205 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 619, 619, 10743 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
{ 629, 629, -214 },
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 643, 643, -218 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
{ 949, 949, 64 },
{ 950, 951, -32 },
{ 952, 952, 25 },
{ 953, 953, 7173 },
{ 954, 954, 54 },
{ 955, 955, -32 },
{ 956, 956, -775 },
{ 957, 959, -32 },
{ 960, 960, 22 },
{ 961, 961, 48 },
{ 962, 962, EvenOdd },
{ 963, 965, -32 },
{ 966, 966, 15 },
{ 967, 968, -32 },
{ 969, 969, 7517 },
{ 970, 971, -32 },
{ 972, 972, -64 },
{ 973, 974, -63 },
{ 975, 975, 8 },
{ 976, 976, -62 },
{ 977, 977, 35 },
{ 981, 981, -47 },
{ 982, 982, -54 },
{ 983, 983, -8 },
{ 984, 1007, EvenOdd },
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1019, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1319, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7680, 7776, EvenOdd },
{ 7777, 7777, 58 },
{ 7778, 7829, EvenOdd },
{ 7835, 7835, -59 },
{ 7838, 7838, -7615 },
{ 7840, 7935, EvenOdd },
{ 7936, 7943, 8 },
{ 7944, 7951, -8 },
{ 7952, 7957, 8 },
{ 7960, 7965, -8 },
{ 7968, 7975, 8 },
{ 7976, 7983, -8 },
{ 7984, 7991, 8 },
{ 7992, 7999, -8 },
{ 8000, 8005, 8 },
{ 8008, 8013, -8 },
{ 8017, 8017, 8 },
{ 8019, 8019, 8 },
{ 8021, 8021, 8 },
{ 8023, 8023, 8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8032, 8039, 8 },
{ 8040, 8047, -8 },
{ 8048, 8049, 74 },
{ 8050, 8053, 86 },
{ 8054, 8055, 100 },
{ 8056, 8057, 128 },
{ 8058, 8059, 112 },
{ 8060, 8061, 126 },
{ 8064, 8071, 8 },
{ 8072, 8079, -8 },
{ 8080, 8087, 8 },
{ 8088, 8095, -8 },
{ 8096, 8103, 8 },
{ 8104, 8111, -8 },
{ 8112, 8113, 8 },
{ 8115, 8115, 9 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7289 },
{ 8131, 8131, 9 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8179, 8179, 9 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7549 },
{ 8490, 8490, -8415 },
{ 8491, 8491, -8294 },
{ 8498, 8498, 28 },
{ 8526, 8526, -28 },
{ 8544, 8559, 16 },
{ 8560, 8575, -16 },
{ 8579, 8580, OddEven },
{ 9398, 9423, 26 },
{ 9424, 9449, -26 },
{ 11264, 11310, 48 },
{ 11312, 11358, -48 },
{ 11360, 11361, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11365, 11365, -10795 },
{ 11366, 11366, -10792 },
{ 11367, 11372, OddEven },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11379, EvenOdd },
{ 11381, 11382, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11491, EvenOdd },
{ 11499, 11502, OddEven },
{ 11520, 11557, -7264 },
{ 42560, 42605, EvenOdd },
{ 42624, 42647, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
{ 42877, 42877, -35332 },
{ 42878, 42887, EvenOdd },
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42897, EvenOdd },
{ 42912, 42921, EvenOdd },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
};
int num_unicode_casefold = 282;
// 1029 groups, 1050 pairs, 163 ranges
CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 256, 302, EvenOddSkip },
{ 306, 310, EvenOddSkip },
{ 313, 327, OddEvenSkip },
{ 330, 374, EvenOddSkip },
{ 376, 376, -121 },
{ 377, 381, OddEvenSkip },
{ 383, 383, -268 },
{ 385, 385, 210 },
{ 386, 388, EvenOddSkip },
{ 390, 390, 206 },
{ 391, 391, OddEven },
{ 393, 394, 205 },
{ 395, 395, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 401, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 408, EvenOdd },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 415, 415, 214 },
{ 416, 420, EvenOddSkip },
{ 422, 422, 218 },
{ 423, 423, OddEven },
{ 425, 425, 218 },
{ 428, 428, EvenOdd },
{ 430, 430, 218 },
{ 431, 431, OddEven },
{ 433, 434, 217 },
{ 435, 437, OddEvenSkip },
{ 439, 439, 219 },
{ 440, 440, EvenOdd },
{ 444, 444, EvenOdd },
{ 452, 452, 2 },
{ 453, 453, OddEven },
{ 455, 455, 2 },
{ 456, 456, EvenOdd },
{ 458, 458, 2 },
{ 459, 475, OddEvenSkip },
{ 478, 494, EvenOddSkip },
{ 497, 497, 2 },
{ 498, 500, EvenOddSkip },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 542, EvenOddSkip },
{ 544, 544, -130 },
{ 546, 562, EvenOddSkip },
{ 570, 570, 10795 },
{ 571, 571, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 577, 577, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 590, EvenOddSkip },
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 939, 32 },
{ 962, 962, EvenOdd },
{ 975, 975, 8 },
{ 976, 976, -30 },
{ 977, 977, -25 },
{ 981, 981, -15 },
{ 982, 982, -22 },
{ 984, 1006, EvenOddSkip },
{ 1008, 1008, -54 },
{ 1009, 1009, -48 },
{ 1012, 1012, -60 },
{ 1013, 1013, -64 },
{ 1015, 1015, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1018, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1120, 1152, EvenOddSkip },
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1318, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
{ 7840, 7934, EvenOddSkip },
{ 7944, 7951, -8 },
{ 7960, 7965, -8 },
{ 7976, 7983, -8 },
{ 7992, 7999, -8 },
{ 8008, 8013, -8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8040, 8047, -8 },
{ 8072, 8079, -8 },
{ 8088, 8095, -8 },
{ 8104, 8111, -8 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7517 },
{ 8490, 8490, -8383 },
{ 8491, 8491, -8262 },
{ 8498, 8498, 28 },
{ 8544, 8559, 16 },
{ 8579, 8579, OddEven },
{ 9398, 9423, 26 },
{ 11264, 11310, 48 },
{ 11360, 11360, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11367, 11371, OddEvenSkip },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11378, EvenOdd },
{ 11381, 11381, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11490, EvenOddSkip },
{ 11499, 11501, OddEvenSkip },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42646, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
{ 42877, 42877, -35332 },
{ 42878, 42886, EvenOddSkip },
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42896, EvenOdd },
{ 42912, 42920, EvenOddSkip },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
};
int num_unicode_tolower = 163;
} // namespace re2

View File

@ -0,0 +1,75 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
// to the next largest Unicode point with equivalent folding. The largest
// point wraps back to the first. For example, the tables map:
//
// 'A' -> 'a'
// 'a' -> 'A'
//
// 'K' -> 'k'
// 'k' -> '' (Kelvin symbol)
// '' -> 'K'
//
// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
// and deltas, so that the table entries for 'A' through 'Z' can be represented
// as a single entry { 'A', 'Z', +32 }.
//
// In addition to blocks that map to each other (A-Z mapping to a-z)
// there are blocks of pairs that individually map to each other
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
// For those, the special delta value EvenOdd marks even/odd pairs
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
//
// In this form, the table has 274 entries, about 3kB. If we were to split
// the table into one for 16-bit codes and an overflow table for larger ones,
// we could get it down to about 1.5kB, but that's not worth the complexity.
//
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
#ifndef RE2_UNICODE_CASEFOLD_H__
#define RE2_UNICODE_CASEFOLD_H__
#include "util/util.h"
namespace re2 {
enum {
EvenOdd = 1,
OddEven = -1,
EvenOddSkip = 1<<30,
OddEvenSkip,
};
struct CaseFold {
uint32 lo;
uint32 hi;
int32 delta;
};
extern CaseFold unicode_casefold[];
extern int num_unicode_casefold;
extern CaseFold unicode_tolower[];
extern int num_unicode_tolower;
// Returns the CaseFold* in the tables that contains rune.
// If rune is not in the tables, returns the first CaseFold* after rune.
// If rune is larger than any value in the tables, returns NULL.
extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
// Returns the result of applying the fold f to the rune r.
extern Rune ApplyFold(CaseFold *f, Rune r);
} // namespace re2
#endif // RE2_UNICODE_CASEFOLD_H__

4851
re2/re2/unicode_groups.cc Normal file

File diff suppressed because it is too large Load Diff

64
re2/re2/unicode_groups.h Normal file
View File

@ -0,0 +1,64 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
// and ranges of 32-bit codes. It would be simpler
// to use only 32-bit ranges, but these tables are large
// enough to warrant extra care.
//
// Using just 32-bit ranges gives 27 kB of data.
// Adding 16-bit ranges gives 18 kB of data.
// Adding an extra table of 16-bit singletons would reduce
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
#ifndef RE2_UNICODE_GROUPS_H__
#define RE2_UNICODE_GROUPS_H__
#include "util/util.h"
namespace re2 {
struct URange16
{
uint16 lo;
uint16 hi;
};
struct URange32
{
uint32 lo;
uint32 hi;
};
struct UGroup
{
const char *name;
int sign; // +1 for [abc], -1 for [^abc]
URange16 *r16;
int nr16;
URange32 *r32;
int nr32;
};
// Named by property or script name (e.g., "Nd", "N", "Han").
// Negated groups are not included.
extern UGroup unicode_groups[];
extern int num_unicode_groups;
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
// Negated groups are included.
extern UGroup posix_groups[];
extern int num_posix_groups;
// Named by Perl name (e.g., "\\d", "\\D").
// Negated groups are included.
extern UGroup perl_groups[];
extern int num_perl_groups;
} // namespace re2
#endif // RE2_UNICODE_GROUPS_H__

346
re2/re2/variadic_function.h Normal file
View File

@ -0,0 +1,346 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_VARIADIC_FUNCTION_H_
#define RE2_VARIADIC_FUNCTION_H_
namespace re2 {
template <typename Result, typename Param0, typename Param1, typename Arg,
Result (*Func)(Param0, Param1, const Arg* const [], int count)>
class VariadicFunction2 {
public:
VariadicFunction2() {}
Result operator()(Param0 p0, Param1 p1) const {
return Func(p0, p1, 0, 0);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
const Arg* const args[] = { &a0 };
return Func(p0, p1, args, 1);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
const Arg* const args[] = { &a0, &a1 };
return Func(p0, p1, args, 2);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2) const {
const Arg* const args[] = { &a0, &a1, &a2 };
return Func(p0, p1, args, 3);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3 };
return Func(p0, p1, args, 4);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
return Func(p0, p1, args, 5);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
return Func(p0, p1, args, 6);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
return Func(p0, p1, args, 7);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
return Func(p0, p1, args, 8);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
return Func(p0, p1, args, 9);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9 };
return Func(p0, p1, args, 10);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10 };
return Func(p0, p1, args, 11);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11 };
return Func(p0, p1, args, 12);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12 };
return Func(p0, p1, args, 13);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13 };
return Func(p0, p1, args, 14);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14 };
return Func(p0, p1, args, 15);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15 };
return Func(p0, p1, args, 16);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
return Func(p0, p1, args, 17);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
return Func(p0, p1, args, 18);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
return Func(p0, p1, args, 19);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
return Func(p0, p1, args, 20);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
&a20 };
return Func(p0, p1, args, 21);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21 };
return Func(p0, p1, args, 22);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22 };
return Func(p0, p1, args, 23);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23 };
return Func(p0, p1, args, 24);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24 };
return Func(p0, p1, args, 25);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25 };
return Func(p0, p1, args, 26);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26 };
return Func(p0, p1, args, 27);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27 };
return Func(p0, p1, args, 28);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
return Func(p0, p1, args, 29);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
return Func(p0, p1, args, 30);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
return Func(p0, p1, args, 31);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30, const Arg& a31) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
return Func(p0, p1, args, 32);
}
};
} // namespace re2
#endif // RE2_VARIADIC_FUNCTION_H_

244
re2/re2/walker-inl.h Normal file
View File

@ -0,0 +1,244 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
// and after visiting the subexpressions.
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
#ifndef RE2_WALKER_INL_H__
#define RE2_WALKER_INL_H__
#include "re2/regexp.h"
namespace re2 {
template<typename T> struct WalkState;
template<typename T> class Regexp::Walker {
public:
Walker();
virtual ~Walker();
// Virtual method called before visiting re's children.
// PreVisit passes ownership of its return value to its caller.
// The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
// and passed to the child PreVisits and PostVisits as parent_arg.
// At the top-most Regexp, parent_arg is arg passed to walk.
// If PreVisit sets *stop to true, the walk does not recurse
// into the children. Instead it behaves as though the return
// value from PreVisit is the return value from PostVisit.
// The default PreVisit returns parent_arg.
virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
// Virtual method called after visiting re's children.
// The pre_arg is the T that PreVisit returned.
// The child_args is a vector of the T that the child PostVisits returned.
// PostVisit takes ownership of pre_arg.
// PostVisit takes ownership of the Ts
// in *child_args, but not the vector itself.
// PostVisit passes ownership of its return value
// to its caller.
// The default PostVisit simply returns pre_arg.
virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
T* child_args, int nchild_args);
// Virtual method called to copy a T,
// when Walk notices that more than one child is the same re.
virtual T Copy(T arg);
// Virtual method called to do a "quick visit" of the re,
// but not its children. Only called once the visit budget
// has been used up and we're trying to abort the walk
// as quickly as possible. Should return a value that
// makes sense for the parent PostVisits still to be run.
// This function is (hopefully) only called by
// WalkExponential, but must be implemented by all clients,
// just in case.
virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
// Walks over a regular expression.
// Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
// Returns the T returned by PostVisit on re.
T Walk(Regexp* re, T top_arg);
// Like Walk, but doesn't use Copy. This can lead to
// exponential runtimes on cross-linked Regexps like the
// ones generated by Simplify. To help limit this,
// at most max_visits nodes will be visited and then
// the walk will be cut off early.
// If the walk *is* cut off early, ShortVisit(re)
// will be called on regexps that cannot be fully
// visited rather than calling PreVisit/PostVisit.
T WalkExponential(Regexp* re, T top_arg, int max_visits);
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
void Reset();
// Returns whether walk was cut off.
bool stopped_early() { return stopped_early_; }
private:
// Walk state for the entire traversal.
stack<WalkState<T> >* stack_;
bool stopped_early_;
int max_visits_;
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
T parent_arg,
bool* stop) {
return parent_arg;
}
template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
T parent_arg,
T pre_arg,
T* child_args,
int nchild_args) {
return pre_arg;
}
template<typename T> T Regexp::Walker<T>::Copy(T arg) {
return arg;
}
// State about a single level in the traversal.
template<typename T> struct WalkState {
WalkState<T>(Regexp* re, T parent)
: re(re),
n(-1),
parent_arg(parent),
child_args(NULL) { }
Regexp* re; // The regexp
int n; // The index of the next child to process; -1 means need to PreVisit
T parent_arg; // Accumulated arguments.
T pre_arg;
T child_arg; // One-element buffer for child_args.
T* child_args;
};
template<typename T> Regexp::Walker<T>::Walker() {
stack_ = new stack<WalkState<T> >;
stopped_early_ = false;
}
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
delete stack_;
}
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
template<typename T> void Regexp::Walker<T>::Reset() {
if (stack_ && stack_->size() > 0) {
LOG(DFATAL) << "Stack not empty.";
while (stack_->size() > 0) {
delete stack_->top().child_args;
stack_->pop();
}
}
}
template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
bool use_copy) {
Reset();
if (re == NULL) {
LOG(DFATAL) << "Walk NULL";
return top_arg;
}
stack_->push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
T t;
s = &stack_->top();
Regexp* re = s->re;
switch (s->n) {
case -1: {
if (--max_visits_ < 0) {
stopped_early_ = true;
t = ShortVisit(re, s->parent_arg);
break;
}
bool stop = false;
s->pre_arg = PreVisit(re, s->parent_arg, &stop);
if (stop) {
t = s->pre_arg;
break;
}
s->n = 0;
s->child_args = NULL;
if (re->nsub_ == 1)
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
// Fall through.
}
default: {
if (re->nsub_ > 0) {
Regexp** sub = re->sub();
if (s->n < re->nsub_) {
if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
}
t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
if (re->nsub_ > 1)
delete[] s->child_args;
break;
}
}
// We've finished stack_->top().
// Update next guy down.
stack_->pop();
if (stack_->size() == 0)
return t;
s = &stack_->top();
if (s->child_args != NULL)
s->child_args[s->n] = t;
else
s->child_arg = t;
s->n++;
}
}
template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
// Without the exponential walking behavior,
// this budget should be more than enough for any
// regexp, and yet not enough to get us in trouble
// as far as CPU time.
max_visits_ = 1000000;
return WalkInternal(re, top_arg, true);
}
template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
int max_visits) {
max_visits_ = max_visits;
return WalkInternal(re, top_arg, false);
}
} // namespace re2
#endif // RE2_WALKER_INL_H__

21
re2/runtests Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
success=true
for i
do
printf "%-40s" $i
if sh -c "$i >$i.log 2>&1" 2>/dev/null
then
echo PASS
else
echo FAIL';' output in $i.log
success=false
fi
done
if $success; then
echo 'ALL TESTS PASSED.'
exit 0
fi
echo 'TESTS FAILED.'
exit 1

13
re2/testinstall.cc Normal file
View File

@ -0,0 +1,13 @@
#include <re2/re2.h>
#include <stdio.h>
using namespace re2;
int main(void) {
if(RE2::FullMatch("axbyc", "a.*b.*c")) {
printf("PASS\n");
return 0;
}
printf("FAIL\n");
return 2;
}

168
re2/util/arena.cc Normal file
View File

@ -0,0 +1,168 @@
// Copyright 2000 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
namespace re2 {
// ----------------------------------------------------------------------
// UnsafeArena::UnsafeArena()
// UnsafeArena::~UnsafeArena()
// Destroying the arena automatically calls Reset()
// ----------------------------------------------------------------------
UnsafeArena::UnsafeArena(const size_t block_size)
: block_size_(block_size),
freestart_(NULL), // set for real in Reset()
last_alloc_(NULL),
remaining_(0),
blocks_alloced_(1),
overflow_blocks_(NULL) {
assert(block_size > kDefaultAlignment);
first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_));
first_blocks_[0].size = block_size_;
Reset();
}
UnsafeArena::~UnsafeArena() {
FreeBlocks();
assert(overflow_blocks_ == NULL); // FreeBlocks() should do that
// The first X blocks stay allocated always by default. Delete them now.
for (int i = 0; i < blocks_alloced_; i++)
free(first_blocks_[i].mem);
}
// ----------------------------------------------------------------------
// UnsafeArena::Reset()
// Clears all the memory an arena is using.
// ----------------------------------------------------------------------
void UnsafeArena::Reset() {
FreeBlocks();
freestart_ = first_blocks_[0].mem;
remaining_ = first_blocks_[0].size;
last_alloc_ = NULL;
// We do not know for sure whether or not the first block is aligned,
// so we fix that right now.
const int overage = reinterpret_cast<uintptr_t>(freestart_) &
(kDefaultAlignment-1);
if (overage > 0) {
const int waste = kDefaultAlignment - overage;
freestart_ += waste;
remaining_ -= waste;
}
freestart_when_empty_ = freestart_;
assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1)));
}
// -------------------------------------------------------------
// UnsafeArena::AllocNewBlock()
// Adds and returns an AllocatedBlock.
// The returned AllocatedBlock* is valid until the next call
// to AllocNewBlock or Reset. (i.e. anything that might
// affect overflow_blocks_).
// -------------------------------------------------------------
UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
AllocatedBlock *block;
// Find the next block.
if ( blocks_alloced_ < arraysize(first_blocks_) ) {
// Use one of the pre-allocated blocks
block = &first_blocks_[blocks_alloced_++];
} else { // oops, out of space, move to the vector
if (overflow_blocks_ == NULL) overflow_blocks_ = new vector<AllocatedBlock>;
// Adds another block to the vector.
overflow_blocks_->resize(overflow_blocks_->size()+1);
// block points to the last block of the vector.
block = &overflow_blocks_->back();
}
block->mem = reinterpret_cast<char*>(malloc(block_size));
block->size = block_size;
return block;
}
// ----------------------------------------------------------------------
// UnsafeArena::GetMemoryFallback()
// We take memory out of our pool, aligned on the byte boundary
// requested. If we don't have space in our current pool, we
// allocate a new block (wasting the remaining space in the
// current block) and give you that. If your memory needs are
// too big for a single block, we make a special your-memory-only
// allocation -- this is equivalent to not using the arena at all.
// ----------------------------------------------------------------------
void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
if (size == 0)
return NULL; // stl/stl_alloc.h says this is okay
assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2
// If the object is more than a quarter of the block size, allocate
// it separately to avoid wasting too much space in leftover bytes
if (block_size_ == 0 || size > block_size_/4) {
// then it gets its own block in the arena
assert(align <= kDefaultAlignment); // because that's what new gives us
// This block stays separate from the rest of the world; in particular
// we don't update last_alloc_ so you can't reclaim space on this block.
return AllocNewBlock(size)->mem;
}
const int overage =
(reinterpret_cast<uintptr_t>(freestart_) & (align-1));
if (overage) {
const int waste = align - overage;
freestart_ += waste;
if (waste < remaining_) {
remaining_ -= waste;
} else {
remaining_ = 0;
}
}
if (size > remaining_) {
AllocatedBlock *block = AllocNewBlock(block_size_);
freestart_ = block->mem;
remaining_ = block->size;
}
remaining_ -= size;
last_alloc_ = freestart_;
freestart_ += size;
assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
return reinterpret_cast<void*>(last_alloc_);
}
// ----------------------------------------------------------------------
// UnsafeArena::FreeBlocks()
// Unlike GetMemory(), which does actual work, ReturnMemory() is a
// no-op: we don't "free" memory until Reset() is called. We do
// update some stats, though. Note we do no checking that the
// pointer you pass in was actually allocated by us, or that it
// was allocated for the size you say, so be careful here!
// FreeBlocks() does the work for Reset(), actually freeing all
// memory allocated in one fell swoop.
// ----------------------------------------------------------------------
void UnsafeArena::FreeBlocks() {
for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced
free(first_blocks_[i].mem);
first_blocks_[i].mem = NULL;
first_blocks_[i].size = 0;
}
blocks_alloced_ = 1;
if (overflow_blocks_ != NULL) {
vector<AllocatedBlock>::iterator it;
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
free(it->mem);
}
delete overflow_blocks_; // These should be used very rarely
overflow_blocks_ = NULL;
}
}
} // namespace re2

103
re2/util/arena.h Normal file
View File

@ -0,0 +1,103 @@
// Copyright 2000 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Sometimes it is necessary to allocate a large number of small
// objects. Doing this the usual way (malloc, new) is slow,
// especially for multithreaded programs. An UnsafeArena provides a
// mark/release method of memory management: it asks for a large chunk
// from the operating system and doles it out bit by bit as required.
// Then you free all the memory at once by calling UnsafeArena::Reset().
// The "Unsafe" refers to the fact that UnsafeArena is not safe to
// call from multiple threads.
//
// The global operator new that can be used as follows:
//
// #include "lib/arena-inl.h"
//
// UnsafeArena arena(1000);
// Foo* foo = new (AllocateInArena, &arena) Foo;
//
#ifndef RE2_UTIL_ARENA_H_
#define RE2_UTIL_ARENA_H_
namespace re2 {
// This class is thread-compatible.
class UnsafeArena {
public:
UnsafeArena(const size_t block_size);
virtual ~UnsafeArena();
void Reset();
// This should be the worst-case alignment for any type. This is
// good for IA-32, SPARC version 7 (the last one I know), and
// supposedly Alpha. i386 would be more time-efficient with a
// default alignment of 8, but ::operator new() uses alignment of 4,
// and an assertion will fail below after the call to MakeNewBlock()
// if you try to use a larger alignment.
#ifdef __i386__
static const int kDefaultAlignment = 4;
#else
static const int kDefaultAlignment = 8;
#endif
private:
void* GetMemoryFallback(const size_t size, const int align);
public:
void* GetMemory(const size_t size, const int align) {
if ( size > 0 && size < remaining_ && align == 1 ) { // common case
last_alloc_ = freestart_;
freestart_ += size;
remaining_ -= size;
return reinterpret_cast<void*>(last_alloc_);
}
return GetMemoryFallback(size, align);
}
private:
struct AllocatedBlock {
char *mem;
size_t size;
};
// The returned AllocatedBlock* is valid until the next call to AllocNewBlock
// or Reset (i.e. anything that might affect overflow_blocks_).
AllocatedBlock *AllocNewBlock(const size_t block_size);
const AllocatedBlock *IndexToBlock(int index) const;
const size_t block_size_;
char* freestart_; // beginning of the free space in most recent block
char* freestart_when_empty_; // beginning of the free space when we're empty
char* last_alloc_; // used to make sure ReturnBytes() is safe
size_t remaining_;
// STL vector isn't as efficient as it could be, so we use an array at first
int blocks_alloced_; // how many of the first_blocks_ have been alloced
AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
vector<AllocatedBlock>* overflow_blocks_;
void FreeBlocks(); // Frees all except first block
DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
};
// Operators for allocation on the arena
// Syntax: new (AllocateInArena, arena) MyClass;
// STL containers, etc.
enum AllocateInArenaType { AllocateInArena };
} // namespace re2
inline void* operator new(size_t size,
re2::AllocateInArenaType /* unused */,
re2::UnsafeArena *arena) {
return reinterpret_cast<char*>(arena->GetMemory(size, 1));
}
#endif // RE2_UTIL_ARENA_H_

79
re2/util/atomicops.h Normal file
View File

@ -0,0 +1,79 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_ATOMICOPS_H__
#define RE2_UTIL_ATOMICOPS_H__
#if defined(__i386__)
static inline void WriteMemoryBarrier() {
int x;
__asm__ __volatile__("xchgl (%0),%0" // The lock prefix is implicit for xchg.
:: "r" (&x));
}
#elif defined(__x86_64__)
// 64-bit implementations of memory barrier can be simpler, because
// "sfence" is guaranteed to exist.
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("sfence" : : : "memory");
}
#elif defined(__ppc__)
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("eieio" : : : "memory");
}
#elif defined(__alpha__)
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("wmb" : : : "memory");
}
#else
#include "util/mutex.h"
static inline void WriteMemoryBarrier() {
// Slight overkill, but good enough:
// any mutex implementation must have
// a read barrier after the lock operation and
// a write barrier before the unlock operation.
//
// It may be worthwhile to write architecture-specific
// barriers for the common platforms, as above, but
// this is a correct fallback.
re2::Mutex mu;
re2::MutexLock l(&mu);
}
/*
#error Need WriteMemoryBarrier for architecture.
// Windows
inline void WriteMemoryBarrier() {
LONG x;
::InterlockedExchange(&x, 0);
}
*/
#endif
// Alpha has very weak memory ordering. If relying on WriteBarriers, must one
// use read barriers for the readers too.
#if defined(__alpha__)
static inline void MaybeReadMemoryBarrier() {
__asm__ __volatile__("mb" : : : "memory");
}
#else
static inline void MaybeReadMemoryBarrier() {}
#endif // __alpha__
#endif // RE2_UTIL_ATOMICOPS_H__

153
re2/util/benchmark.cc Normal file
View File

@ -0,0 +1,153 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "util/benchmark.h"
#include "re2/re2.h"
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
using testing::Benchmark;
using namespace re2;
static Benchmark* benchmarks[10000];
static int nbenchmarks;
void Benchmark::Register() {
benchmarks[nbenchmarks] = this;
if(lo < 1)
lo = 1;
if(hi < lo)
hi = lo;
nbenchmarks++;
}
static int64 nsec() {
struct timeval tv;
if(gettimeofday(&tv, 0) < 0)
return -1;
return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000;
}
static int64 bytes;
static int64 ns;
static int64 t0;
static int64 items;
void SetBenchmarkBytesProcessed(long long x) {
bytes = x;
}
void StopBenchmarkTiming() {
if(t0 != 0)
ns += nsec() - t0;
t0 = 0;
}
void StartBenchmarkTiming() {
if(t0 == 0)
t0 = nsec();
}
void SetBenchmarkItemsProcessed(int n) {
items = n;
}
void BenchmarkMemoryUsage() {
// TODO(rsc): Implement.
}
int NumCPUs() {
return 1;
}
static void runN(Benchmark *b, int n, int siz) {
bytes = 0;
items = 0;
ns = 0;
t0 = nsec();
if(b->fn)
b->fn(n);
else if(b->fnr)
b->fnr(n, siz);
else {
fprintf(stderr, "%s: missing function\n", b->name);
exit(2);
}
if(t0 != 0)
ns += nsec() - t0;
}
static int round(int n) {
int base = 1;
while(base*10 < n)
base *= 10;
if(n < 2*base)
return 2*base;
if(n < 5*base)
return 5*base;
return 10*base;
}
void RunBench(Benchmark* b, int nthread, int siz) {
int n, last;
// TODO(rsc): Threaded benchmarks.
if(nthread != 1)
return;
// run once in case it's expensive
n = 1;
runN(b, n, siz);
while(ns < (int)1e9 && n < (int)1e9) {
last = n;
if(ns/n == 0)
n = 1e9;
else
n = 1e9 / (ns/n);
n = max(last+1, min(n+n/2, 100*last));
n = round(n);
runN(b, n, siz);
}
char mb[100];
char suf[100];
mb[0] = '\0';
suf[0] = '\0';
if(ns > 0 && bytes > 0)
snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
if(b->fnr || b->lo != b->hi) {
if(siz >= (1<<20))
snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
else if(siz >= (1<<10))
snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
else
snprintf(suf, sizeof suf, "/%d", siz);
}
printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
fflush(stdout);
}
static int match(const char* name, int argc, const char** argv) {
if(argc == 1)
return 1;
for(int i = 1; i < argc; i++)
if(RE2::PartialMatch(name, argv[i]))
return 1;
return 0;
}
int main(int argc, const char** argv) {
for(int i = 0; i < nbenchmarks; i++) {
Benchmark* b = benchmarks[i];
if(match(b->name, argc, argv))
for(int j = b->threadlo; j <= b->threadhi; j++)
for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1)
RunBench(b, j, k);
}
}

41
re2/util/benchmark.h Normal file
View File

@ -0,0 +1,41 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_BENCHMARK_H__
#define RE2_UTIL_BENCHMARK_H__
namespace testing {
struct Benchmark {
const char* name;
void (*fn)(int);
void (*fnr)(int, int);
int lo;
int hi;
int threadlo;
int threadhi;
void Register();
Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }
Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }
void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; }
Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; }
};
} // namespace testing
void SetBenchmarkBytesProcessed(long long);
void StopBenchmarkTiming();
void StartBenchmarkTiming();
void BenchmarkMemoryUsage();
void SetBenchmarkItemsProcessed(int);
int NumCPUs();
#define BENCHMARK(f) \
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
#define BENCHMARK_RANGE(f, lo, hi) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#endif // RE2_UTIL_BENCHMARK_H__

27
re2/util/flags.h Normal file
View File

@ -0,0 +1,27 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Simplified version of Google's command line flags.
// Does not support parsing the command line.
// If you want to do that, see
// http://code.google.com/p/google-gflags
#ifndef RE2_UTIL_FLAGS_H__
#define RE2_UTIL_FLAGS_H__
#define DEFINE_flag(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
#define DECLARE_flag(type, name) \
namespace re2 { extern type FLAGS_##name; }
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
#define DECLARE_bool(name) DECLARE_flag(bool, name)
#define DECLARE_int32(name) DECLARE_flag(int32, name)
#define DECLARE_string(name) DECLARE_flag(string, name)
#endif // RE2_UTIL_FLAGS_H__

231
re2/util/hash.cc Normal file
View File

@ -0,0 +1,231 @@
// Modified by Russ Cox to add "namespace re2".
// Also threw away all but hashword and hashword2.
// http://burtleburtle.net/bob/c/lookup3.c
/*
-------------------------------------------------------------------------------
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
These are functions for producing 32-bit hashes for hash table lookup.
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
are externally useful functions. Routines to test the hash are included
if SELF_TEST is defined. You can use this free for any purpose. It's in
the public domain. It has no warranty.
You probably want to use hashlittle(). hashlittle() and hashbig()
hash byte arrays. hashlittle() is is faster than hashbig() on
little-endian machines. Intel and AMD are little-endian machines.
On second thought, you probably want hashlittle2(), which is identical to
hashlittle() except it returns two 32-bit hashes for the price of one.
You could implement hashbig2() if you wanted but I haven't bothered here.
If you want to find a hash of, say, exactly 7 integers, do
a = i1; b = i2; c = i3;
mix(a,b,c);
a += i4; b += i5; c += i6;
mix(a,b,c);
a += i7;
final(a,b,c);
then use c as the hash value. If you have a variable length array of
4-byte integers to hash, use hashword(). If you have a byte array (like
a character string), use hashlittle(). If you have several byte arrays, or
a mix of things, see the comments above hashlittle().
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
then mix those integers. This is fast (you can do a lot more thorough
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
-------------------------------------------------------------------------------
*/
#include "util/util.h"
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
/*
-------------------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.
This is reversible, so any information in (a,b,c) before mix() is
still in (a,b,c) after mix().
If four pairs of (a,b,c) inputs are run through mix(), or through
mix() in reverse, there are at least 32 bits of the output that
are sometimes the same for one pair and different for another pair.
This was tested for:
* pairs that differed by one bit, by two bits, in any combination
of top bits of (a,b,c), or in any combination of bottom bits of
(a,b,c).
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
is commonly produced by subtraction) look like a single 1-bit
difference.
* the base values were pseudorandom, all zero but one bit set, or
all zero plus a counter that starts at zero.
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
satisfy this are
4 6 8 16 19 4
9 15 3 18 27 15
14 9 3 7 17 3
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
for "differ" defined as + with a one-bit base and a two-bit delta. I
used http://burtleburtle.net/bob/hash/avalanche.html to choose
the operations, constants, and arrangements of the variables.
This does not achieve avalanche. There are input bits of (a,b,c)
that fail to affect some output bits of (a,b,c), especially of a. The
most thoroughly mixed value is c, but it doesn't really even achieve
avalanche in c.
This allows some parallelism. Read-after-writes are good at doubling
the number of bits affected, so the goal of mixing pulls in the opposite
direction as the goal of parallelism. I did what I could. Rotates
seem to cost as much as shifts on every machine I could lay my hands
on, and rotates are much kinder to the top and bottom bits, so I used
rotates.
-------------------------------------------------------------------------------
*/
#define mix(a,b,c) \
{ \
a -= c; a ^= rot(c, 4); c += b; \
b -= a; b ^= rot(a, 6); a += c; \
c -= b; c ^= rot(b, 8); b += a; \
a -= c; a ^= rot(c,16); c += b; \
b -= a; b ^= rot(a,19); a += c; \
c -= b; c ^= rot(b, 4); b += a; \
}
/*
-------------------------------------------------------------------------------
final -- final mixing of 3 32-bit values (a,b,c) into c
Pairs of (a,b,c) values differing in only a few bits will usually
produce values of c that look totally different. This was tested for
* pairs that differed by one bit, by two bits, in any combination
of top bits of (a,b,c), or in any combination of bottom bits of
(a,b,c).
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
is commonly produced by subtraction) look like a single 1-bit
difference.
* the base values were pseudorandom, all zero but one bit set, or
all zero plus a counter that starts at zero.
These constants passed:
14 11 25 16 4 14 24
12 14 25 16 4 14 24
and these came close:
4 8 15 26 3 22 24
10 8 15 26 3 22 24
11 8 15 26 3 22 24
-------------------------------------------------------------------------------
*/
#define final(a,b,c) \
{ \
c ^= b; c -= rot(b,14); \
a ^= c; a -= rot(c,11); \
b ^= a; b -= rot(a,25); \
c ^= b; c -= rot(b,16); \
a ^= c; a -= rot(c,4); \
b ^= a; b -= rot(a,14); \
c ^= b; c -= rot(b,24); \
}
namespace re2 {
/*
--------------------------------------------------------------------
This works on all machines. To be useful, it requires
-- that the key be an array of uint32_t's, and
-- that the length be the number of uint32_t's in the key
The function hashword() is identical to hashlittle() on little-endian
machines, and identical to hashbig() on big-endian machines,
except that the length has to be measured in uint32_ts rather than in
bytes. hashlittle() is more complicated than hashword() only because
hashlittle() has to dance around fitting the key bytes into registers.
--------------------------------------------------------------------
*/
uint32 hashword(
const uint32 *k, /* the key, an array of uint32_t values */
size_t length, /* the length of the key, in uint32_ts */
uint32 initval) /* the previous hash, or an arbitrary value */
{
uint32_t a,b,c;
/* Set up the internal state */
a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
/*------------------------------------------------- handle most of the key */
while (length > 3)
{
a += k[0];
b += k[1];
c += k[2];
mix(a,b,c);
length -= 3;
k += 3;
}
/*------------------------------------------- handle the last 3 uint32_t's */
switch(length) /* all the case statements fall through */
{
case 3 : c+=k[2];
case 2 : b+=k[1];
case 1 : a+=k[0];
final(a,b,c);
case 0: /* case 0: nothing left to add */
break;
}
/*------------------------------------------------------ report the result */
return c;
}
/*
--------------------------------------------------------------------
hashword2() -- same as hashword(), but take two seeds and return two
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
both be initialized with seeds. If you pass in (*pb)==0, the output
(*pc) will be the same as the return value from hashword().
--------------------------------------------------------------------
*/
void hashword2 (
const uint32 *k, /* the key, an array of uint32_t values */
size_t length, /* the length of the key, in uint32_ts */
uint32 *pc, /* IN: seed OUT: primary hash value */
uint32 *pb) /* IN: more seed OUT: secondary hash value */
{
uint32_t a,b,c;
/* Set up the internal state */
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
c += *pb;
/*------------------------------------------------- handle most of the key */
while (length > 3)
{
a += k[0];
b += k[1];
c += k[2];
mix(a,b,c);
length -= 3;
k += 3;
}
/*------------------------------------------- handle the last 3 uint32_t's */
switch(length) /* all the case statements fall through */
{
case 3 : c+=k[2];
case 2 : b+=k[1];
case 1 : a+=k[0];
final(a,b,c);
case 0: /* case 0: nothing left to add */
break;
}
/*------------------------------------------------------ report the result */
*pc=c; *pb=b;
}
} // namespace re2

78
re2/util/logging.h Normal file
View File

@ -0,0 +1,78 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Simplified version of Google's logging.
#ifndef RE2_UTIL_LOGGING_H__
#define RE2_UTIL_LOGGING_H__
#include <unistd.h> /* for write */
#include <sstream>
// Debug-only checking.
#define DCHECK(condition) assert(condition)
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
#define DCHECK_NE(val1, val2) assert((val1) != (val2))
#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
#define DCHECK_LT(val1, val2) assert((val1) < (val2))
#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
#define DCHECK_GT(val1, val2) assert((val1) > (val2))
// Always-on checking
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
#define CHECK_LT(x, y) CHECK((x) < (y))
#define CHECK_GT(x, y) CHECK((x) > (y))
#define CHECK_LE(x, y) CHECK((x) <= (y))
#define CHECK_GE(x, y) CHECK((x) >= (y))
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))
#define LOG_INFO LogMessage(__FILE__, __LINE__)
#define LOG_ERROR LOG_INFO
#define LOG_WARNING LOG_INFO
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
#define LOG_QFATAL LOG_FATAL
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
#ifdef NDEBUG
#define DEBUG_MODE 0
#define LOG_DFATAL LOG_ERROR
#else
#define DEBUG_MODE 1
#define LOG_DFATAL LOG_FATAL
#endif
#define LOG(severity) LOG_ ## severity.stream()
class LogMessage {
public:
LogMessage(const char* file, int line) {
stream() << file << ":" << line << ": ";
}
~LogMessage() {
stream() << "\n";
string s = str_.str();
if(write(2, s.data(), s.size()) < 0) {} // shut up gcc
}
ostream& stream() { return str_; }
private:
std::ostringstream str_;
DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
};
class LogMessageFatal : public LogMessage {
public:
LogMessageFatal(const char* file, int line)
: LogMessage(file, line) { }
~LogMessageFatal() {
std::cerr << "\n";
abort();
}
private:
DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
};
#endif // RE2_UTIL_LOGGING_H__

Some files were not shown because too many files have changed in this diff Show More